In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Introduction**

Natural Language Processing (NLP) is a subfield of computer science, and more specifically of artificial intelligence, that focuses on understanding written and spoken text. This project covers several tasks on a dataset of disaster tweets. I applied a data mining workflow consisting of Data Understanding, Data Pre-processing, Data Warehousing, Data Modeling and Data Evaluation.



**Describe the Data** The train and test data are structured, labelled data imported from CSV files into pandas DataFrames using the `pd.read_csv` function (from the pandas Python package).

As seen in the figure below, the DataFrame is made up of the following 5 columns:
* id: a unique identifier of every tweet
* keyword: a particular keyword from the tweet (this can be blank)
* location: the location the tweet was sent from (this can be blank)
* text: the text of the tweet
* target: present only in the train data, and denotes if the tweet is about a real disaster (1) or not (0)

In [None]:
# For Preprocesssing Text Data
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the train test split
from sklearn.model_selection import train_test_split

# Check Performance
from sklearn.metrics import classification_report

In [None]:
# Read the training tweets and show the number of rows loaded.
disaster_Train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
disaster_Train.shape[0]

In [None]:
# Read the test tweets and show the number of rows loaded.
disaster_Test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
disaster_Test.shape[0]

In [None]:
# Summary statistics of the training frame (pandas `describe`).
disaster_Train.describe()

In [None]:
# Show five randomly sampled training rows.
disaster_Train.sample(5)

**Data Text Preprocessing** **Exploratory Data Analysis (EDA), Visualize and Clean the Dataset**

Data preprocessing involves cleaning the dataset and preparing the text data before encoding it as numeric vectors.

* Identify the missing value in the Dataset
* Entities, URL Links and Punctuation Removal
* Spelling Correction
* Filling Missing Data by Keyword Extraction and Entity Recognition
* Lemmatization
* Stop Words Removal

In [None]:
# Print the `info()` overview of the training frame (used below to spot missing values).
disaster_Train.info()

**Visualize**

In [None]:
# Class balance of the training labels (0 = not a real disaster, 1 = real disaster).
# Title and axis labels are set so the figure can be read on its own; the
# trailing `;` hides the Axes repr in the notebook output.
disaster_Train['target'].value_counts().plot.bar(
    title='Training tweets per target class',
    xlabel='target',
    ylabel='number of tweets',
    rot=0,
);

In [None]:
# Count the missing (null) values in each column of the training frame.
print(disaster_Train.isnull().sum())

In [None]:
# Replace missing keyword/location values with the placeholder string 'empty'.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `frame[col]` is a chained in-place update on an intermediate Series, which
# warns in pandas 2.x and does not modify the frame under Copy-on-Write.
for column in ('keyword', 'location'):
    disaster_Train[column] = disaster_Train[column].fillna('empty')

In [None]:
# Count nulls per column again (the previous cell filled keyword/location).
print(disaster_Train.isnull().sum())

**Lowercasing**

Converting the text to lowercase is an essential step in any NLP project. If the text is not converted to lowercase, variants of the same word (e.g. "Fire", "FIRE" and "fire") will be represented as three different words in the vector space model.

In [None]:
# Lowercase the three text columns of the training frame.
for text_column in ("keyword", "location", "text"):
    disaster_Train[text_column] = disaster_Train[text_column].apply(str.lower)

It is important to note that the 'keyword' and 'location' columns contain missing values (i.e. values that are NA, such as numpy.nan or None). To avoid unexpected errors in the string operations, these missing entries are first replaced with the placeholder string 'empty' using `fillna`.

In [None]:
# Replace missing keyword/location values with the placeholder string 'empty'.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `frame[col]` is a chained in-place update on an intermediate Series, which
# warns in pandas 2.x and does not modify the frame under Copy-on-Write.
for column in ('keyword', 'location'):
    disaster_Test[column] = disaster_Test[column].fillna('empty')

In [None]:
# Lowercase the three text columns of the test frame.
for text_column in ("keyword", "location", "text"):
    disaster_Test[text_column] = disaster_Test[text_column].apply(str.lower)

**Entities, URL Links and Punctuation Removal**

Since the dataset is based on tweets, it may include many mentions (e.g. @someone02) and hashtags (e.g. #fire). A function named remove_entities is defined to remove them.

In [None]:
import re, string
def remove_entities(text):
    """Strip punctuation and drop @mention / #hashtag words from ``text``.

    Every punctuation character except ``@`` and ``#`` is replaced by a space,
    the text is split on whitespace, and any word that starts with ``@`` or
    ``#`` is discarded.

    Args:
        text: The input string (e.g. a tweet).

    Returns:
        The remaining words joined by single spaces; ``""`` when nothing is left.
    """
    entity_prefixes = ('@', '#')
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    # `str.split()` without arguments never yields empty strings, so `word[0]`
    # is always safe here.
    words = [word for word in text.split() if word[0] not in entity_prefixes]
    return ' '.join(words)

# Lowercase the text columns of both frames.
# NOTE(review): this repeats the lowercasing already done in the cells above;
# `remove_entities`, defined in this cell, is not called here.
for frame in (disaster_Train, disaster_Test):
    for text_column in ("keyword", "location", "text"):
        frame[text_column] = frame[text_column].apply(str.lower)

Replace URL links with Blank

In [None]:
import re

# Replace every @mention and http/https/www link with a single space in the
# text columns of both frames.
url_or_mention_pattern = r"(?:\@|http?\://|https?\://|www)\S+"
for frame in (disaster_Train, disaster_Test):
    for text_column in ("keyword", "location", "text"):
        frame[text_column] = frame[text_column].apply(
            lambda value: re.sub(url_or_mention_pattern, ' ', value)
        )

**Remove punctuation**

In [None]:
import re

# Replace every character that is neither a word character nor whitespace
# with a space, in the text columns of both frames.
non_word_pattern = r'[^\w\s]'
for frame in (disaster_Train, disaster_Test):
    for text_column in ("keyword", "location", "text"):
        frame[text_column] = frame[text_column].apply(
            lambda value: re.sub(non_word_pattern, ' ', value)
        )

**Spelling Correction.**

It is expected that tweets will include several spelling mistakes, therefore spelling correction is applied to improve performance

In [None]:
pip install -U symspellpy

In [None]:
import pkg_resources

In [None]:
import pkg_resources

# Resolve the paths of the unigram and bigram frequency dictionaries that are
# looked up inside the installed symspellpy package.
dictionary_path, bigram_path = (
    pkg_resources.resource_filename("symspellpy", resource_name)
    for resource_name in (
        "frequency_dictionary_en_82_765.txt",
        "frequency_bigramdictionary_en_243_342.txt",
    )
)

In [None]:
from symspellpy import SymSpell, Verbosity
# Build the spell checker and load the unigram frequency dictionary
# (column 0 = term, column 1 = count).
sym_spell = SymSpell()
# `load_dictionary` reports failure through its return value instead of
# raising, so check it: with no dictionary loaded, the spelling-correction
# step would silently have nothing to correct against.
if not sym_spell.load_dictionary(dictionary_path, 0, 1):
    raise FileNotFoundError(
        f"SymSpell frequency dictionary could not be loaded: {dictionary_path}"
    )

def spelling_correction(sent):
    """Return ``sent`` with each whitespace-separated token spell-corrected.

    Each token is looked up with the module-level ``sym_spell`` using
    ``Verbosity.CLOSEST`` and a maximum edit distance of 2, and the first
    suggestion's term is kept. ``include_unknown=True`` is passed so that the
    lookup still returns an item for tokens it has no suggestion for.

    Args:
        sent: The sentence to correct.

    Returns:
        The corrected tokens joined by single spaces.
    """
    corrected_tokens = []
    # `split()` (rather than `split(" ")`) skips the empty tokens produced by
    # consecutive spaces, so they are never sent to the spell checker.
    for tok in sent.split():
        best_match = sym_spell.lookup(
            tok, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True
        )[0]
        # Read the term directly instead of parsing the item's string form.
        corrected_tokens.append(best_match.term)
    return " ".join(corrected_tokens)

# Spell-correct the text columns of both frames.
for frame in (disaster_Train, disaster_Test):
    for text_column in ("keyword", "location", "text"):
        frame[text_column] = frame[text_column].apply(spelling_correction)

**Filling Missing Data**

Calling `info()` on the train and test DataFrames reveals that the train data has 61 empty entries in the keyword column and 2533 empty entries in the location column, while the test data has 26 empty entries in the keyword column and 1105 empty entries in the location column.

**Keyword Extraction** The extract_keywords function, shown below, was inspired by the following articles Keyword Extraction with BERT and Build A Keyword Extraction API with Spacy, Flask, and FuzzyWuzzy

In [None]:
pip install -U sentence-transformers

In [None]:
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# download the language model
import os
# Load a trained English pipeline. `spacy.blank("en")` only provides a
# tokenizer, so `token.pos_` would stay empty and the part-of-speech filter in
# `extract_keywords` could never select a candidate word.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # The model package is not installed yet: download it with spaCy's own
    # helper (the old "en" shortcut is not a valid package name in spaCy v3).
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

In [None]:
import tensorflow_hub as hub
# `extract_keywords` calls `model.encode([...])`, which is the
# SentenceTransformer API; previously `model` was just the TF-Hub URL string,
# so every call failed.
model = SentenceTransformer("all-MiniLM-L6-v2")

# TF-Hub embedding, kept so the name `embed` stays defined.
# NOTE(review): `embed` is not referenced anywhere else in the visible
# notebook -- consider dropping it to avoid the extra download.
embed = hub.load("https://tfhub.dev/google/nnlm-en-dim50/2")

In [None]:
# extract keywords

def extract_keywords(nlp=nlp, doc="", no_of_keywords=5, model=model):
    """Pick the words of ``doc`` whose embeddings are closest to the whole text.

    The text is lowercased, @mentions/URLs and punctuation are blanked out,
    space-prefixed digit runs are removed, and the words tagged VERB, NOUN,
    ADJ or PROPN by ``nlp`` become candidates. Candidates are ranked by cosine
    similarity between their embedding and the embedding of the cleaned text.

    Args:
        nlp: Callable pipeline returning tokens with ``.pos_`` and ``.text``.
        doc: The text to extract keywords from.
        no_of_keywords: Maximum number of keywords to return.
        model: Encoder exposing ``encode(list_of_str)``.

    Returns:
        Up to ``no_of_keywords`` candidate words, ordered from least to most
        similar (the best match is last). Empty list when there are no
        candidates or ``no_of_keywords`` is not positive.
    """
    if no_of_keywords <= 0:
        # `[-0:]` would otherwise return every candidate.
        return []

    doc = doc.lower()
    doc = re.sub(r'(?:\@|http?\://|https?\://|www)\S+', ' ', doc)
    doc = re.sub(r'[^\w\s]', ' ', doc)
    doc = re.sub(r' \d+', ' ', doc)

    pos_tag = {'VERB', 'NOUN', 'ADJ', 'PROPN'}
    result = [token.text for token in nlp(doc) if token.pos_ in pos_tag]
    if not result:
        # Nothing to rank; avoid encoding / comparing an empty candidate list.
        return []

    doc_embedding = model.encode([doc])
    results_embeddings = model.encode(result)

    distances = cosine_similarity(doc_embedding, results_embeddings)
    keywords = [result[index] for index in distances.argsort()[0][-no_of_keywords:]]
    return keywords
# Placeholder written into the keyword column by the earlier `fillna('empty')`.
MISSING_KEYWORD = "empty"


def _fill_missing_keywords(frame):
    """Fill missing ``keyword`` values of ``frame`` from its ``text`` column.

    A keyword counts as missing when it is null or equal to the ``'empty'``
    placeholder (the earlier cleaning cells replace nulls with that string, so
    a null-only check would never match). For each such row one keyword is
    extracted from the tweet text; when extraction fails or yields nothing,
    the placeholder is kept.

    Args:
        frame: DataFrame with ``keyword`` and ``text`` columns; modified in place.

    Returns:
        The number of rows for which no keyword could be extracted.
    """
    keyword = frame["keyword"]
    missing = keyword.isna() | keyword.eq(MISSING_KEYWORD)
    failed = 0
    for row_label in frame.index[missing]:
        try:
            extracted = extract_keywords(
                nlp=nlp, doc=frame.at[row_label, "text"], no_of_keywords=1, model=model
            )[0]
        except Exception:
            # Best effort: keep the placeholder instead of aborting the run
            # (this also covers an empty extraction result -> IndexError).
            failed += 1
            extracted = MISSING_KEYWORD
        # Label-based assignment on the frame itself; chained
        # `frame['keyword'].iloc[i] = ...` may write to a temporary copy.
        frame.at[row_label, "keyword"] = extracted
    return failed


for frame_name, frame in (("train", disaster_Train), ("test", disaster_Test)):
    unresolved = _fill_missing_keywords(frame)
    print(f"{frame_name}: {unresolved} missing keyword(s) could not be extracted")
            

**Stop Words Removal**

Stop words provide low level information to the text and are often found in abundance; therefore, they are removed to give more focus to other significant information

In [None]:
# Download the NLTK stop-word corpus and keep the English stop words.
nltk.download('stopwords')
# Stored as a set: the cells below test `word not in stop` for every word of
# every tweet, and set membership avoids a linear scan of the list each time.
stop = set(stopwords.words('english'))

In [None]:
# Preview, on the first training rows only, the tokens that survive stop-word
# removal in each text column. Nothing is stored here; the columns themselves
# are rewritten in a later cell. (Previously the first two results were
# computed and discarded, and the last was displayed as a whole Series.)
disaster_Train[['text', 'keyword', 'location']].head().apply(
    lambda column: column.apply(
        lambda x: [word for word in x.split() if word not in stop]
    )
)

In [None]:
# Preview, on the first test rows only, the tokens that survive stop-word
# removal in each text column. Nothing is stored here; the columns themselves
# are rewritten in a later cell. (Previously the first two results were
# computed and discarded, and the last was displayed as a whole Series.)
disaster_Test[['text', 'keyword', 'location']].head().apply(
    lambda column: column.apply(
        lambda x: [word for word in x.split() if word not in stop]
    )
)

In [None]:
# Drop stop words from each text column of the training frame and re-join the
# remaining words with single spaces.
for text_column in ('text', 'keyword', 'location'):
    disaster_Train[text_column] = disaster_Train[text_column].apply(
        lambda sentence: ' '.join(
            token for token in sentence.split() if token not in stop
        )
    )

In [None]:
# Drop stop words from each text column of the test frame and re-join the
# remaining words with single spaces.
for text_column in ('text', 'keyword', 'location'):
    disaster_Test[text_column] = disaster_Test[text_column].apply(
        lambda sentence: ' '.join(
            token for token in sentence.split() if token not in stop
        )
    )

In [None]:
# Show the first rows of the training frame after preprocessing.
disaster_Train.head()

In [None]:
# Show the first rows of the test frame after preprocessing.
disaster_Test.head()

In [None]:
# Separate the label from the features ('id' is dropped as well).
y = disaster_Train['target']
X = disaster_Train.drop(columns=['target', 'id'])

# Hold out 20% of the labelled data for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the shape of each part to verify the split.
for shape_label, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(shape_label, part.shape)

In [None]:
# Show the first five rows of the training features.
X_train.head(5)

**Transform to CountVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features limited to 3000 terms: the vocabulary is learned from
# the training text only, then applied to the validation text.
cv = CountVectorizer(max_features=3000)
X_train_cv = cv.fit_transform(X_train['text']).toarray()
X_test_cv = cv.transform(X_test['text']).toarray()

# Show the container types before and after vectorisation.
for container in (X_train, X_train_cv):
    print(type(container))

# Count matrix as a DataFrame with one column per vocabulary term.
train_with_cv = pd.DataFrame(X_train_cv, columns=cv.get_feature_names_out())
train_with_cv.head()

**Model Architecture**

Develop the model with sklearn naive bayes package

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the training counts.
model_disaster = MultinomialNB()
model_disaster.fit(X_train_cv, y_train)

# Predict on both the training and the held-out validation features.
y_train_pred, y_test_pred = (
    model_disaster.predict(features) for features in (X_train_cv, X_test_cv)
)

# Print one classification report per split.
for heading, truth, predicted in (
    ('Train Report ---', y_train, y_train_pred),
    ('Validation Report ---', y_test, y_test_pred),
):
    print(heading)
    print(classification_report(truth, predicted))

In [None]:
# Predict on the Kaggle test tweets with the fitted vectoriser and model.
# Dedicated names are used so the validation split's `X_test_cv` and
# `y_test_pred` are not overwritten with data from a different dataset.
X_submission_cv = cv.transform(disaster_Test['text']).toarray()
submission_pred = model_disaster.predict(X_submission_cv)

# Convert predictions to a DataFrame (single column named 0).
model_disaster1 = pd.DataFrame(submission_pred)
model_disaster1.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Same vectorisation as the earlier CountVectorizer cell: re-fit on the
# training text and re-assign X_train_cv / X_test_cv from the train/validation split.
cv = CountVectorizer(max_features=3000).fit(X_train['text'])
X_train_cv, X_test_cv = (
    cv.transform(split['text']).toarray() for split in (X_train, X_test)
)

print(type(X_train))
print(type(X_train_cv))

# Count matrix as a DataFrame with one column per vocabulary term.
feature_names = cv.get_feature_names_out()
train_with_cv = pd.DataFrame(X_train_cv, columns=feature_names)
train_with_cv.head()

In [None]:
# Predict on the training and validation features.
predictions_train = model_disaster.predict(X_train_cv)
predictions_test = model_disaster.predict(X_test_cv)

# One single-column ('target') frame per prediction array.
disaster1, disaster2 = (
    pd.DataFrame({'target': predicted})
    for predicted in (predictions_train, predictions_test)
)

# Stack the two frames vertically with a fresh 0..n-1 index.
# NOTE(review): this holds train + validation predictions, not test-set
# predictions; `submission` is reassigned in the next cell.
submission = pd.DataFrame(pd.concat([disaster1, disaster2], ignore_index=True))
submission.head()

In [None]:
# Build the submission frame from the test ids and the test-set predictions.
# `disaster_Test` still carries its 'id' column, so the CSV is not re-read
# (re-reading would replace the preprocessed frame with raw text).
# Plain arrays are passed so the columns are paired by position, and a length
# mismatch raises instead of being silently index-aligned.
submission = pd.DataFrame({
    'id': disaster_Test['id'].to_numpy(),
    'target': model_disaster1[0].to_numpy(),
})
submission.head()

In [None]:
# Write the submission frame to a CSV file in the Kaggle working directory,
# without the index column (adjust the filename as needed).
submission.to_csv('/kaggle/working/submission.csv', index=False)

**Conclusion**
This notebook improves on an article that introduces natural language processing (NLP) through a sentiment analysis project. It focuses on text preprocessing, data cleaning and model creation. The main difficulty with this data is cleaning it and handling the missing values before building the model.

The text preprocessing is made up of six main steps which are: Lowercasing, Entities, URL Links and Punctuation Removal, Spelling Correction, Filling Missing Data, Lemmatization and Stop Words Removal. It is important to keep in mind that one can choose to include more steps to the text preprocessing (e.g. converting abbreviations to their original forms) or exclude some steps she/he thinks are unnecessary.

In the model section, the model reported an accuracy of 86%. Logistic regression and decision tree models could also be tried to improve the accuracy further.