In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
from textblob import TextBlob

from symspellpy import SymSpell, Verbosity

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import (
    RandomForestClassifier, 
    AdaBoostClassifier, 
    GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score

In [2]:
df_tweet = pd.read_csv('/Users/gautambr/Documents/Kaggle Competition/NLP With Disaster Tweets/DataSet/NLP Disaster Tweets/train.csv')
df_tweet_test = pd.read_csv('/Users/gautambr/Documents/Kaggle Competition/NLP With Disaster Tweets/DataSet/NLP Disaster Tweets/test.csv')

In [3]:
print("Train Shape ",df_tweet.shape)
df_tweet.head()

Train Shape  (7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
print("Test Shape ",df_tweet_test.shape)
df_tweet_test.head()

Test Shape  (3263, 4)


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
df_tweet.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
df_tweet_test.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

**Insight :** From the above data we can see that the keyword and location column has more data missing as we are not considering these columns we can neglect them.

### Preprocessing

#### Clean Text

In [7]:
# Define preprocessing function
def clean_text(text):
    text = re.sub('[^A-Za-z0-9.]+', ' ', text)  # Keep words & numbers
    text = text.lower().strip()  # Convert to lowercase & strip spaces
    return text

In [8]:
df_tweet['text'] = df_tweet['text'].apply(clean_text)
df_tweet_test['text'] = df_tweet_test['text'].apply(clean_text)

#### Handling Misspellings

In [9]:
# Initialize SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)

2025-02-23 13:35:19,062: E symspellpy.symspellpy] Dictionary file not found at frequency_dictionary_en_82_765.txt.


False

In [10]:
def correct_text(text):
    corrected = sym_spell.lookup_compound(text, max_edit_distance=2)
    return corrected[0].term if corrected else text

In [11]:
df_tweet['text'] = df_tweet['text'].apply(correct_text)
df_tweet_test['text'] = df_tweet_test['text'].apply(correct_text)

#### Lemmatization

In [12]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gautambr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
df_tweet['text'] = df_tweet['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
df_tweet_test['text'] = df_tweet_test['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

#### Removing Stopwords

In [14]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gautambr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
df_tweet['text'] = df_tweet['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df_tweet_test['text'] = df_tweet_test['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

### Train test split

### Select only required columns from your dataframe

In [16]:
df_tweet = df_tweet[['text','target']]
df_tweet.head()

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident asked shelter place notified officer ...,1
3,13 000 people receive wildfire evacuation orde...,1
4,got sent photo ruby alaska smoke wildfire pour...,1


In [17]:
df_tweet_test = df_tweet_test[['text']]
df_tweet_test.head()

Unnamed: 0,text
0,happened terrible car crash
1,heard earthquake different city stay safe ever...
2,forest fire spot pond goose fleeing across str...
3,apocalypse lighting spokane wildfire
4,typhoon soudelor kill 28 china taiwan


In [20]:
X_train, X_test, y_train, y_test = train_test_split(df_tweet.text.values, df_tweet.target.values, test_size=0.20, random_state=3)

In [21]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((6090,), (6090,), (1523,), (1523,))

### Vectorize the words

In [24]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [27]:
vectorizer.get_feature_names_out()

array(['00', '000', '0000', ..., 'zzpojgngaj', 'zzsee5hipm', 'zzzz'],
      dtype=object)

In [28]:
X_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Modeling

In [36]:
models = {
                "Logistic Regression": LogisticRegression(),
                "Random Forest": RandomForestClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(),
                "XGBoost": XGBClassifier(),
                "AdaBoost": AdaBoostClassifier()
                }
model_list = []
f1_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_f1 = f1_score(y_train, y_train_pred, average='micro')

    model_test_f1 = f1_score(y_test, y_test_pred, average='micro')

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- F1 Score: {:.4f}".format(model_train_f1))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- F1 Score: {:.4f}".format(model_test_f1))

    f1_list.append(model_test_f1)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- F1 Score: 0.8905
----------------------------------
Model performance for Test set
- F1 Score: 0.7951


Random Forest
Model performance for Training set
- F1 Score: 0.9975
----------------------------------
Model performance for Test set
- F1 Score: 0.7879


Decision Tree
Model performance for Training set
- F1 Score: 0.9975
----------------------------------
Model performance for Test set
- F1 Score: 0.7026


Gradient Boosting
Model performance for Training set
- F1 Score: 0.7908
----------------------------------
Model performance for Test set
- F1 Score: 0.7492


XGBoost
Model performance for Training set
- F1 Score: 0.8718
----------------------------------
Model performance for Test set
- F1 Score: 0.7636


AdaBoost
Model performance for Training set
- F1 Score: 0.6361
----------------------------------
Model performance for Test set
- F1 Score: 0.6284




In [39]:
pd.DataFrame(list(zip(model_list, f1_list)), columns=['Model Name', 'F1_Score']).sort_values(by=["F1_Score"],ascending=False)

Unnamed: 0,Model Name,F1_Score
0,Logistic Regression,0.795141
1,Random Forest,0.787919
4,XGBoost,0.763624
3,Gradient Boosting,0.749179
2,Decision Tree,0.702561
5,AdaBoost,0.628365


**Insight :** we can see from the dataframe that Logistic Regression model performs well.