# Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

# Load the Dataset

In [3]:
# Location of the pre-cleaned disaster-tweet CSV.
# NOTE(review): this is an absolute path on one specific machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
DATA_PATH = r'C:\Users\zubai\Desktop\Data_Science_Jupyter\project_7(NLP Project for Disaster Tweet Classification)\data\cleaned_twitter_disaster.csv'
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first rows to sanity-check the loaded columns
df.head()

Unnamed: 0,id,keyword,location,text,target,char_length,word_length,cleaned_text,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,13,our deeds are the reason of this earthquake ma...,"['our', 'deeds', 'are', 'the', 'reason', 'of',..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,7,forest fire near la ronge sask canada,"['forest', 'fire', 'near', 'la', 'ronge', 'sas..."
2,5,,,All residents asked to 'shelter in place' are ...,1,133,22,all residents asked to shelter in place are be...,"['all', 'residents', 'asked', 'to', 'shelter',..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8,13000 people receive wildfires evacuation orde...,"['13000', 'people', 'receive', 'wildfires', 'e..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,16,just got sent this photo from ruby alaska as s...,"['just', 'got', 'sent', 'this', 'photo', 'from..."


In [7]:
# (rows, columns) of the loaded frame
df.shape

(7613, 9)

In [9]:
# Column labels available for feature engineering
df.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'char_length',
       'word_length', 'cleaned_text', 'tokens'],
      dtype='object')

In [11]:
# Per-column dtype and non-null count (shows which columns contain missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7613 non-null   int64 
 1   keyword       7552 non-null   object
 2   location      5080 non-null   object
 3   text          7613 non-null   object
 4   target        7613 non-null   int64 
 5   char_length   7613 non-null   int64 
 6   word_length   7613 non-null   int64 
 7   cleaned_text  7613 non-null   object
 8   tokens        7613 non-null   object
dtypes: int64(4), object(5)
memory usage: 535.4+ KB


In [13]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,id,target,char_length,word_length
count,7613.0,7613.0,7613.0,7613.0
mean,5441.934848,0.42966,101.037436,14.903586
std,3137.11609,0.49506,33.781325,5.732604
min,1.0,0.0,7.0,1.0
25%,2734.0,0.0,78.0,11.0
50%,5408.0,0.0,107.0,15.0
75%,8146.0,1.0,133.0,19.0
max,10873.0,1.0,157.0,31.0


## Split the dataset

In [15]:
from sklearn.model_selection import train_test_split

# Model input is the pre-cleaned tweet text; the label is the `target` column
X, y = df['cleaned_text'], df['target']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Confirm the split sizes: features and labels must have matching lengths per split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6090,), (1523,), (6090,), (1523,))

In [19]:
# Renumber the text splits 0..n-1 (they stay Series; nothing is converted to a DataFrame).
# The feature frames built later are created without an explicit index, so this keeps
# every piece aligned row-for-row when they are concatenated column-wise.
# y_train / y_test are not reset here and keep their original index labels.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

## 2.1- Word Frequencies and TF-IDF Features

In [23]:
# Initialize vectorizers with their default settings:
# one for raw term counts, one for TF-IDF weights
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

In [25]:
# Learn each vocabulary from the training text only ...
X_train_count, X_train_tfidf = (
    count_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.fit_transform(X_train),
)

# ... then encode the test text with those already-fitted vocabularies
X_test_count, X_test_tfidf = (
    count_vectorizer.transform(X_test),
    tfidf_vectorizer.transform(X_test),
)

### Convert to DataFrames

In [29]:
# Label each count column with the vocabulary term it represents.
# Note: .toarray() materializes the full matrix densely before it is wrapped in a frame.
count_feature_names = count_vectorizer.get_feature_names_out()
X_train_count_df = pd.DataFrame(data=X_train_count.toarray(), columns=count_feature_names)
X_train_count_df.head(n=3)

Unnamed: 0,0011,005225,010156,010217,0104,010401,0106,012032,012624,015025,...,zombie,zombiefunrun2014,zombies,zone,zonesthank,zonewolf123,zouma,zss,zumiez,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Same layout as the training count frame: one column per vocabulary term
count_feature_names = count_vectorizer.get_feature_names_out()
X_test_count_df = pd.DataFrame(data=X_test_count.toarray(), columns=count_feature_names)
X_test_count_df.head(n=3)

Unnamed: 0,0011,005225,010156,010217,0104,010401,0106,012032,012624,015025,...,zombie,zombiefunrun2014,zombies,zone,zonesthank,zonewolf123,zouma,zss,zumiez,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Label each TF-IDF column with the vocabulary term it represents.
# Note: .toarray() materializes the full matrix densely before it is wrapped in a frame.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
X_train_tfidf_df = pd.DataFrame(data=X_train_tfidf.toarray(), columns=tfidf_feature_names)
X_train_tfidf_df.head(n=3)

Unnamed: 0,0011,005225,010156,010217,0104,010401,0106,012032,012624,015025,...,zombie,zombiefunrun2014,zombies,zone,zonesthank,zonewolf123,zouma,zss,zumiez,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Same layout as the training TF-IDF frame: one column per vocabulary term
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
X_test_tfidf_df = pd.DataFrame(data=X_test_tfidf.toarray(), columns=tfidf_feature_names)
X_test_tfidf_df.head(n=3)

Unnamed: 0,0011,005225,010156,010217,0104,010401,0106,012032,012624,015025,...,zombie,zombiefunrun2014,zombies,zone,zonesthank,zonewolf123,zouma,zss,zumiez,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.2- Sentiment Analysis Features

In [39]:
# Function to calculate polarity and subjectivity
def get_sentiment(text):
    """Return the TextBlob ``(polarity, subjectivity)`` pair for ``text``.

    Parameters
    ----------
    text : str
        The tweet text to score.

    Returns
    -------
    tuple
        ``(polarity, subjectivity)`` as reported by ``TextBlob(text).sentiment``.
        Non-string input (``None``, ``NaN``, ...) yields the neutral pair
        ``(0.0, 0.0)`` instead of raising.
    """
    # An empty text cell comes back from a CSV as NaN (a float), which TextBlob
    # cannot parse; score such rows as neutral rather than aborting the whole apply.
    if not isinstance(text, str):
        return 0.0, 0.0
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [41]:
# Apply sentiment analysis to the training text.
# Score every tweet first, then build the frame once from the list of
# (polarity, subjectivity) tuples; this avoids constructing a separate pd.Series
# object for every row while producing the same columns and index.
X_train_sentiment = pd.DataFrame(
    X_train.map(get_sentiment).tolist(),
    columns=['polarity', 'subjectivity'],
    index=X_train.index,
)
X_train_sentiment

Unnamed: 0,polarity,subjectivity
0,0.166667,0.333333
1,0.000000,0.000000
2,0.000000,0.000000
3,-0.125000,1.000000
4,0.000000,0.000000
...,...,...
6085,0.500000,0.500000
6086,0.000000,0.500000
6087,0.107143,0.214286
6088,0.000000,1.000000


In [43]:
# Apply sentiment analysis to the test text.
# Score every tweet first, then build the frame once from the list of
# (polarity, subjectivity) tuples; this avoids constructing a separate pd.Series
# object for every row while producing the same columns and index.
X_test_sentiment = pd.DataFrame(
    X_test.map(get_sentiment).tolist(),
    columns=['polarity', 'subjectivity'],
    index=X_test.index,
)
X_test_sentiment

Unnamed: 0,polarity,subjectivity
0,0.136364,0.454545
1,0.000000,0.000000
2,0.000000,0.000000
3,0.400000,0.625000
4,0.000000,0.000000
...,...,...
1518,0.000000,0.000000
1519,0.000000,0.000000
1520,0.100000,0.400000
1521,0.000000,0.000000


## 2.3- Additional Features: Tweet Length, Hashtags, Mentions

In [45]:
# Function to calculate additional features
def extract_additional_features(text):
    """Compute simple surface features of a tweet.

    Parameters
    ----------
    text : str
        The tweet text. Non-string input (``None``, ``NaN``, ...) is treated as
        an empty string so that a missing row yields all-zero features instead
        of raising.

    Returns
    -------
    pandas.Series
        Indexed by ``'tweet_length'`` (number of characters), ``'num_hashtags'``
        (matches of ``#`` followed by word characters) and ``'num_mentions'``
        (matches of ``@`` followed by word characters). The counts are taken on
        the string exactly as given, so they are 0 if those symbols were
        stripped before this function is called.
    """
    if not isinstance(text, str):
        text = ""
    tweet_length = len(text)
    num_hashtags = len(re.findall(r"#\w+", text))
    num_mentions = len(re.findall(r"@\w+", text))
    return pd.Series([tweet_length, num_hashtags, num_mentions], index=['tweet_length', 'num_hashtags', 'num_mentions'])

In [47]:
# Derive the length / hashtag / mention features for the training text
# (the test text is handled in its own cell).
# NOTE(review): X_train is taken from `cleaned_text`; the data preview shows '#' already
# removed in that column and the counts displayed below are all 0, so num_hashtags /
# num_mentions may be constant here. Confirm whether the raw `text` column was intended.
X_train_additional = X_train.apply(extract_additional_features)
X_train_additional

Unnamed: 0,tweet_length,num_hashtags,num_mentions
0,114,0,0
1,72,0,0
2,102,0,0
3,93,0,0
4,41,0,0
...,...,...,...
6085,81,0,0
6086,131,0,0
6087,76,0,0
6088,111,0,0


In [49]:
# Derive the length / hashtag / mention features for the test text.
# NOTE(review): X_test is taken from `cleaned_text`; the data preview shows '#' already
# removed in that column and the counts displayed below are all 0, so num_hashtags /
# num_mentions may be constant here. Confirm whether the raw `text` column was intended.
X_test_additional = X_test.apply(extract_additional_features)
X_test_additional

Unnamed: 0,tweet_length,num_hashtags,num_mentions
0,64,0,0
1,94,0,0
2,65,0,0
3,111,0,0
4,102,0,0
...,...,...,...
1518,26,0,0
1519,96,0,0
1520,58,0,0
1521,105,0,0


## 2.4- Combine All Features

In [53]:
# Stack every training feature block side by side (rows are aligned on the index).
# NOTE(review): the count and TF-IDF frames use the same term names as column labels,
# so the combined frame contains duplicate column labels.
train_feature_blocks = [X_train_count_df, X_train_tfidf_df, X_train_sentiment, X_train_additional]
X_train_combined = pd.concat(train_feature_blocks, axis='columns')
X_train_combined.head()

Unnamed: 0,0011,005225,010156,010217,0104,010401,0106,012032,012624,015025,...,zonewolf123,zouma,zss,zumiez,zzzz,polarity,subjectivity,tweet_length,num_hashtags,num_mentions
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.333333,114,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-0.125,1.0,93,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41,0,0


In [55]:
# Stack every test feature block side by side (rows are aligned on the index).
# NOTE(review): the count and TF-IDF frames use the same term names as column labels,
# so the combined frame contains duplicate column labels.
test_feature_blocks = [X_test_count_df, X_test_tfidf_df, X_test_sentiment, X_test_additional]
X_test_combined = pd.concat(test_feature_blocks, axis='columns')
X_test_combined.head()

Unnamed: 0,0011,005225,010156,010217,0104,010401,0106,012032,012624,015025,...,zonewolf123,zouma,zss,zumiez,zzzz,polarity,subjectivity,tweet_length,num_hashtags,num_mentions
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.136364,0.454545,64,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.4,0.625,111,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102,0,0


In [57]:
# Both splits must end up with the same number of feature columns
X_train_combined.shape, X_test_combined.shape

((6090, 27533), (1523, 27533))

## 2.5- Train Models and Evaluate Performance

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

### 1. Logistic Regression

In [None]:
# Fit a logistic-regression baseline on the combined features.
# max_iter is raised above the library default because the feature matrix is wide and
# unscaled; warnings are silenced globally in the import cell, so a solver that stopped
# before converging would otherwise go unnoticed.
model_logreg = LogisticRegression(max_iter=1000)
model_logreg.fit(X_train_combined, y_train)
predlogreg = model_logreg.predict(X_test_combined)

In [None]:
# Share of test tweets the logistic-regression model labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=predlogreg)
print(f"Accuracy: {accuracy * 100:.2f}%")

### 2. Random Forest

In [None]:
# Fit a random forest on the combined features.
# The forest is randomized (bootstrap samples, feature sub-sampling), so the seed is
# pinned to make the reported accuracy repeatable; 42 matches the train/test split seed.
model_ran_for = RandomForestClassifier(random_state=42)
model_ran_for.fit(X_train_combined, y_train)
pred_random = model_ran_for.predict(X_test_combined)

In [None]:
# Share of test tweets the random-forest model labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=pred_random)
print(f"Accuracy: {accuracy * 100:.2f}%")

### 3. MLP Classifier

In [None]:
# Fit a multi-layer perceptron on the combined features.
# Weight initialization and batch shuffling are random, so the seed is pinned to make
# the reported accuracy repeatable; 42 matches the train/test split seed.
model_mlp = MLPClassifier(random_state=42)
model_mlp.fit(X_train_combined, y_train)
pred_mlp = model_mlp.predict(X_test_combined)

In [None]:
# Share of test tweets the MLP model labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=pred_mlp)
print(f"Accuracy: {accuracy * 100:.2f}%")