## Disaster tweets classification

In [35]:
import pandas as pd
import numpy as np

In [37]:
# Load the labelled training tweets and preview the first five rows.
data = pd.read_csv("disaster_tweets_train.csv")
data.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [39]:
# (number of rows, number of columns) of the loaded frame
data.shape

(7613, 5)

### Check for null values

In [42]:
# Count missing values in each column.
data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

### Drop columns that are not useful

In [45]:
# Columns removed from the frame; the models below only use `text` and `target`.
drop_columns = ['keyword', 'location']
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
data = data.drop(columns=drop_columns, errors='ignore')
data.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [47]:
# Class balance: number of tweets for each value of `target`
data['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

## Train/test split

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation. Stratifying on the label keeps
# the class ratio the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["target"],
    test_size=0.2,
    random_state=2024,
    stratify=data["target"],
)


In [52]:
# Preview the first rows of the training texts
X_train.head()

7187    BUT I will be uploading these videos ASAP so y...
1278            Don't get burned twice by the same flame.
3302    The EFAK would be designed for building occupa...
1872    Only had a crush on one girl in high school an...
6849    games that I really hope to see in AGDQ: Traum...
Name: text, dtype: object

In [54]:
# Show one training tweet in full, looked up by its index label.
X_train.loc[7187]

'BUT I will be uploading these videos ASAP so you guys get to see the new weapon types in action!'

In [56]:
# Preview the first rows of the held-out test texts
X_test.head()

2376    24 killed in two simultaneous rail crash as ac...
2607    Black Eye 9: A space battle occurred at Star O...
5685    I liked a @YouTube video http://t.co/45TWHJ0l6...
3136    Survival Kit Whistle Fire Starter Wire Saw Cre...
6432    &lt; 25 Dead In Kuwait Mosque Suicide Bombing ...
Name: text, dtype: object

## KNN with n_neighbors=10, metric=euclidean

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Count 1- to 3-gram occurrences, then classify each tweet by a vote among
# its 10 nearest training tweets under Euclidean distance.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('clf', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
])

# Train on the training split, then predict the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.60      1.00      0.75       869
           1       0.96      0.14      0.24       654

    accuracy                           0.63      1523
   macro avg       0.78      0.57      0.50      1523
weighted avg       0.76      0.63      0.53      1523



## KNN with n_neighbors=10, metric=cosine

In [63]:
# Same 1- to 3-gram count features as the Euclidean run, but neighbours are
# ranked by cosine distance instead.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('clf', KNeighborsClassifier(n_neighbors=10, metric='cosine')),
])

# Train on the training split, then predict the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.94      0.79       869
           1       0.84      0.42      0.56       654

    accuracy                           0.72      1523
   macro avg       0.76      0.68      0.68      1523
weighted avg       0.75      0.72      0.69      1523



## Random Forest

In [66]:

from sklearn.ensemble import RandomForestClassifier

# Define the pipeline: trigram counts feeding a random forest.
RF = Pipeline([
    ('vect', CountVectorizer(ngram_range=(3, 3))),  # Trigrams only
    # A random forest is randomised (bootstrap samples, feature subsets), so
    # without a fixed random_state the report below changes on every run.
    # 2024 is the same seed the train/test split uses.
    ('clf', RandomForestClassifier(random_state=2024)),
])

# Fit the pipeline on the training data
RF.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.98      0.78       869
           1       0.91      0.31      0.47       654

    accuracy                           0.69      1523
   macro avg       0.78      0.64      0.62      1523
weighted avg       0.76      0.69      0.65      1523



## Multinomial NB with alpha=0.75

In [69]:

from sklearn.naive_bayes import MultinomialNB

# Unigram + bigram counts feeding a Multinomial Naive Bayes model whose
# smoothing parameter alpha is set to 0.75.
NB = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('clf', MultinomialNB(alpha=0.75)),
])

# Train on the training split, then predict the held-out split.
y_pred = NB.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.79      0.88      0.83       869
           1       0.81      0.68      0.74       654

    accuracy                           0.79      1523
   macro avg       0.80      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



## Preprocess the text: remove stop words and punctuation, lemmatize, then apply bag of words

In [78]:
import spacy
# Load the spaCy pipeline once at module level; preprocess_text() below calls it.
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is run through the module-level ``nlp`` pipeline. Tokens flagged
    as stop words or punctuation are dropped; every remaining token
    contributes its ``lemma_``.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)


In [80]:
# Store the cleaned, lemmatized version of every tweet in a new column.
data["preprocess_text"] = data["text"].apply(preprocess_text)

In [82]:
# Preview the frame, now including the preprocess_text column
data.head()

Unnamed: 0,id,text,target,preprocess_text
0,1,Our Deeds are the Reason of this #earthquake M...,1,deed Reason earthquake ALLAH forgive
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near La Ronge Sask Canada
2,5,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfire evacuation orde..."
4,7,Just got sent this photo from Ruby #Alaska as ...,1,get send photo Ruby Alaska smoke wildfire pour...


## Random Forest on preprocessed text

In [91]:
from sklearn.model_selection import train_test_split

# Re-split using the preprocessed text as the feature. The test size, seed
# and stratification are the same as in the raw-text split.
X_train, X_test, y_train, y_test = train_test_split(
    data["preprocess_text"],
    data["target"],
    test_size=0.2,
    random_state=2024,
    stratify=data["target"],
)


In [93]:

from sklearn.ensemble import RandomForestClassifier

# Define the pipeline: trigram counts feeding a random forest.
RF = Pipeline([
    ('vect', CountVectorizer(ngram_range=(3, 3))),  # Trigrams only
    # A random forest is randomised (bootstrap samples, feature subsets), so
    # without a fixed random_state the report below changes on every run.
    # 2024 is the same seed the train/test split uses.
    ('clf', RandomForestClassifier(random_state=2024)),
])

# Fit the pipeline on the training data
RF.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.97      0.78       869
           1       0.91      0.32      0.48       654

    accuracy                           0.70      1523
   macro avg       0.78      0.65      0.63      1523
weighted avg       0.76      0.70      0.65      1523



In [95]:

from sklearn.ensemble import RandomForestClassifier

# Define the pipeline: unigram + bigram counts feeding a random forest.
RF = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),  # Unigrams and bigrams
    # A random forest is randomised (bootstrap samples, feature subsets), so
    # without a fixed random_state the report below changes on every run.
    # 2024 is the same seed the train/test split uses.
    ('clf', RandomForestClassifier(random_state=2024)),
])

# Fit the pipeline on the training data
RF.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.95      0.83       869
           1       0.89      0.54      0.67       654

    accuracy                           0.77      1523
   macro avg       0.81      0.74      0.75      1523
weighted avg       0.80      0.77      0.76      1523



In [99]:

from sklearn.ensemble import RandomForestClassifier

# Define the pipeline: CountVectorizer with its default settings feeding a
# random forest.
RF = Pipeline([
    ('vect', CountVectorizer()),
    # A random forest is randomised (bootstrap samples, feature subsets), so
    # without a fixed random_state the report below changes on every run.
    # 2024 is the same seed the train/test split uses.
    ('clf', RandomForestClassifier(random_state=2024)),
])

# Fit the pipeline on the training data
RF.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.91      0.82       869
           1       0.84      0.59      0.70       654

    accuracy                           0.78      1523
   macro avg       0.80      0.75      0.76      1523
weighted avg       0.79      0.78      0.77      1523



In [105]:

from sklearn.naive_bayes import MultinomialNB

# CountVectorizer and Multinomial Naive Bayes, both with their default
# settings, trained on the preprocessed text.
NB = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Train on the training split, then predict the held-out split.
y_pred = NB.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.79      0.84      0.81       869
           1       0.77      0.71      0.74       654

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.78      1523
weighted avg       0.78      0.78      0.78      1523

