In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/nikjohn7/Disaster-Tweets-Kaggle/main/data/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# how many rows and columns are in the data set?
df.shape

(7613, 5)

In [4]:
# how many tweets are there in each class?
df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [5]:
# tweets that are about a real disaster
df_real_disaster = df[df['target'] == 1]
df_real_disaster.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# tweets that are not about a real disaster
df_not_disaster = df[df['target'] == 0]
df_not_disaster.head()

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0


In [None]:
# build a text processing and classifier pipeline
# to predict the if the tweet is about a disaster or a non-disaster 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Load the Disaster Tweets dataset
df = pd.read_csv('https://raw.githubusercontent.com/nikjohn7/Disaster-Tweets-Kaggle/main/data/train.csv')

# Display first few rows
print(df.head())

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Create a pipeline that first transforms the text data into TF-IDF vectors, then applies SVM
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
    ('svm', LinearSVC())
])

# Train the classifier
text_clf.fit(X_train, y_train)

# Predict the test set results
y_pred = text_clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       874
           1       0.75      0.72      0.73       649

    accuracy                           0.78      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



In [13]:
from sklearn.model_selection import GridSearchCV 

# define the parameter grid
param_grid = {
    'tfidf__max_df': [0.9, 0.95, 1.0],
    'tfidf__min_df': [1, 2, 5],
    'svm__C': [0.1, 1, 10]
}

# conduct the grid search
grid_search = GridSearchCV(
    text_clf,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='f1_macro'
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# show classification report for the best model
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("\nClassification Report for Best Model:\n")
print(classification_report(y_test, y_pred_best))


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'svm__C': 1, 'tfidf__max_df': 0.9, 'tfidf__min_df': 1}
Best Cross-Validation Score: 0.7782041869018853

Classification Report for Best Model:

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       874
           1       0.75      0.73      0.74       649

    accuracy                           0.78      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523

