In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jenish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jenish\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jenish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Load the dataset
data = pd.read_csv("disaster_tweets_data.csv")

In [7]:
# Preprocessing function
def preprocess_text(text):
    # Tokenizing words
    words = nltk.word_tokenize(text)
    # Convert words to lower case
    words = [word.lower() for word in words]
    # Removing punctuations
    words = [word for word in words if word.isalnum()]
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]
    # Lemmatizing words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

In [8]:
# Applying preprocessing to tweets
data['tweets'] = data['tweets'].apply(preprocess_text)

In [9]:
# Splitting data into independent and dependent features
X = data['tweets']
y = data['target']

In [10]:
# Transforming words into vectors using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)


In [11]:
# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Model training and evaluation
models = {
    "Multinomial Naïve Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "KNN Classification": KNeighborsClassifier()
}
for name, model in models.items():
    # Training the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)

In [15]:
# Evaluation
print(f"Model: {name}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\n")

Model: KNN Classification
Confusion Matrix:
[[733 141]
 [214 435]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.84      0.81       874
           1       0.76      0.67      0.71       649

    accuracy                           0.77      1523
   macro avg       0.76      0.75      0.76      1523
weighted avg       0.77      0.77      0.76      1523





In [16]:
# Reporting the model with the best accuracy
best_model = max(models, key=lambda x: models[x].score(X_test, y_test))
print(f"The best model is: {best_model} with an accuracy of {models[best_model].score(X_test, y_test)}")

The best model is: Logistic Regression with an accuracy of 0.7957977675640184
