In [None]:
pip install contractions

# **1. IMPORTING LIBRARIES AND MODULES**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import contractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# 2. **EXPLORATORY DATA ANALYSIS**

## 2.1 IMPORTING TRAINING FILE  

In [None]:
# Read the competition's training file into `df` and preview the first rows.
train_csv_path = r"/kaggle/input/nlp-getting-started/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

## 2.2 DATA CLEANING AND FEATURE SELECTION

In [None]:
# Remove the `id` column in place, then count the missing values
# in each of the remaining columns.
df.drop(columns=['id'], inplace=True)
df.isna().sum()

In [None]:
# Number of distinct non-missing values in every column.
df.nunique(axis=0, dropna=True)

In [None]:
# Remove the `location` column in place, then count the rows that exactly
# repeat an earlier row.
df.drop(columns=['location'], inplace=True)
df.duplicated(keep='first').sum()

In [None]:
# Discard exact duplicate rows (keeping the first occurrence), then every row
# that still has a missing value. Both steps modify `df` in place.
df.drop_duplicates(keep='first', inplace=True)
df.dropna(axis=0, inplace=True)

# Re-run both checks so the output shows what is left after the clean-up.
remaining_nulls = df.isnull().sum()
remaining_duplicates = df.duplicated().sum()
print("Null Values:", remaining_nulls)
print("Duplicates:", remaining_duplicates)

In [None]:
# Learn an integer code for every distinct keyword, then overwrite the column
# with those codes. `le` stays in scope because later cells use it again.
le = LabelEncoder()
le.fit(df['keyword'])
df['keyword'] = le.transform(df['keyword'])
df.head(40)

In [None]:
# Shuffle the rows once and renumber the index from 0.
# The seed is fixed so the shuffled order -- and therefore the train/test split
# and every score computed from it -- is identical on each Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

# 3. DATA PRE-PROCESSING

## 3.1 TEXT EXTRACTION AND TOKENIZATION

In [None]:
# Built once and shared by every call: reloading the stop-word corpus and
# constructing a new stemmer for each tweet is repeated work with no effect
# on the result.
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def text_preprocessing(text):
    """Normalise one piece of raw text into space-separated stemmed tokens.

    The steps run in this order:
      1. expand contractions with ``contractions.fix``;
      2. delete every character that is not an ASCII letter or whitespace;
      3. convert to lower case;
      4. drop NLTK's English stop words;
      5. Porter-stem each remaining word.

    Parameters
    ----------
    text : str
        The raw text. Any non-string value (for example a missing value)
        is treated as empty text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; '' when nothing survives.
    """
    # Guard against NaN/None cells so one bad row cannot abort a whole .apply().
    if not isinstance(text, str):
        return ''

    # 1. Expanding contractions
    text = contractions.fix(text)

    # 2. Extracting only alphabets
    text = _NON_ALPHA_PATTERN.sub('', text)

    # 3. Converting to Lower-case
    text = text.lower()

    # 4 + 5. Removing stopwords and stemming the remaining words in one pass
    return ' '.join(
        _STEMMER.stem(word)
        for word in text.split()
        if word not in _ENGLISH_STOPWORDS
    )

# Replace every raw tweet with its cleaned, stemmed form.
df['text'] = df['text'].map(text_preprocessing)
df

## 3.2 FEATURE EXTRACTION AND VECTORIZATION

In [None]:
# Separate the target column from the predictor columns.
labels = df['target']
features = df.drop(columns='target')

# Turn the cleaned text into a dense TF-IDF matrix limited to 5000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so rows that end up in the held-out split also shape the
# vocabulary and IDF weights.
tfidf = TfidfVectorizer(max_features=5000)
text_feature = tfidf.fit_transform(features['text']).toarray()

# Append the encoded keyword as one extra column on the right.
keyword_column = features[['keyword']].to_numpy()
features = np.hstack([text_feature, keyword_column])
features.shape

# 4. TRAIN_TEST_SPLIT

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the partition repeatable.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
features_train, features_test, labels_train, labels_test = split_parts

# 5. TRAINING THE MODELS

## 5.1 DEFINING DICTIONARIES 

In [None]:
# Seed shared by every estimator that draws random numbers while fitting, so
# the comparison (and the "best model" derived from it) is the same on each run.
SEED = 42

# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are trained, tabulated and plotted.
Model_dict = {
    'XGB': XGBClassifier(random_state=SEED),
    'XGBRF': XGBRFClassifier(random_state=SEED),
    'GaussianNB': GaussianNB(),
    'BernoulliNB': BernoulliNB(),
    'MultinomialNB': MultinomialNB(),
    'ComplementNB': ComplementNB(),
    'DecisionTree': DecisionTreeClassifier(random_state=SEED),
    'RandomForest': RandomForestClassifier(random_state=SEED),
    'AdaBoost': AdaBoostClassifier(random_state=SEED),
    'Bagging': BaggingClassifier(random_state=SEED),
    'ExtraTrees': ExtraTreesClassifier(random_state=SEED),
    'GB': GradientBoostingClassifier(random_state=SEED),
    'HistGB': HistGradientBoostingClassifier(random_state=SEED),
    'LogisticRegression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
}

# One score per model name, filled in by the training loop.
Training_Accuracy_dict = {}
Precision_dict = {}
Recall_dict = {}
F1_score_dict = {}

## 5.2 FITTING ON THE TRAINING DATA, PREDICTING ON THE HELD-OUT DATA, AND CONFUSION MATRICES

In [None]:
# Fit every candidate on the training split and score it on the held-out split.
# NOTE: the "Training_" prefix on the accuracy dict is kept because later cells
# read that name, but every score stored here is measured on
# features_test / labels_test, not on the training data.
for key, val in Model_dict.items():
    val.fit(features_train, labels_train)

    test_pred = val.predict(features_test)

    # scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy and
    # F1 do not care, but with the arguments reversed precision and recall
    # swap places and the confusion matrix comes out transposed.
    Training_Accuracy_dict[key] = accuracy_score(labels_test, test_pred)
    Precision_dict[key] = precision_score(labels_test, test_pred)
    Recall_dict[key] = recall_score(labels_test, test_pred)
    F1_score_dict[key] = f1_score(labels_test, test_pred)

    # Rows are the true classes, columns the predicted classes.
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(labels_test, test_pred),
        display_labels=['Not A Disaster', 'Disaster'],
    ).plot(ax=ax)
    ax.set_title(key)
    plt.show()
    # Release the figure explicitly; one is created per model.
    plt.close(fig)

## 5.3 COMPARING ACCURACIES

### 5.3.1 MAKING A DATAFRAME CONTAINING ALL ACCURACIES

In [None]:
# Gather the per-model scores into one table, one row per model, in the
# order the models were registered.
accuracy_df = pd.DataFrame({
    'Model Name': list(Model_dict),
    'Training Accuracy': list(Training_Accuracy_dict.values()),
    'Precision': list(Precision_dict.values()),
    'Recall': list(Recall_dict.values()),
    'F1 Score': list(F1_score_dict.values()),
})
accuracy_df

### 5.3.2 COMPARISON GRAPHS

#### 5.3.2.1 TRAINING ACCURACY GRAPH

In [None]:
# Bar chart of the stored accuracy score of every model.
fig, ax = plt.subplots(figsize=(22, 8))
ax.bar(list(Training_Accuracy_dict), list(Training_Accuracy_dict.values()))
ax.set_xlabel('Model Name')
ax.set_ylabel('Training Accuracy')
ax.set_title('Comparison Graph for the Training Accuracy of the Models used')
plt.show()

#### 5.3.2.2 PRECISION GRAPH

In [None]:
# Bar chart of the stored precision score of every model.
fig, ax = plt.subplots(figsize=(22, 8))
ax.bar(list(Precision_dict), list(Precision_dict.values()))
ax.set_xlabel('Model Name')
ax.set_ylabel('Precision Score')
ax.set_title('Comparison Graph for the Precision Score of the Models used')
plt.show()

#### 5.3.2.3 RECALL GRAPH

In [None]:
# Bar chart of the stored recall score of every model.
fig, ax = plt.subplots(figsize=(22, 8))
ax.bar(list(Recall_dict), list(Recall_dict.values()))
ax.set_xlabel('Model Name')
ax.set_ylabel('Recall')
ax.set_title('Comparison Graph for the Recall Score of the Models used')
plt.show()

#### 5.3.2.4 F1-SCORE GRAPH

In [None]:
# Bar chart of the stored F1 score of every model.
fig, ax = plt.subplots(figsize=(22, 8))
ax.bar(list(F1_score_dict), list(F1_score_dict.values()))
ax.set_xlabel('Model Name')
ax.set_ylabel('F1-Score')
ax.set_title('Comparison Graph for the F1-Score of the Models used')
plt.show()

## 5.4 FINDING OUT THE BEST MODELS

In [None]:
# For each metric, print the name of the model holding the highest score.
# max() returns the first such name when several models tie.
best_by_metric = (
    ("Best Accuracy:", Training_Accuracy_dict),
    ("Best Precision:", Precision_dict),
    ("Best Recall:", Recall_dict),
    ("Best F1-Score:", F1_score_dict),
)
for heading, metric_scores in best_by_metric:
    print(heading, max(metric_scores, key=metric_scores.get))

We will take the model with the best accuracy.

# 6. REAL-LIFE DATASET

## 6.1 IMPORTING TEST DATA

In [None]:
# Read the competition's test file into `test_df` and preview the first rows.
test_csv_path = r"/kaggle/input/nlp-getting-started/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head()

## 6.2 INITIALIZING SUBMISSION FILE

In [None]:
# Start the submission table as an independent copy of the test ids;
# the predicted target column is added later.
submission_df = test_df[['id']].copy()
submission_df.head()

## 6.3 PRE-PROCESSING THE TEST DATA

In [None]:
# Clean the tweet text with the same pipeline that was applied to the
# training data.
test_df['text'] = test_df['text'].apply(text_preprocessing)

# Encode keywords with the mapping the encoder learned from the training data
# (`le.classes_`). Re-fitting on the test set would derive the codes from the
# test set's own values, which need not line up with the codes the models were
# trained on. Keywords the encoder never saw -- including missing values --
# share one extra code placed right after the known ones.
keyword_to_code = {keyword: code for code, keyword in enumerate(le.classes_)}
unseen_keyword_code = len(le.classes_)
test_df['keyword'] = (
    test_df['keyword']
    .map(keyword_to_code)
    .fillna(unseen_keyword_code)
    .astype(int)
)

# Drop the columns that were also removed from the training data.
test_df.drop(columns=['id', 'location'], inplace=True)
test_df.head()

## 6.4 TEST DATA VECTORIZATION

In [None]:
# Project the test tweets onto the vocabulary and IDF weights that `tfidf`
# already learned. Calling fit_transform here would rebuild the vocabulary from
# the test text, so column j would stand for a different word than the column j
# the models were trained on.
test_df_text = tfidf.transform(test_df['text']).toarray()

# Same layout as the training matrix: TF-IDF columns first, keyword code last.
test_data = np.concatenate([test_df_text, test_df[['keyword']]], axis=1)
test_data.shape

## 6.5 PREDICTING LABELS USING THE BEST MODEL

In [None]:
# Take the model with the highest stored accuracy from the comparison instead
# of hard-coding one estimator, so this cell stays in step with the scores.
# The instance in Model_dict was already fitted on features_train by the
# training loop, so it is reused as is rather than fitted again.
best_model_name = max(Training_Accuracy_dict, key=Training_Accuracy_dict.get)
best_model = Model_dict[best_model_name]
print("Selected model:", best_model_name)

pred = best_model.predict(test_data)
pred

## 6.6 PREPARING AND SUBMITTING SUBMISSION FILE

In [None]:
# Attach the predicted labels next to the ids and write the file, without the
# DataFrame index.
submission_df = submission_df.assign(target=pred)
submission_df.to_csv('submission.csv', index=False)