<a href="https://www.kaggle.com/code/jakubwalczykowski/disaster-tweets?scriptVersionId=123390050" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install emot
!pip install symspellpy

In [None]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns



import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pkg_resources
from symspellpy import SymSpell, Verbosity
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
import pickle
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS
from wordcloud import WordCloud
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# 1. Data upload and short EDA

In [None]:
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sub_df = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
train_df

In [None]:
train_df['target'].hist()

In [None]:
train_df.head(10)

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
def visualize(label):
    """Show a word cloud built from the tweets whose ``target`` equals *label*.

    Reads the module-level ``train_df``. Every matching tweet is lower-cased
    and followed by a single space before the pieces are concatenated into
    the text handed to ``WordCloud``.
    """
    tweets = train_df[train_df['target'] == label]['text']
    corpus = ''.join(tweet.lower() + ' ' for tweet in tweets)
    cloud = WordCloud(width=600, height=400).generate(corpus)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()

In [None]:
#non-disaster tweets before preprocessing
visualize(0)

In [None]:
#disaster tweets before preprocessing
visualize(1)

In [None]:
train_df[train_df["target"] == 0]["text"].values[0]

In [None]:
train_df[train_df["target"] == 1]["text"].values[0]

In [None]:
sub_df.head()

# 2. Preprocessing 
Most of the preprocessing techniques are adapted from Rohit Garud's notebook: https://www.kaggle.com/code/rohitgarud/all-almost-data-preprocessing-techniques-for-nlp#Final-Stopward-Removal  

In [None]:
#converting emojis
def convert_emojis(text):
    """Replace each emoji in *text* with its underscore-joined description.

    For every entry of ``UNICODE_EMOJI`` the description has its commas and
    colons removed, and its whitespace-separated parts are joined with "_".
    """
    for emoji_char, description in UNICODE_EMOJI.items():
        parts = description.replace(",", "").replace(":", "").split()
        text = text.replace(emoji_char, "_".join(parts))
    return text
train_df["text"] = train_df["text"].apply(convert_emojis)
test_df["text"] = test_df["text"].apply(convert_emojis)

In [None]:
# Prepend the keyword (empty string when missing) to each tweet's text.
for frame in (train_df, test_df):
    frame["keyword"] = frame["keyword"].fillna("")
    frame["text"] = frame["keyword"] + " " + frame["text"]

# The keyword now lives inside the text and location is not used afterwards.
train_df = train_df.drop(['keyword', 'location'], axis=1)
test_df = test_df.drop(['keyword', 'location'], axis=1)
train_df

In [None]:
#removing html code
def remove_html(text):
    """Strip HTML markup from *text* and return only its text content.

    The parser is named explicitly. Without it BeautifulSoup picks whichever
    parser happens to be installed and emits a warning, so the result could
    differ from one environment to another.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
train_df["text"] = train_df["text"].apply(remove_html)
test_df["text"] = test_df["text"].apply(remove_html)
train_df

In [None]:
#converting capital letters
train_df["text"] = train_df["text"].str.lower()
test_df["text"] = test_df["text"].str.lower()
train_df

In [None]:
#removing urls
# An http(s) URL runs from the scheme up to the next whitespace character.
_URL_PATTERN = re.compile(r'https?://\S+')


def remove_urls(text):
    """Return *text* with every http:// or https:// URL removed.

    The whole URL is dropped, including multi-part domains, nested paths and
    query strings, so no fragments such as ".uk/a/b?c=1" are left behind.
    Surrounding whitespace is kept as is.
    """
    return _URL_PATTERN.sub("", text)
train_df["text"] = train_df["text"].apply(remove_urls)
test_df["text"] = test_df["text"].apply(remove_urls)
train_df

In [None]:
#removing mentions
def remove_mentions(text):
    """Delete every @mention ("@" followed by word characters) from *text*."""
    return re.sub(r"@\w+", "", text)
train_df["text"] = train_df["text"].apply(remove_mentions)
test_df["text"] = test_df["text"].apply(remove_mentions)
train_df

In [None]:
#removing unicode chars
def remove_unicode_chars(text):
    """Return *text* with every non-ASCII character dropped."""
    return "".join(ch for ch in text if ord(ch) < 128)
train_df["text"] = train_df["text"].apply(remove_unicode_chars)
test_df["text"] = test_df["text"].apply(remove_unicode_chars)
train_df

In [None]:
#removing punctuations
string.punctuation
# Translation table sending each character of string.punctuation to one space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def remove_punctuations(text):
    """Replace every ``string.punctuation`` character in *text* with a space."""
    return text.translate(_PUNCT_TO_SPACE)
train_df["text"] = train_df["text"].apply(remove_punctuations)
test_df["text"] = test_df["text"].apply(remove_punctuations)
train_df

In [None]:
#removing extra spaces
def remove_extra_spaces(text):
    """Collapse each run of whitespace in *text* to one space and trim the ends.

    Tabs and newlines are treated like spaces, so a tweet that spans several
    lines comes out as a single space-separated line.
    """
    return " ".join(text.split())
train_df["text"] = train_df["text"].apply(remove_extra_spaces)
test_df["text"] = test_df["text"].apply(remove_extra_spaces)
train_df

In [None]:
#correcting spelling with symspell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# Word-frequency dictionary shipped inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
# load_dictionary signals a missing file through its boolean return value;
# stop here rather than continuing with an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
def correct_spelling_symspell(text):
    """Spell-correct *text* one whitespace-separated word at a time.

    Each word is replaced by the term of the first suggestion returned by
    ``sym_spell.lookup`` (``Verbosity.CLOSEST``, edit distance up to 2,
    ``include_unknown=True``); the results are rejoined with single spaces.
    """
    corrected = []
    for token in text.split():
        suggestions = sym_spell.lookup(
            token,
            Verbosity.CLOSEST,
            max_edit_distance=2,
            include_unknown=True,
        )
        corrected.append(suggestions[0].term)
    return " ".join(corrected)
train_df["text"] = train_df["text"].apply(correct_spelling_symspell)
test_df["text"] = test_df["text"].apply(correct_spelling_symspell)
train_df

In [None]:
#Correcting Compounded Words
# Bigram frequency dictionary shipped inside the symspellpy package.
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
)
# load_bigram_dictionary signals a missing file through its boolean return
# value; stop here rather than continuing without bigram data.
if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
    raise FileNotFoundError(f"SymSpell bigram dictionary not found: {bigram_path}")
def correct_spelling_symspell_compound(text):
    """Run ``sym_spell.lookup_compound`` on each word of *text* and rejoin them.

    Every whitespace-separated token is replaced by the term of the first
    suggestion (edit distance up to 2); results are joined with single spaces.

    NOTE(review): lookup_compound is called once per token, not on the whole
    text -- confirm that this is intended.
    """
    fixed = []
    for token in text.split():
        best = sym_spell.lookup_compound(token, max_edit_distance=2)[0]
        fixed.append(best.term)
    return " ".join(fixed)
train_df["text"] = train_df["text"].apply(correct_spelling_symspell_compound)
test_df["text"] = test_df["text"].apply(correct_spelling_symspell_compound)
train_df

In [None]:
#removing stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated word of *text* found in ``stop_words``."""
    kept = (token for token in str(text).split() if token not in stop_words)
    return " ".join(kept)
train_df["text"] = train_df["text"].apply(remove_stopwords)
test_df["text"] = test_df["text"].apply(remove_stopwords)
train_df

In [None]:
# lemmatizing
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of *text* with ``lemmatizer``."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))
train_df["text"] = train_df["text"].apply(lemmatize_text)
test_df["text"] = test_df["text"].apply(lemmatize_text)
train_df

In [None]:
#non-disaster tweets after preprocessing
visualize(0)

In [None]:
#disaster tweets after preprocessing
visualize(1)

# 3. Converting to vectors and training models

In [None]:
tfidf = TfidfVectorizer(
    max_features=105325,
    binary=True,
    analyzer='word',
    ngram_range=(1, 3),
    use_idf=True,
    # These are boolean options; recent scikit-learn releases validate the
    # type and no longer accept the integer 1.
    smooth_idf=True,
    sublinear_tf=True,
)
#max_features=10306
# Keep the matrix sparse: the same estimator types are fitted on a sparse
# CountVectorizer matrix further down, and a dense copy would hold
# n_samples * n_features floats in memory.
X = tfidf.fit_transform(train_df['text'])
y = train_df['target']
tfidfX_train, tfidfX_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print(tfidfX_train.shape)
print(tfidfX_test.shape)
tfidfX_test

In [None]:
# from imblearn.over_sampling import RandomOverSampler
# RandomOverSampler = RandomOverSampler(random_state = 42)
# tfidfX_train, y_train = RandomOverSampler.fit_resample(tfidfX_train, y_train)
# print (tfidfX_train.shape)

In [None]:
# from imblearn.over_sampling import SMOTE
# SMOTE = SMOTE(random_state = 42)
# tfidfX_train, y_train = SMOTE.fit_resample(tfidfX_train, y_train)
# print(tfidfX_train.shape)


In [None]:
clf1 = LogisticRegression()
clf1.fit(tfidfX_train, y_train)
predictions = clf1.predict_proba(tfidfX_test)
print("Train acc:", clf1.score(tfidfX_train, y_train))
print("Test acc:", clf1.score(tfidfX_test, y_test))

In [None]:
Pr_train = clf1.predict_proba(tfidfX_train)[:, 1]
Pr_test = clf1.predict_proba(tfidfX_test)[:, 1]
print("Train AUC:", roc_auc_score(y_train, Pr_train))
print("Test AUC:", roc_auc_score(y_test, Pr_test))

In [None]:
# clf1 is already fitted on this training split; reuse it instead of retraining.
y_predict = clf1.predict(tfidfX_test)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

In [None]:
clf2 = MultinomialNB()
clf2.fit(tfidfX_train, y_train)
predictions = clf2.predict_proba(tfidfX_test)
print("Train acc:", clf2.score(tfidfX_train, y_train))
print("Test acc:", clf2.score(tfidfX_test, y_test))

In [None]:
Pr_train = clf2.predict_proba(tfidfX_train)[:, 1]
Pr_test = clf2.predict_proba(tfidfX_test)[:, 1]
print("Train AUC:", roc_auc_score(y_train, Pr_train))
print("Test AUC:", roc_auc_score(y_test, Pr_test))

In [None]:
# clf2 is already fitted on this training split; reuse it instead of retraining.
y_predict = clf2.predict(tfidfX_test)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

In [None]:
cvt = CountVectorizer(analyzer='word', binary=True,
            ngram_range=(1, 3) )
X = cvt.fit_transform(train_df['text'])
#.toarray()
y=train_df['target']

cvtX_train, cvtX_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state=42)
print (cvtX_train.shape)
print (cvtX_test.shape)
cvtX_train

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Give the instance its own name so the imported class is not shadowed.
# Only the training split is resampled; cvtX_test / y_test stay untouched.
oversampler = RandomOverSampler(random_state=42)
cvtX_train, y_train = oversampler.fit_resample(cvtX_train, y_train)
print(cvtX_train.shape)

In [None]:
# from imblearn.over_sampling import SMOTE
# SMOTE = SMOTE(random_state = 42)
# cvtX_train, y_train = SMOTE.fit_resample(cvtX_train, y_train)
# print(cvtX_train.shape)

In [None]:
clf3 = LogisticRegression()
clf3.fit(cvtX_train, y_train)
predictions = clf3.predict_proba(cvtX_test)######
print("Train acc:", clf3.score(cvtX_train, y_train))
print("Test acc:", clf3.score(cvtX_test, y_test))

In [None]:
Pr_train = clf3.predict_proba(cvtX_train)[:, 1]
Pr_test = clf3.predict_proba(cvtX_test)[:, 1]
print("Train AUC:", roc_auc_score(y_train, Pr_train))
print("Test AUC:", roc_auc_score(y_test, Pr_test))

In [None]:
# clf3 is already fitted on this training split; reuse it instead of retraining.
y_predict = clf3.predict(cvtX_test)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

In [None]:
clf4 = MultinomialNB()
clf4.fit(cvtX_train, y_train)
predictions = clf4.predict_proba(cvtX_test)
print("Train acc:", clf4.score(cvtX_train, y_train))
print("Test acc:", clf4.score(cvtX_test, y_test))

In [None]:
Pr_train = clf4.predict_proba(cvtX_train)[:, 1]
Pr_test = clf4.predict_proba(cvtX_test)[:, 1]
print("Train AUC:", roc_auc_score(y_train, Pr_train))
print("Test AUC:", roc_auc_score(y_test, Pr_test))

In [None]:
# clf4 is already fitted on this training split; reuse it instead of retraining.
y_predict = clf4.predict(cvtX_test)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

In [None]:
test_df

In [None]:
test_df.text=cvt.transform(test_df.text)

In [None]:
test_predcit = clf3.predict(test_df.text)

test_predcit

In [None]:
sub_df.target = test_predcit
sub_df.head(50)

In [None]:
sub_df['target'].value_counts()

In [None]:
sub_df.to_csv('submission.csv', index=False)



Thank you, feedback is highly appreciated! :)

Please upvote if you found this helpful 👍
