In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Get data

In [None]:
# NOTE(review): absolute, machine-specific path — this cell only runs where this
# exact directory layout exists.
train_csv_path = '/home/jeanluca/code/JeanLucaSchindler/Disaster_Tweets/raw_data/train.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Use the `id` column as the row index. The guard keeps the cell re-runnable:
# once `id` is the index it is no longer a column, so calling set_index('id')
# a second time would raise KeyError.
if train.index.name != 'id':
    train = train.set_index('id')

In [None]:
# Histogram of the `target` column. Explicit axes plus labels make the figure
# readable on its own; plt.show() keeps the raw (counts, bins, patches) tuple
# out of the cell output.
fig, ax = plt.subplots()
ax.hist(train.target)
ax.set(title='Distribution of target (train)', xlabel='target', ylabel='Number of rows')
plt.show()

In [None]:
# Number of missing values in each column of the training frame.
missing_per_column = train.isna().sum()
missing_per_column

In [None]:
# NOTE(review): absolute, machine-specific path — this cell only runs where this
# exact directory layout exists.
test_csv_path = '/home/jeanluca/code/JeanLucaSchindler/Disaster_Tweets/raw_data/test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Use the `id` column as the row index. The guard keeps the cell re-runnable:
# once `id` is the index it is no longer a column, so calling set_index('id')
# a second time would raise KeyError.
if test.index.name != 'id':
    test = test.set_index('id')

## Clean data

In [None]:
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Translation table that deletes every character of string.punctuation in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# NLTK resources shared by all calls to `preprocessing`; filled on first use.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocessing(sentence):
    """Normalise one piece of text into a space-separated string of lemmas.

    Steps, in order: strip surrounding whitespace, lowercase, drop English
    stop words, drop digit characters, drop ``string.punctuation`` characters,
    tokenize with ``word_tokenize``, then lemmatize every token first as a
    verb and then as a noun.

    The stop-word set and the lemmatizer are built once and reused across
    calls instead of being re-created for every sentence / every token.

    Args:
        sentence: The raw text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces.

    NOTE(review): stop words are filtered *before* punctuation is stripped, so
    a token such as ``"the,"`` is not recognised as a stop word and survives
    as ``"the"``. The order is kept as-is so the produced features do not change.
    """
    stop_words, lemmatizer = _nlp_resources()

    # Remove surrounding whitespace and lowercase
    sentence = sentence.strip().lower()

    # Drop stop words (also collapses runs of whitespace to single spaces)
    sentence = ' '.join(word for word in sentence.split() if word not in stop_words)

    # Remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Remove punctuation
    sentence = sentence.translate(_PUNCTUATION_TABLE)

    # Tokenize
    tokens = word_tokenize(sentence)

    # Lemmatize: verb form first, then noun form of that result
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos='v'), pos='n')
        for token in tokens
    ]
    return ' '.join(lemmas)

In [None]:
# Run every raw text through `preprocessing` and store the result in a new column.
train['clean_text'] = train['text'].apply(preprocessing)

In [None]:
# Replace missing `keyword` entries with the integer 0; present values are untouched.
train['keyword'] = train['keyword'].fillna(value=0)

In [None]:
# Replace missing `location` entries with the integer 0; present values are untouched.
train['location'] = train['location'].fillna(value=0)

## Vectorize words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: 1- to 3-word n-grams, a term must occur in at least 0.1% of
# the documents, and the vocabulary is capped at 2341 terms.
tfidf_settings = dict(min_df=0.001, ngram_range=(1, 3), max_features=2341)
vectorizer = TfidfVectorizer(**tfidf_settings)

In [None]:
# Learn the vocabulary / IDF weights on the cleaned training text and encode it.
tfidf_matrix = vectorizer.fit_transform(train['clean_text'])

# Dense, column-labelled view of the TF-IDF matrix. Reusing train.index keeps
# every feature row labelled with the same index value as its row in `train`
# (without it the frame silently falls back to a 0..n-1 RangeIndex).
vectorized_documents = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=train.index,
)

vectorized_documents

## Features and target

In [None]:
# Feature matrix (TF-IDF frame) and the labels taken from the `target` column.
X, y = vectorized_documents, train['target']

In [None]:
# (rows, columns) of the feature matrix X.
X.shape

## MultinomialNB

In [None]:
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Cross-validation
# The TF-IDF step is part of the cross-validated estimator, so the vocabulary and
# IDF weights are learned from each training fold only. Scoring the pre-vectorized
# X instead would let every validation fold influence the features it is scored on.
# clone() yields an unfitted vectorizer with the same settings and leaves the
# already-fitted `vectorizer` untouched.
cv_pipeline = make_pipeline(clone(vectorizer), MultinomialNB(alpha=0.3))
cv_results = cross_validate(cv_pipeline, train['clean_text'], y, cv = 5, scoring = ["f1"])

In [None]:
# Average F1 across the cross-validation folds.
fold_f1_scores = cv_results['test_f1']
fold_f1_scores.mean()

In [None]:
# Naive Bayes classifier for the final fit; alpha is its smoothing parameter.
nb_alpha = 0.3
model = MultinomialNB(alpha=nb_alpha)

In [None]:
# fit() trains the estimator in place and returns that same estimator, so
# `multinomial` is a second name for the fitted `model`.
model.fit(X, y)
multinomial = model

## Predict

In [None]:
# Apply the same `preprocessing` used for the training text to the test text.
test['clean_text'] = test['text'].apply(preprocessing)

In [None]:
# Encode the test text with the vocabulary already fitted on the training text.
# The result is wrapped in a DataFrame whose columns are the vectorizer's feature
# names: the model is fitted on a column-labelled DataFrame, and predicting on an
# unlabelled matrix makes scikit-learn warn about missing feature names.
test_clean = pd.DataFrame(
    vectorizer.transform(test['clean_text']).toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=test.index,
)

In [None]:
# Shape of the encoded test set.
test_clean.shape

In [None]:
# Predicted label for every row of the encoded test set.
results = multinomial.predict(X=test_clean)

In [None]:
# Display the test frame.
test

In [None]:
# Wrap the predictions in a DataFrame.
results = pd.DataFrame(data=results)

In [None]:
# Label each prediction with the index of the corresponding row in `test`.
results = results.set_index(keys=test.index)

In [None]:
# Give the column labelled 0 the name 'target'.
results = results.rename({0: 'target'}, axis='columns')

In [None]:
# Display the predictions frame before it is written to disk.
results

In [None]:
# Write the predictions frame to a CSV file (relative path, so it lands in the
# current working directory).
output_path = 'multinomialNB_pred.csv'
results.to_csv(output_path)