<a href="https://colab.research.google.com/github/Grg0rry/NLP-Workshop/blob/main/NLP_Workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initialize Dataset and Libraries

In [None]:
!pip install nltk
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import necessary library & packages
import html
import re
import string

import contractions
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Import Dataset
train_df = pd.read_csv("train.csv")

### Exploratory Data Analysis (EDA)

In [None]:
# Check Dataframe
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
 # keep only text and target
train_df = train_df[['text', 'target']]
train_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Observe distribution of target
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Analyse tweets with real disaster (target = 1)
train_df[train_df['target'] == 1][['text']]

Unnamed: 0,text
0,Our Deeds are the Reason of this #earthquake M...
1,Forest fire near La Ronge Sask. Canada
2,All residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent this photo from Ruby #Alaska as ...
...,...
7608,Two giant cranes holding a bridge collapse int...
7609,@aria_ahrary @TheTawniest The out of control w...
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,Police investigating after an e-bike collided ...


In [None]:
# Analyse tweets that are not disaster (target = 0)
train_df[train_df['target'] == 0][['text']]

Unnamed: 0,text
15,What's up man?
16,I love fruits
17,Summer is lovely
18,My car is so fast
19,What a goooooooaaaaaal!!!!!!
...,...
7581,@engineshed Great atmosphere at the British Li...
7582,Cramer: Iger's 3 words that wrecked Disney's s...
7584,These boxes are ready to explode! Exploding Ki...
7587,Sirens everywhere!


Potential preprocessing ideas:
1. remove user mentions (@...)
2. remove URLs (http://..., https://...)
3. expand contractions (I'm to I am, You're to You are)
4. remove symbols (#, !, etc.) and numbers
5. lowercase the text

In [None]:
# Check for missing values and datatype
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


In [None]:
train_df.dtypes

text      object
target     int64
dtype: object

In [None]:
train_df['text'].isna().value_counts()

False    7613
Name: text, dtype: int64

In [None]:
train_df['target'].isna().value_counts()

False    7613
Name: target, dtype: int64

### Data Preprocessing

In [None]:
# Filter noise
def filter_noise(text):
    """Normalise a raw tweet into lowercase text made of letters and whitespace.

    The steps run in this order:
      1. decode HTML entities (e.g. ``&amp;`` becomes ``&``)
      2. remove @mentions
      3. remove ``http://`` and ``https://`` URLs
      4. expand contractions (I've -> I have, I'm -> I am)
      5. remove every character that is not an ASCII letter or whitespace
      6. convert to lowercase

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is left as-is, so runs of spaces may
        remain where mentions, URLs, digits or symbols were removed.
    """
    # decode HTML entities first; otherwise "&amp;" loses only its "&" and ";"
    # in the symbol-removal step and leaks into the vocabulary as the token "amp"
    text = html.unescape(text)

    # remove all tags (@name)
    text = re.sub(r'@\w+', '', text)

    # remove all URLs (http:// or https://)
    text = re.sub(r'http[s]?://\S+', '', text)

    # expand contractions before stripping symbols, while apostrophes still exist
    text = contractions.fix(text)

    # remove all symbols and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # convert all case to lowercase
    return text.lower()

In [None]:
train_df['clean_text'] = train_df['text'].apply(filter_noise)

In [None]:
train_df[['clean_text']].head(10)

Unnamed: 0,clean_text
0,our deeds are the reason of this earthquake ma...
1,forest fire near la ronge sask canada
2,all residents asked to shelter in place are be...
3,people receive wildfires evacuation orders in...
4,just got sent this photo from ruby alaska as s...
5,rockyfire update california hwy closed in bo...
6,flood disaster heavy rain causes flash floodin...
7,i am on top of the hill and i can see a fire i...
8,there is an emergency evacuation happening now...
9,i am afraid that the tornado is coming to our ...


In [None]:
# Tokenization
def tokenization(text):
    """Break ``text`` into a list of word-level tokens.

    Word-level splitting is done with NLTK's ``word_tokenize``.
    (``sent_tokenize(text)`` would split by sentence instead.)
    """
    return word_tokenize(text)

In [None]:
train_df['clean_text'] = train_df['clean_text'].apply(tokenization)

In [None]:
train_df[['clean_text']].head(10)

Unnamed: 0,clean_text
0,"[our, deeds, are, the, reason, of, this, earth..."
1,"[forest, fire, near, la, ronge, sask, canada]"
2,"[all, residents, asked, to, shelter, in, place..."
3,"[people, receive, wildfires, evacuation, order..."
4,"[just, got, sent, this, photo, from, ruby, ala..."
5,"[rockyfire, update, california, hwy, closed, i..."
6,"[flood, disaster, heavy, rain, causes, flash, ..."
7,"[i, am, on, top, of, the, hill, and, i, can, s..."
8,"[there, is, an, emergency, evacuation, happeni..."
9,"[i, am, afraid, that, the, tornado, is, coming..."


In [None]:
# Remove Stopwords
def remove_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stopwords, preserving order and duplicates.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English stopword list
        (``stopwords.words('english')``) is used, as before.

    Returns
    -------
    list of str
        A new list containing only the tokens not present in ``stop_words``.
    """
    if stop_words is None:
        # list of stopwords
        stop_words = stopwords.words('english')

    # a set gives constant-time membership checks instead of scanning the
    # whole stopword list once per token
    stop_words = set(stop_words)

    # filter out stopwords
    return [token for token in tokens if token not in stop_words]

In [None]:
# Stopwords list
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
train_df['clean_text'] = train_df['clean_text'].apply(remove_stopwords)

In [None]:
train_df[['clean_text']].head(10)

Unnamed: 0,clean_text
0,"[deeds, reason, earthquake, may, allah, forgiv..."
1,"[forest, fire, near, la, ronge, sask, canada]"
2,"[residents, asked, shelter, place, notified, o..."
3,"[people, receive, wildfires, evacuation, order..."
4,"[got, sent, photo, ruby, alaska, smoke, wildfi..."
5,"[rockyfire, update, california, hwy, closed, d..."
6,"[flood, disaster, heavy, rain, causes, flash, ..."
7,"[top, hill, see, fire, woods]"
8,"[emergency, evacuation, happening, building, a..."
9,"[afraid, tornado, coming, area]"


In [None]:
# Stemming/Lemmatization
def stemming(tokens):
    """Return a new list holding the Porter stem of every token, in order."""
    porter = PorterStemmer()
    return [porter.stem(word) for word in tokens]


def lemmatization(tokens):
    """Lemmatize each token using the part of speech assigned by ``pos_tag``.

    Tags starting with 'N' are lemmatized as nouns ('n') and tags starting
    with 'V' as verbs ('v'). Every other tag falls back to adjective ('a').

    Returns a new list of lemmas in the same order as ``tokens``.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    # first letter of the POS tag -> WordNet part-of-speech code
    prefix_to_pos = {'N': 'n', 'V': 'v'}

    return [
        wordnet_lemmatizer.lemmatize(word, prefix_to_pos.get(tag[:1], 'a'))
        for word, tag in pos_tag(tokens)
    ]

In [None]:
# Part of Speech Tagging Example
# Run one sentence through the same cleaning steps as the tweets, then show
# the tag that pos_tag assigns to each remaining word.
sample_text = 'Natural language processing (NLP) is a field of artificial intelligence (AI) that focuses on enabling computers to understand, interpret, and generate human language.'
cleaned = filter_noise(sample_text)
cleaned = tokenization(cleaned)
cleaned = remove_stopwords(cleaned)

for token, tag in pos_tag(cleaned):
    print(f'word: {token}, label: {tag}')

word: natural, label: JJ
word: language, label: NN
word: processing, label: NN
word: nlp, label: JJ
word: field, label: NN
word: artificial, label: JJ
word: intelligence, label: NN
word: ai, label: NN
word: focuses, label: VBZ
word: enabling, label: VBG
word: computers, label: NNS
word: understand, label: VBP
word: interpret, label: JJ
word: generate, label: NN
word: human, label: JJ
word: language, label: NN


In [None]:
train_df['clean_text'] = train_df['clean_text'].apply(lemmatization)

In [None]:
train_df[['clean_text']].head(10)

Unnamed: 0,clean_text
0,"[deed, reason, earthquake, may, allah, forgive..."
1,"[forest, fire, near, la, ronge, sask, canada]"
2,"[resident, ask, shelter, place, notify, office..."
3,"[people, receive, wildfire, evacuation, order,..."
4,"[get, sent, photo, ruby, alaska, smoke, wildfi..."
5,"[rockyfire, update, california, hwy, close, di..."
6,"[flood, disaster, heavy, rain, cause, flash, f..."
7,"[top, hill, see, fire, wood]"
8,"[emergency, evacuation, happen, building, acro..."
9,"[afraid, tornado, come, area]"


In [None]:
# turn each token list back into a single space-separated string
train_df['clean_text'] = train_df['clean_text'].apply(' '.join)

In [None]:
train_df[['clean_text']].head(10)

Unnamed: 0,clean_text
0,deed reason earthquake may allah forgive us
1,forest fire near la ronge sask canada
2,resident ask shelter place notify officer evac...
3,people receive wildfire evacuation order calif...
4,get sent photo ruby alaska smoke wildfires pou...
5,rockyfire update california hwy close directio...
6,flood disaster heavy rain cause flash flood st...
7,top hill see fire wood
8,emergency evacuation happen building across st...
9,afraid tornado come area


In [None]:
train_df[['clean_text', 'text']].head(10)

Unnamed: 0,clean_text,text
0,deed reason earthquake may allah forgive us,Our Deeds are the Reason of this #earthquake M...
1,forest fire near la ronge sask canada,Forest fire near La Ronge Sask. Canada
2,resident ask shelter place notify officer evac...,All residents asked to 'shelter in place' are ...
3,people receive wildfire evacuation order calif...,"13,000 people receive #wildfires evacuation or..."
4,get sent photo ruby alaska smoke wildfires pou...,Just got sent this photo from Ruby #Alaska as ...
5,rockyfire update california hwy close directio...,#RockyFire Update => California Hwy. 20 closed...
6,flood disaster heavy rain cause flash flood st...,#flood #disaster Heavy rain causes flash flood...
7,top hill see fire wood,I'm on top of the hill and I can see a fire in...
8,emergency evacuation happen building across st...,There's an emergency evacuation happening now ...
9,afraid tornado come area,I'm afraid that the tornado is coming to our a...


### Feature Extraction

In [None]:
# Bag of Words
bow_vectorizer = CountVectorizer(max_features = 1000)
bow_x = bow_vectorizer.fit_transform(train_df['clean_text'])

In [None]:
# view features extracted
bow_vectorizer.get_feature_names_out()

array(['aba', 'abandon', 'abc', 'ablaze', 'accident', 'account', 'across',
       'act', 'action', 'actually', 'add', 'affect', 'aftershock', 'ago',
       'agree', 'ahead', 'air', 'aircraft', 'airplane', 'airport',
       'alabama', 'alarm', 'album', 'allow', 'almost', 'alone', 'already',
       'also', 'always', 'ambulance', 'america', 'american', 'amid',
       'amp', 'ancient', 'angry', 'animal', 'annihilate', 'annihilation',
       'anniversary', 'another', 'answer', 'anthrax', 'anyone',
       'anything', 'apocalypse', 'apollo', 'appear', 'area', 'arm',
       'armageddon', 'army', 'around', 'arrest', 'arrive', 'arson',
       'arsonist', 'art', 'article', 'as', 'ash', 'ask', 'ass', 'atomic',
       'attack', 'aug', 'august', 'australia', 'avalanche', 'ave',
       'avoid', 'away', 'awesome', 'baby', 'back', 'bad', 'bag', 'ball',
       'ban', 'bang', 'bar', 'base', 'battle', 'bayelsa', 'bbc', 'beach',
       'beat', 'beautiful', 'become', 'bed', 'begin', 'behind', 'believe',
   

In [None]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features = 1000)
tdidf_x = tfidf_vectorizer.fit_transform(train_df['clean_text'])

In [None]:
# view features extracted
tfidf_vectorizer.get_feature_names_out()

array(['aba', 'abandon', 'abc', 'ablaze', 'accident', 'account', 'across',
       'act', 'action', 'actually', 'add', 'affect', 'aftershock', 'ago',
       'agree', 'ahead', 'air', 'aircraft', 'airplane', 'airport',
       'alabama', 'alarm', 'album', 'allow', 'almost', 'alone', 'already',
       'also', 'always', 'ambulance', 'america', 'american', 'amid',
       'amp', 'ancient', 'angry', 'animal', 'annihilate', 'annihilation',
       'anniversary', 'another', 'answer', 'anthrax', 'anyone',
       'anything', 'apocalypse', 'apollo', 'appear', 'area', 'arm',
       'armageddon', 'army', 'around', 'arrest', 'arrive', 'arson',
       'arsonist', 'art', 'article', 'as', 'ash', 'ask', 'ass', 'atomic',
       'attack', 'aug', 'august', 'australia', 'avalanche', 'ave',
       'avoid', 'away', 'awesome', 'baby', 'back', 'bad', 'bag', 'ball',
       'ban', 'bang', 'bar', 'base', 'battle', 'bayelsa', 'bbc', 'beach',
       'beat', 'beautiful', 'become', 'bed', 'begin', 'behind', 'believe',
   

In [None]:
# TF-IDF without max features
tfidf_vectorizer_2 = TfidfVectorizer()
tdidf_2_x = tfidf_vectorizer_2.fit_transform(train_df['clean_text'])

In [None]:
# view features extracted
tfidf_vectorizer_2.get_feature_names_out()

array(['aa', 'aaaa', 'aaaaaaallll', ..., 'zumiez', 'zurich', 'zzzz'],
      dtype=object)

### Model Fitting

BoW Training and Testing

In [None]:
# Split data to training and testing
# The cleaned text is split first and the vectoriser is fitted on the training
# rows only. Fitting on the whole corpus before splitting lets the test tweets
# influence which words make it into the vocabulary (data leakage).
x = train_df['clean_text']
y = train_df['target']

text_train, text_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Bag of Words, same settings as before; re-bound so that `bow_vectorizer`
# is the vectoriser the models below are actually trained with
bow_vectorizer = CountVectorizer(max_features = 1000)
x_train = bow_vectorizer.fit_transform(text_train)
x_test = bow_vectorizer.transform(text_test)

In [None]:
# Logistic Regression
LR_model = LogisticRegression().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, LR_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = LR_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      3456
           1       0.87      0.75      0.81      2634

    accuracy                           0.85      6090
   macro avg       0.85      0.83      0.84      6090
weighted avg       0.85      0.85      0.84      6090

--- Testing ---
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       886
           1       0.80      0.67      0.73       637

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



In [None]:
# Support Vector Machines (SVM)
SVM_model = SVC().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, SVM_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = SVM_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.88      0.97      0.93      3456
           1       0.96      0.83      0.89      2634

    accuracy                           0.91      6090
   macro avg       0.92      0.90      0.91      6090
weighted avg       0.91      0.91      0.91      6090

--- Testing ---
              precision    recall  f1-score   support

           0       0.78      0.90      0.83       886
           1       0.83      0.64      0.72       637

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.78      1523
weighted avg       0.80      0.79      0.79      1523



In [None]:
# Naive Bayes
NB_model = MultinomialNB().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, NB_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = NB_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.81      0.88      0.84      3456
           1       0.83      0.72      0.77      2634

    accuracy                           0.81      6090
   macro avg       0.82      0.80      0.81      6090
weighted avg       0.81      0.81      0.81      6090

--- Testing ---
              precision    recall  f1-score   support

           0       0.78      0.85      0.81       886
           1       0.76      0.66      0.71       637

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523



TF-IDF Training and Testing

In [None]:
# Split data to training and testing
# The cleaned text is split first and the vectoriser is fitted on the training
# rows only. Fitting on the whole corpus before splitting lets the test tweets
# influence the vocabulary and the IDF weights (data leakage).
x = train_df['clean_text']
y = train_df['target']

text_train, text_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# TF-IDF, same settings as before; re-bound so that `tfidf_vectorizer`
# is the vectoriser the models below are actually trained with
tfidf_vectorizer = TfidfVectorizer(max_features = 1000)
x_train = tfidf_vectorizer.fit_transform(text_train)
x_test = tfidf_vectorizer.transform(text_test)

In [None]:
# Logistic Regression
LR_model = LogisticRegression().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, LR_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = LR_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      3456
           1       0.86      0.73      0.79      2634

    accuracy                           0.83      6090
   macro avg       0.84      0.82      0.83      6090
weighted avg       0.84      0.83      0.83      6090

--- Testing ---
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       886
           1       0.80      0.65      0.72       637

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.77      1523
weighted avg       0.79      0.79      0.78      1523



In [None]:
# Support Vector Machines (SVM)
SVM_model = SVC().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, SVM_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = SVM_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      3456
           1       0.96      0.85      0.90      2634

    accuracy                           0.92      6090
   macro avg       0.93      0.91      0.91      6090
weighted avg       0.92      0.92      0.92      6090

--- Testing ---
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       886
           1       0.83      0.64      0.72       637

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.78      1523
weighted avg       0.80      0.79      0.79      1523



In [None]:
# Naive Bayes
NB_model = MultinomialNB().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, NB_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = NB_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.80      0.90      0.85      3456
           1       0.84      0.70      0.77      2634

    accuracy                           0.81      6090
   macro avg       0.82      0.80      0.81      6090
weighted avg       0.82      0.81      0.81      6090

--- Testing ---
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       886
           1       0.79      0.64      0.71       637

    accuracy                           0.78      1523
   macro avg       0.78      0.76      0.77      1523
weighted avg       0.78      0.78      0.77      1523



TF-IDF 2 (no max_features limit) Training and Testing

In [None]:
# Split data to training and testing
# The cleaned text is split first and the vectoriser is fitted on the training
# rows only. Fitting on the whole corpus before splitting lets the test tweets
# influence the vocabulary and the IDF weights (data leakage).
x = train_df['clean_text']
y = train_df['target']

# NOTE(review): this section holds out 30% while the BoW and TF-IDF sections
# hold out 20%, so the scores are not directly comparable - confirm intended.
text_train, text_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# TF-IDF without max features; re-bound so that `tfidf_vectorizer_2`
# is the vectoriser the models below are actually trained with
tfidf_vectorizer_2 = TfidfVectorizer()
x_train = tfidf_vectorizer_2.fit_transform(text_train)
x_test = tfidf_vectorizer_2.transform(text_test)

In [None]:
# Logistic Regression
LR_model = LogisticRegression().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, LR_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = LR_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      3004
           1       0.94      0.78      0.85      2325

    accuracy                           0.88      5329
   macro avg       0.90      0.87      0.88      5329
weighted avg       0.89      0.88      0.88      5329

--- Testing ---
              precision    recall  f1-score   support

           0       0.79      0.91      0.85      1338
           1       0.84      0.66      0.74       946

    accuracy                           0.81      2284
   macro avg       0.82      0.78      0.79      2284
weighted avg       0.81      0.81      0.80      2284



In [None]:
# Support Vector Machines (SVM)
SVM_model = SVC().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, SVM_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = SVM_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3004
           1       0.98      0.94      0.96      2325

    accuracy                           0.97      5329
   macro avg       0.97      0.96      0.97      5329
weighted avg       0.97      0.97      0.97      5329

--- Testing ---
              precision    recall  f1-score   support

           0       0.78      0.93      0.85      1338
           1       0.86      0.62      0.72       946

    accuracy                           0.80      2284
   macro avg       0.82      0.78      0.79      2284
weighted avg       0.81      0.80      0.80      2284



In [None]:
# Naive Bayes
NB_model = MultinomialNB().fit(x_train, y_train)

# performance on the rows the model was fitted on
print('--- Training ---')
print(classification_report(y_train, NB_model.predict(x_train)))

# performance on the held-out rows
print('--- Testing ---')
y_pred = NB_model.predict(x_test)
print(classification_report(y_test, y_pred))

--- Training ---
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      3004
           1       0.94      0.81      0.87      2325

    accuracy                           0.89      5329
   macro avg       0.90      0.88      0.89      5329
weighted avg       0.90      0.89      0.89      5329

--- Testing ---
              precision    recall  f1-score   support

           0       0.80      0.90      0.85      1338
           1       0.83      0.67      0.74       946

    accuracy                           0.81      2284
   macro avg       0.81      0.79      0.79      2284
weighted avg       0.81      0.81      0.80      2284

