<a href="https://colab.research.google.com/github/Jayveersinh-Raj/BaselineModel_NB_Sklearn_NLP/blob/main/disaster_analysis_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Check whether a GPU is attached to this runtime (-L lists the visible NVIDIA devices)
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-f16f8fc8-1257-91f5-09ea-9a1714a0f0b6)


In [4]:
# Lets import our helper functions
!wget https://raw.githubusercontent.com/Jayveersinh-Raj/helper_functions/main/helper_functions.py
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

--2022-08-24 07:17:39--  https://raw.githubusercontent.com/Jayveersinh-Raj/helper_functions/main/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2022-08-24 07:17:39 (93.2 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [5]:
# importing the data
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
unzip_data("nlp_getting_started.zip") # This is the name of the zip file that Daniel stored on google

--2022-08-24 07:17:49--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.10.128, 142.251.12.128, 172.217.194.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.10.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip.1’


2022-08-24 07:17:49 (111 MB/s) - ‘nlp_getting_started.zip.1’ saved [607343/607343]



In [6]:
# Import the core data libraries and load the training CSV into a DataFrame
import pandas as pd
import numpy as np
data_train = pd.read_csv("train.csv")  # presumably extracted from nlp_getting_started.zip — confirm
data_train.head()  # preview the first rows

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Shuffle every row once; the fixed seed keeps the resulting order reproducible
shuffled_train = data_train.sample(
    frac=1,  # sample 100% of the rows, i.e. a full shuffle
    random_state=42,
)
shuffled_train.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [8]:
# Count the examples per class; the "target" column holds the category label
shuffled_train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [9]:
# Print 5 consecutive rows starting at a random position to get a feel for the data
import random
sample_size = 5
# Highest start index that still leaves `sample_size` rows after it. The earlier bound
# (len - 3) could land too close to the end, where the slice silently yields fewer rows.
# max(..., 0) keeps randint valid for frames shorter than `sample_size`.
rand = random.randint(0, max(len(shuffled_train) - sample_size, 0))
preview = shuffled_train[["text", "target"]].iloc[rand : rand + sample_size] # positional slice
for text, target in preview.itertuples(index=False): # plain (text, target) tuples, row index dropped
    print(f"Target : {target}", "real disaster" if target > 0 else "not a diaster")
    print(f"Text : {text} \n")
    print("--- \n")
      

Target : 1 real disaster
Text : @AFK_10 @Dr_JohanFranzen ISIS are orchs. But they don't have the ability to massacre civilians far from the frontlines like the tyrant. 

--- 

Target : 0 not a diaster
Text : 'Up to 40% of businesses affected by a natural or man-made disaster never reopen'
http://t.co/35JyAp0ul9 

--- 

Target : 0 not a diaster
Text : @Jones94Kyle oh fuck sake he is dead ???? 

--- 

Target : 0 not a diaster
Text : @JMastrodonato so the question is: would you crush Ortiz for bunting as your sports writing forefathers crushed Williams? 

--- 

Target : 1 real disaster
Text : The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/Rg9yaybOSA 

--- 



In [10]:
# Hold out 10% of the shuffled tweets as a validation set (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split
train_lines, val_lines, train_labels, val_labels = train_test_split(
    shuffled_train["text"].to_numpy(),    # tweet texts
    shuffled_train["target"].to_numpy(),  # matching labels
    test_size=0.1,
    random_state=42,
)

In [11]:
# Sizes of the four splits, in order: train texts, val texts, train labels, val labels
tuple(map(len, (train_lines, val_lines, train_labels, val_labels)))

(6851, 762, 6851, 762)

In [12]:
# Preview the first 10 training examples side by side with their labels
import numpy as np
ten_ex = np.array([train_lines[:10], train_labels[:10]]).transpose() # one row per example: (text, label)
visual = pd.DataFrame(data=ten_ex, columns=["Text", "Target"])

visual

Unnamed: 0,Text,Target
0,@mogacola @zamtriossu i screamed after hitting...,0
1,Imagine getting flattened by Kurt Zouma,0
2,@Gurmeetramrahim #MSGDoing111WelfareWorks Gree...,1
3,@shakjn @C7 @Magnums im shaking in fear he's g...,0
4,Somehow find you and I collide http://t.co/Ee8...,0
5,@EvaHanderek @MarleyKnysh great times until th...,1
6,destroy the free fandom honestly,1
7,Weapons stolen from National Guard Armory in N...,0
8,@wfaaweather Pete when will the heat wave pass...,1
9,Patient-reported outcomes in long-term survivo...,1


## Baseline Naive Bayes model for disaster analysis on tweets

In [14]:
# Baseline: TF-IDF features feeding a multinomial Naive Bayes classifier (scikit-learn)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline # chains steps in order, comparable to Keras's Sequential API

# Tokenization + modelling pipeline
model_0 = Pipeline(steps=[
    ("tfid", TfidfVectorizer()),  # converts the raw text to TF-IDF weighted numbers
    ("clf", MultinomialNB()),     # "clf" is short for classifier
])

# Train on the training split; the fitted pipeline is the cell's displayed value
model_0.fit(train_lines, train_labels)

Pipeline(steps=[('tfid', TfidfVectorizer()), ('clf', MultinomialNB())])

In [15]:
# Evaluate the baseline on the validation split and report the score as a percentage
score = model_0.score(val_lines, val_labels)
print(f"The default evaluation metrices is accuracy, which in this case is: \n {score*100:.2f}%")

The default evaluation metrices is accuracy, which in this case is: 
 79.27%


In [16]:
# Predict labels for the validation texts with the fitted pipeline
preds = model_0.predict(val_lines)
preds[:5] # show only the first 5 predictions

array([1, 1, 1, 0, 0])

In [17]:
# Let's create a function that returns a DataFrame of evaluation metrics using sklearn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def evaluation_result(y_true, y_pred):
  """Summarise classification quality as a one-column DataFrame.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned with ``y_true``.

  Returns:
    A DataFrame indexed by metric name ("Accuracy", "Precision", "Recall",
    "F-score"). Accuracy is multiplied by 100; the other three values are
    stored exactly as returned by ``precision_recall_fscore_support`` called
    with ``average='weighted'``.
  """
  accuracy_fraction = accuracy_score(y_true, y_pred)
  # Four values come back; the last one is not needed here
  weighted_precision, weighted_recall, weighted_fscore, _unused = precision_recall_fscore_support(
      y_true, y_pred, average='weighted'
  )
  evaluation = dict([
      ("Accuracy", accuracy_fraction * 100),
      ("Precision", weighted_precision),
      ("Recall", weighted_recall),
      ("F-score", weighted_fscore),
  ])
  return pd.DataFrame.from_dict(evaluation, orient='index')

In [18]:
# Score the baseline's validation predictions with evaluation_result
model_0_evaluation = evaluation_result(y_true=val_labels, y_pred=preds)
model_0_evaluation

Unnamed: 0,0
Accuracy,79.265092
Precision,0.811139
Recall,0.792651
F-score,0.786219
