## Importing the libraries


In [None]:
import numpy as np
import pandas as pd
import nltk # used for preprocessing in NLP

## Importing the data

In [None]:
# Read the train and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [None]:
# Preview the first rows of the training frame to sanity-check what was loaded.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# (rows, columns) of the training frame.
train.shape

(7613, 5)

In [None]:
# Column labels available in the training frame.
train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [None]:
# Pull out the raw text column (cleaned and vectorized below) and the label
# column (used as the prediction target in the train/test splits).
sentences, target = train['text'], train['target']

## Cleaning the data


In [None]:
# Display the raw text column before any cleaning is applied.
sentences

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Single shared lemmatizer instance; the cleaning loop calls lm.lemmatize on each kept token.
lm=WordNetLemmatizer()

In [None]:
import re

# Download the NLTK data packages this cell relies on: the stop-word list
# plus the WordNet / omw-1.4 data.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Build the English stop-word set ONCE. Rebuilding it inside the comprehension
# re-read the word list and re-created the set for every single token, which
# made the loop needlessly slow without changing the result.
stop_words = set(stopwords.words('english'))
# Every character that is not an ASCII letter is replaced by a space.
non_letters = re.compile('[^a-zA-Z]')

# corpus[k] is the cleaned, space-joined version of the k-th entry of `sentences`.
corpus = []
for sentence in sentences:
    # Strip non-letters, lower-case, and split on whitespace.
    tokens = non_letters.sub(" ", sentence).lower().split()
    # Drop stop words, then lemmatize what remains.
    kept = [lm.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(kept))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Bag of words model

In [None]:
# Show only a small sample of the cleaned texts; echoing the whole list floods
# the cell output and buries the rest of the notebook.
corpus[:5]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: fit the vectorizer on the cleaned corpus, transform it, and
# expand the result into a dense array.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so held-out rows also contribute to the vocabulary.
cv = CountVectorizer()
bow_matrix = cv.fit_transform(corpus)
x = bow_matrix.toarray()

In [None]:
# Confirm the container type of the feature matrix after .toarray().
type(x)

numpy.ndarray

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the bag-of-words rows for evaluation; the fixed seed keeps
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    target,
    test_size=0.2,
    random_state=42,
)

## Random forest classifier on bag of words

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the bag-of-words training split.
# random_state is pinned (same seed as the train/test split) so the fitted
# forest, and every metric computed from it, is identical on each run.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(x_train, y_train)

RandomForestClassifier()

In [None]:
# Predict labels for the held-out bag-of-words rows with the current `classifier`.
y_pred=classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score

# Score the bag-of-words predictions against the held-out labels.
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pre = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the confusion matrix first, then each scalar metric on its own line.
print(cm)
for metric_name, metric_value in (
    ("accuracy", ac),
    ("recall", rc),
    ("precision", pre),
    ("f1 score", f1),
):
    print(metric_name, metric_value)

[[801  73]
 [240 409]]
accuracy 0.7944845699277742
recall 0.6302003081664098
precision 0.8485477178423236
f1 score 0.7232537577365163


## TF-IDF model

In [None]:
# NOTE(review): `ngrams` is not referenced in the cells visible here; the
# import is kept in case a later cell depends on it.
from nltk.translate.nist_score import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: fit the vectorizer on the cleaned corpus, transform it, and expand
# the result into a dense array.
# NOTE(review): as with the bag-of-words features, fitting happens on the full
# corpus before the train/test split.
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(corpus)
z = tfidf_matrix.toarray()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the TF-IDF rows for evaluation, with a fixed seed.
# NOTE(review): the bag-of-words split uses test_size=0.2, so the two feature
# sets are evaluated on different held-out subsets — confirm this is intended
# before comparing their scores.
z_train, z_test, w_train, w_test = train_test_split(
    z,
    target,
    test_size=0.3,
    random_state=42,
)

## Random Forest model on TF-IDF

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 65-tree random forest on the TF-IDF training split. This rebinds
# `classifier`, replacing whatever model the name held before.
# random_state is pinned (same seed as the train/test split) so the fitted
# forest, and every metric computed from it, is identical on each run.
# NOTE(review): the choice of n_estimators=65 is not explained in the notebook.
classifier = RandomForestClassifier(n_estimators=65, random_state=42)
classifier.fit(z_train, w_train)

RandomForestClassifier(n_estimators=65)

In [None]:

# Predict labels for the held-out TF-IDF rows with the current `classifier`.
z_pred=classifier.predict(z_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Score the TF-IDF predictions against the held-out labels, then report the
# confusion matrix followed by the accuracy.
cm = confusion_matrix(w_test, z_pred)
ac = accuracy_score(w_test, z_pred)
print(cm)
print("accuracy", ac)

[[1215  103]
 [ 395  571]]
accuracy 0.7819614711033275


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier on the TF-IDF training split. This rebinds
# `classifier`, replacing whatever model the name held before.
# random_state is pinned (same seed as the train/test split) so the fitted
# model, and every metric computed from it, is identical on each run.
classifier = GradientBoostingClassifier(random_state=42)
classifier.fit(z_train, w_train)



GradientBoostingClassifier()

In [None]:
# Predict labels for the held-out TF-IDF rows with the current `classifier`;
# this overwrites any earlier value of z_pred.
z_pred=classifier.predict(z_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Compare the current z_pred with the held-out TF-IDF labels.
cm = confusion_matrix(w_test, z_pred)
ac = accuracy_score(w_test, z_pred)

# Confusion matrix first, accuracy second.
print(cm)
print("accuracy", ac)

[[1223   95]
 [ 460  506]]
accuracy 0.7570052539404554
