In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to E:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to E:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to E:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#peek at the sample submission file (columns shown below: id, target)
#NOTE(review): hard-coded absolute Windows path - this cell only runs on a machine with this exact folder layout;
#`tweets` is not referenced again in the visible cells, so the cell looks like leftover exploration
tweets = pd.read_csv('D:\\tweet_classification-master\\Real or Not.NLP with Disaster Tweets\\Data\\sample_submission.csv', encoding = 'latin-1')
tweets.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [5]:
#load data: train.csv (has a `target` column) and test.csv (no `target`), both read from the working directory
#NOTE(review): latin-1 decoding never raises, but it mis-decodes UTF-8 multi-byte characters - confirm the files' real encoding
train_data=pd.read_csv("train.csv",encoding='latin-1')
test_data=pd.read_csv("test.csv",encoding='latin-1')


In [6]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
#Check shape of data
train_data.shape,test_data.shape

((7613, 5), (3263, 4))

In [8]:
#independent variable
train_text=train_data.text
test_text=test_data.text

In [9]:
#dependent variable 
y=train_data.target

In [10]:
import re

In [11]:
#function for cleaning the text: lower-cases it and keeps only letters and spaces
def clean_text(text):
    """Normalise a raw tweet to lower-case ASCII letters separated by spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        ``text`` lower-cased, with tabs and line breaks turned into spaces and
        every character other than ``a-z`` and the space removed (hashtag
        signs, digits, punctuation, non-ASCII characters). Runs of spaces are
        left as they are.
    """
    text=text.lower()
    #tweets can span several lines: map ASCII whitespace to a space *before*
    #filtering, otherwise the filter deletes the tab/newline and glues the
    #neighbouring words into a single token ("fire\nsmoke" -> "firesmoke")
    text=re.sub(r'[\t\n\r\f\v]',' ',text)
    #'#' is not a letter, so this one filter also strips hashtag signs;
    #no upper-case letter can remain after lower(), so a-z is sufficient
    text=re.sub('[^a-z ]','',text)
    return text

In [12]:
train_text.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [13]:
#clean text
train_text=train_text.apply(clean_text)
test_text=test_text.apply(clean_text)

In [17]:
train_text.head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [18]:
#lemmatize and remove stopwords from text
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer=WordNetLemmatizer()

In [19]:
#build the stop-word set once up front: it does not change between tokens,
#so rebuilding it inside the comprehension (as before) was pure repeated work
stop_words=set(stopwords.words('english'))
train_sequence=[]
for tweet in train_text:
    #tokenise, drop stop words, then reduce each remaining word to its lemma
    tokens=nltk.word_tokenize(tweet)
    lemmas=[lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    #keep one space-joined string per tweet, in the same order as train_text
    train_sequence.append(' '.join(lemmas))

In [20]:
len(train_sequence)

7613

In [21]:
train_sequence[0]

'deed reason earthquake may allah forgive u'

In [22]:
#build the stop-word set once up front: it does not change between tokens,
#so rebuilding it inside the comprehension (as before) was pure repeated work
stop_words=set(stopwords.words('english'))
test_sequence=[]
for tweet in test_text:
    #same pipeline as for the training tweets: tokenise, drop stop words, lemmatise
    tokens=nltk.word_tokenize(tweet)
    lemmas=[lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    #keep one space-joined string per tweet, in the same order as test_text
    test_sequence.append(' '.join(lemmas))

In [24]:
len(test_sequence)

3263

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
#TF-IDF vectoriser that turns each cleaned tweet into a weighted n-gram feature vector:
#uses 1- to 3-word n-grams, ignores terms seen in fewer than 2 documents (min_df=2)
#and caps the vocabulary at 10000 features (max_features)
tfidf=TfidfVectorizer(min_df=2,ngram_range=(1,3),max_features=10000)

In [27]:
vectorized_train=tfidf.fit_transform(train_sequence)

In [28]:
vectorized_train.shape

(7613, 10000)

In [29]:
vectorized_test=tfidf.transform(test_sequence)

In [30]:
vectorized_test.shape

(3263, 10000)

In [31]:
#convert the vectoriser output into dense numpy arrays
#NOTE(review): both names are overwritten in place, so re-running this cell on its own fails
#(a numpy array has no .toarray()); it also materialises full (7613, 10000) and (3263, 10000) arrays
#NOTE(review): scikit-learn estimators generally accept sparse input directly - TODO confirm whether densifying is needed at all
vectorized_train=vectorized_train.toarray()
vectorized_test=vectorized_test.toarray()

In [32]:
vectorized_train[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
#split out train and validation data
#NOTE: X_test/y_test are a 20% validation hold-out taken from the labelled training set (test_size=0.2),
#not the competition test set, which lives in vectorized_test
#NOTE(review): tfidf was fitted on all of train_sequence before this split, so the validation rows
#already influenced the vocabulary and IDF weights - the validation score may be slightly optimistic
X_train, X_test, y_train, y_test = train_test_split(vectorized_train,y,test_size=0.2,random_state=0)

In [36]:
#Logistic Regression for classification
classifier=LogisticRegression(C=2)

In [37]:
#fit the training data to our Logistic Regression model
classifier.fit(X_train,y_train)

LogisticRegression(C=2)

In [38]:
#evaluate to get validation score
classifier.score(X_test,y_test)

0.8010505581089954

In [39]:
y_pred=classifier.predict(vectorized_test)

In [40]:
#ids of the test rows, paired with the predictions when the submission frame is built
#NOTE(review): this name shadows Python's built-in id() for the rest of the session
id=test_data.id

In [41]:
output_df=pd.DataFrame({'id':id,"target":y_pred})

In [42]:
output_df.to_csv('submission.csv',index=False)

In [43]:
test_data.iloc[0]

id                                           0
keyword                                    NaN
location                                   NaN
text        Just happened a terrible car crash
Name: 0, dtype: object

In [44]:
#testing function: spot-check the trained model on a single raw tweet
def predict_result(text):
    """Classify one raw tweet with the fitted TF-IDF vectoriser and classifier.

    The tweet goes through the same steps as the training data: clean_text,
    tokenisation, stop-word removal, lemmatisation, then tfidf.transform.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    The single label predicted by ``classifier`` for this tweet
    (presumably 1 = real disaster, 0 = not - verify against the dataset description).
    """
    #reuse clean_text instead of repeating its regexes here, so inference can
    #never drift away from the normalisation applied to the training data
    cleaned=clean_text(text)
    #build the stop-word set once per call rather than once per token
    stop_words=set(stopwords.words('english'))
    tokens=nltk.word_tokenize(cleaned)
    lemmas=[lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    sentence=' '.join(lemmas)
    #transform() takes an iterable of documents, hence the one-element list
    features=tfidf.transform([sentence])
    #predict() returns one label per document; there is exactly one here
    return classifier.predict(features)[0]

In [45]:
predict_result("Just happened a terrible car crash")

1

In [46]:
import pickle

In [47]:
#save classifier to load in our flask server
#the with-block closes the file as soon as the dump finishes; the previous bare open()
#left the handle open, so the data was only guaranteed to be flushed when the kernel exited
with open('disClassifier.pkl','wb') as model_file:
    pickle.dump(classifier,model_file)

In [48]:
#save tfidf object to load in our flask server
#the with-block closes the file as soon as the dump finishes; the previous bare open()
#left the handle open, so the data was only guaranteed to be flushed when the kernel exited
with open('tfidf.pkl','wb') as vectorizer_file:
    pickle.dump(tfidf,vectorizer_file)