In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
import nltk
# Load the training and test sets from CSV files in the working directory.
train_data=pd.read_csv("train.csv")
test_data=pd.read_csv("test.csv")
# Fetch the NLTK resources this notebook relies on later: the 'punkt'
# tokenizer models (nltk.word_tokenize), the 'stopwords' corpus and the
# 'wordnet' corpus (WordNetLemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# Show (rows, columns) for both splits; the test set has one column fewer
# than the training set.
train_data.shape,test_data.shape

((7613, 5), (3263, 4))

In [3]:
# Independent variable: the raw tweet text column of each split.
train_text = train_data["text"]
test_text = test_data["text"]

In [4]:
# Dependent variable: the label column the classifier is trained to predict.
y = train_data["target"]

In [5]:
import re

In [6]:
#function for cleaning the text remove hashtags,punctuation and other unwanted signs
def clean_text(text):
    """Normalize a piece of text to lowercase letters and spaces.

    The text is lowercased, every whitespace character is turned into a
    plain space, and every character that is not ``a``-``z`` or a space
    (hashtag markers, digits, punctuation, non-ASCII symbols) is removed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Runs of spaces and leading/trailing spaces are
        kept as they are.
    """
    text=text.lower()
    # Map newlines, tabs and other whitespace to a plain space first.
    # Otherwise the filter below deletes them and glues the neighbouring
    # words together ("fire\nnearby" would become "firenearby").
    text=re.sub(r'\s',' ',text)
    # Keep only lowercase letters and spaces. This also strips '#', so a
    # hashtag keeps its word but loses the marker.
    text=re.sub('[^a-z ]','',text)
    return text

In [7]:
# Preview the first raw tweets before any cleaning is applied.
train_text.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [8]:
# Clean every tweet with clean_text. Both names are rebound, so the raw
# text is only available through train_data / test_data afterwards.
train_text=train_text.apply(clean_text)
test_text=test_text.apply(clean_text)

In [9]:
# Preview the same tweets after cleaning.
train_text.head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [10]:
# Tools for the next preprocessing step: lemmatization and stop-word removal.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# One shared lemmatizer instance, reused by the preprocessing cells below
# and by predict_result.
lemmatizer=WordNetLemmatizer()

In [11]:
# Build the English stop-word set once. The previous version rebuilt it for
# every single token, which re-read the stop-word list on each membership
# test and dominated the running time of this cell.
stop_words=set(stopwords.words('english'))

# For each cleaned training tweet: tokenize, drop stop words, lemmatize the
# remaining tokens and join them back into one space-separated string.
train_sequence=[
    ' '.join(
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(tweet)
        if word not in stop_words
    )
    for tweet in train_text
]

In [12]:
# Sanity check: one preprocessed string per training row.
len(train_sequence)

7613

In [13]:
# Inspect the first preprocessed training tweet.
train_sequence[0]

'deed reason earthquake may allah forgive u'

In [14]:
# Build the English stop-word set once. The previous version rebuilt it for
# every single token, which re-read the stop-word list on each membership
# test and dominated the running time of this cell.
stop_words=set(stopwords.words('english'))

# Same preprocessing as for the training tweets: tokenize, drop stop words,
# lemmatize and re-join into one space-separated string per test tweet.
test_sequence=[
    ' '.join(
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(tweet)
        if word not in stop_words
    )
    for tweet in test_text
]

In [15]:
# Sanity check: one preprocessed string per test row.
len(test_sequence)

3263

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF vectorizer that turns each preprocessed tweet into a numeric
# feature vector:
#   min_df=2           - ignore terms that occur in fewer than 2 documents
#   ngram_range=(1,3)  - use unigrams, bigrams and trigrams
#   max_features=10000 - cap the vocabulary at 10000 terms
tfidf=TfidfVectorizer(min_df=2,ngram_range=(1,3),max_features=10000)

In [18]:
# Learn the vocabulary/IDF weights and vectorize the training tweets.
# NOTE(review): the vectorizer is fitted on all training rows, including the
# rows that train_test_split later holds out for validation, so the
# validation score may be slightly optimistic.
vectorized_train=tfidf.fit_transform(train_sequence)

In [19]:
# (number of training tweets, number of TF-IDF features)
vectorized_train.shape

(7613, 10000)

In [20]:
# Vectorize the test tweets with the vocabulary already fitted on the
# training tweets (transform only, no re-fitting).
vectorized_test=tfidf.transform(test_sequence)

In [21]:
# (number of test tweets, number of TF-IDF features)
vectorized_test.shape

(3263, 10000)

In [22]:
# Convert the vectorizer outputs into dense arrays.
# NOTE(review): this rebinds the same names, so re-running this cell alone
# fails (the arrays have no .toarray()). predict_result further down passes
# the un-densified output of tfidf.transform straight to the classifier, so
# densifying may not be required - confirm before removing.
vectorized_train=vectorized_train.toarray()
vectorized_test=vectorized_test.toarray()

In [23]:
# Inspect the feature vector of the first training tweet.
vectorized_train[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 20% of the labelled data for validation. random_state=0 fixes the
# shuffle so the split is reproducible. X_test / y_test here are the
# validation split, not the unlabeled test set.
X_train, X_test, y_train, y_test = train_test_split(vectorized_train,y,test_size=0.2,random_state=0)

In [27]:
# Logistic Regression classifier. C is the inverse regularization strength,
# so C=2 regularizes less than the default of 1.0.
classifier=LogisticRegression(C=2)

In [28]:
# Fit the Logistic Regression model on the training split only.
classifier.fit(X_train,y_train)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
# Mean accuracy of the fitted model on the held-out validation split.
classifier.score(X_test,y_test)

0.8076165462902167

In [30]:
# Predict labels for the unlabeled test tweets.
y_pred=classifier.predict(vectorized_test)

In [31]:
# Row identifiers of the test tweets, used for the submission file.
# NOTE(review): the name `id` shadows Python's built-in id() for the rest of
# the session; rename together with the cell that builds output_df.
id=test_data.id

In [32]:
# Pair each test id with its predicted label.
output_df=pd.DataFrame({'id':id,"target":y_pred})

In [33]:
# Write the predictions to submission.csv without the DataFrame index column.
output_df.to_csv('submission.csv',index=False)

In [34]:
# Look at the first raw test row; its text is reused below as a smoke-test
# input for predict_result.
test_data.iloc[0]

id                                           0
keyword                                    NaN
location                                   NaN
text        Just happened a terrible car crash
Name: 0, dtype: object

In [39]:
#testing function
def predict_result(text):
    """Predict the target label for a single raw text string.

    The text goes through the same pipeline as the training data: cleaning
    with ``clean_text``, tokenization, stop-word removal and lemmatization.
    The result is vectorized with the fitted ``tfidf`` object and classified
    with the trained ``classifier``.

    Parameters
    ----------
    text : str
        Raw, uncleaned text (for example a tweet).

    Returns
    -------
    The predicted label for ``text``, i.e. the first (and only) element of
    the array returned by ``classifier.predict``.
    """
    # Reuse clean_text instead of repeating its regexes here, so the
    # inference-time cleaning can never drift from the training-time cleaning.
    text=clean_text(text)
    # Build the stop-word set once per call rather than once per token.
    stop_words=set(stopwords.words('english'))
    words=[
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]
    sentence=' '.join(words)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features=tfidf.transform([sentence])
    return classifier.predict(features)[0]

In [40]:
# Smoke test: classify the text of the first test row.
predict_result("Just happened a terrible car crash")

1

In [41]:
import pickle

In [42]:
#save classifier to load in our flask server
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; a bare open() inside the call leaves the handle open.
# NOTE(review): the file name looks like a typo of "disClassifier"; it is kept
# as-is because the loading side presumably opens this exact name - confirm.
with open('disClasdifier.pkl','wb') as model_file:
    pickle.dump(classifier,model_file)

In [43]:
#save tfidf object to load in our flask server
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; a bare open() inside the call leaves the handle open.
with open('tfidf.pkl','wb') as tfidf_file:
    pickle.dump(tfidf,tfidf_file)