In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

In [2]:
# load the train and test splits from the data folder in one pass
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [3]:
# peek at the first five rows of the training split
train.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# peek at the first five rows of the test split
test.head(n=5)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [5]:
# Creating the classification model using the Naive Bayes algorithm
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [7]:
# the raw tweet column is called 'text' from here on, in both splits
column_renames = {'tweet': 'text'}
train = train.rename(columns=column_renames)
test = test.rename(columns=column_renames)

In [9]:
# fetch the NLTK stop-word corpus that stopwords.words('english') reads during preprocessing
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omkaringale/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# fetch the 'omw-1.4' NLTK data package
# NOTE(review): presumably needed by the WordNet-backed lemmatization step — confirm
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/omkaringale/nltk_data...


True

In [15]:
# preparing the data for the model
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import Word


def _drop_words(texts, words):
    """Return `texts` with every whitespace-separated token found in `words` removed."""
    banned = set(words)  # set lookup per token instead of scanning a list
    return texts.apply(lambda doc: " ".join(tok for tok in doc.split() if tok not in banned))


def _map_tokens(texts, fn):
    """Return `texts` with `fn` applied to each whitespace-separated token."""
    return texts.apply(lambda doc: " ".join(fn(tok) for tok in doc.split()))


for frame in (train, test):
    # length of the raw tweet, recorded before any cleaning
    frame['length'] = frame['text'].str.len()
    # strip punctuation, then lower-case.
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave all punctuation in place. The raw
    # string avoids the invalid '\w' / '\s' escape sequences.
    frame['text'] = (
        frame['text']
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.lower()
    )

# removing the stop words
stop = stopwords.words('english')
train['text'] = _drop_words(train['text'], stop)
test['text'] = _drop_words(test['text'], stop)

# removing the most common words (vocabulary statistics come from train only,
# and the same word list is then applied to test)
freq = list(pd.Series(' '.join(train['text']).split()).value_counts().head(10).index)
train['text'] = _drop_words(train['text'], freq)
test['text'] = _drop_words(test['text'], freq)

# removing the least common words
# NOTE: this takes the last 10 entries of value_counts(); when many words share
# the lowest count, which 10 are picked depends on value_counts' tie ordering.
freq = list(pd.Series(' '.join(train['text']).split()).value_counts().tail(10).index)
train['text'] = _drop_words(train['text'], freq)
test['text'] = _drop_words(test['text'], freq)

# lemmatization
train['text'] = _map_tokens(train['text'], lambda tok: Word(tok).lemmatize())
test['text'] = _map_tokens(test['text'], lambda tok: Word(tok).lemmatize())

# stemming (applied on top of the lemmatized tokens)
st = PorterStemmer()
train['text'] = _map_tokens(train['text'], st.stem)
test['text'] = _map_tokens(test['text'], st.stem)

# creating the bag of words model: raw counts re-weighted by TF-IDF
cv = CountVectorizer()
X = cv.fit_transform(train['text'])
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)

# creating the model
model = MultinomialNB()
model.fit(X, train['label'])

# predicting the test data; test is only transformed with the vocabulary and
# IDF weights fitted on train, never re-fitted
test_data = cv.transform(test['text'])
test_data = tfidf.transform(test_data)
predictions = model.predict(test_data)

# creating the submission file
submission = pd.DataFrame({'id': test['id'], 'target': predictions})
submission.to_csv('submission.csv', index=False)

  train['text'] = train['text'].str.replace('[^\w\s]','')
  test['text'] = test['text'].str.replace('[^\w\s]','')


In [16]:
# attach the predicted labels to the test tweets and preview the result
test = test.assign(target=predictions)
test.head()

Unnamed: 0,id,text,length,target
0,31963,studiolif aislif requir passion dedic willpow ...,64,0
1,31964,white supremacist everyon birdsâ movi hereâ,43,0
2,31965,safe way heal acn altwaystoh heal,33,0
3,31966,hp cur child book reserv alreadi ye harrypott ...,64,0
4,31967,3rd amaz hilari nephew eli ahmir uncl dave lov...,55,0


In [18]:
# lift the column-width cap so full tweet text is shown in rendered frames
pd.options.display.max_colwidth = None

In [19]:
# render the test frame with its predicted 'target' column (long frames are abbreviated by the display)
test

Unnamed: 0,id,text,length,target
0,31963,studiolif aislif requir passion dedic willpow find newmaterialsâ,64,0
1,31964,white supremacist everyon birdsâ movi hereâ,43,0
2,31965,safe way heal acn altwaystoh heal,33,0
3,31966,hp cur child book reserv alreadi ye harrypott pottermor favorit,64,0
4,31967,3rd amaz hilari nephew eli ahmir uncl dave love missesâ,55,0
...,...,...,...,...
17192,49155,thought factori leftright polari trump uselections2016 leadership polit brexit blm gt3,87,1
17193,49156,mermaid hairflip neverreadi formal wed gown dress mermaid,62,0
17194,49157,hillari campaign ohioomg use word assetsampli never clinton say theeword radic,78,0
17195,49158,confer right mindset lead cultureofdevelop organ mindset,56,0


In [20]:
# Export the cleaned test tweets together with their predicted labels.
# Written to its own file: the modelling cell already saves the two-column
# (id, target) submission as 'submission.csv', and reusing that name here
# would silently overwrite it with a differently shaped table.
test.to_csv('test_predictions.csv', index=False)

In [21]:
# export the trained Naive Bayes model
import pickle

# The context manager guarantees the file handle is flushed and closed,
# even if pickling fails part-way through.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): only the classifier is saved. New text must first pass through
# the fitted CountVectorizer (`cv`) and TfidfTransformer (`tfidf`) before
# `model.predict`, so consider persisting those alongside the model.