In [0]:
import os
from time import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

# for classification
from sklearn.ensemble import RandomForestClassifier

# for cross validation
from sklearn.model_selection import StratifiedKFold
# for searching for best params
from sklearn.model_selection import GridSearchCV

# for FastText vectorization
import gensim

import warnings
warnings.simplefilter(action='ignore')

import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# config
DATA_DIR = '/content/gdrive/My Drive/DA_Project'
TRAIN_DATA_FILE = 'train.csv'
TEST_DATA_FILE = 'test.csv'
SUBMISSION_FILE = 'submission.csv'

RANDOM_STATE = 0

In [0]:
train_data = pd.read_csv(os.path.join(DATA_DIR, TRAIN_DATA_FILE)).fillna('')
test_data = pd.read_csv(os.path.join(DATA_DIR, TEST_DATA_FILE)).fillna('')

In [0]:
print(train_data.shape)
train_data.head()

(27486, 4)


Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,neutral
1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive
2,c9e8d1ef1c,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,neutral
3,f14f087215,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,negative
4,bf7473b12d,haha better drunken tweeting you mean?,better,positive


In [0]:
print(test_data.shape)
test_data.head()

(3535, 3)


Unnamed: 0,textID,text,sentiment
0,11aa4945ff,http://twitpic.com/67swx - i wish i was calli...,positive
1,fd1db57dc0,i'm done.haha. HOUSE MD marathon ulet,positive
2,2524332d66,I'm concerned for that family,positive
3,0fb19285b2,HEY GUYS IT'S WORKING NO NEED TO WORRY. i have...,positive
4,e6c9e5e3ab,26th February,neutral


In [0]:
#combining test and train data
data = train_data.append(test_data, ignore_index = True)
data

Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,neutral
1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive
2,c9e8d1ef1c,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,neutral
3,f14f087215,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,negative
4,bf7473b12d,haha better drunken tweeting you mean?,better,positive
...,...,...,...,...
31016,2f8444db6c,@_shannon1234 Shannie im so sorry! didnt mean ...,,negative
31017,11de8c0456,Im Slowing on My Tweets..Cuase I Lost My Phone,,negative
31018,08f6036add,"Invasion of the Old Ladies has just ended, sti...",,neutral
31019,27d6472b81,$#@! My nose stud fell out and I can't find it...,,negative


In [0]:
#cleaning data and converting sentiment to class labels

def getLower(x):
  #print(x)
  if pd.isna(x):
    return ''
  return x.lower()   #lowercased

data = data.astype('string')
data['sentiment'] = data['sentiment'].map({'negative': -1, 'neutral': 0, 'positive': 1})
data['text'] = data['text'].map(getLower)
data['text'] = data['text'].str.replace('[^\w\s]', '') #removing characters #Removing Punctuations, Numbers, and Special Characters
data

Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,spent the entire morning in a meeting w a vend...,my boss was not happy w/ them. Lots of fun.,0
1,251b6a6766,oh good idea about putting them on ice cream,Good,1
2,c9e8d1ef1c,says good or should i say bad afternoon httpp...,says good (or should i say bad?) afternoon!,0
3,f14f087215,i dont think you can vote anymore i tried,i dont think you can vote anymore!,-1
4,bf7473b12d,haha better drunken tweeting you mean,better,1
...,...,...,...,...
31016,2f8444db6c,_shannon1234 shannie im so sorry didnt mean to...,,-1
31017,11de8c0456,im slowing on my tweetscuase i lost my phone,,-1
31018,08f6036add,invasion of the old ladies has just ended stil...,,0
31019,27d6472b81,my nose stud fell out and i cant find it loo...,,-1


In [0]:
#Removing short words
data['text'] = data['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

In [0]:
#download punkt
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt
    Downloading package punkt to /root/nltk_data...
      Package punkt is already up-to-date!

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [0]:
data['text'] = data['text'].apply(nltk.word_tokenize)
data

Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,"[spent, the, entire, morning, meeting, vendor,...",my boss was not happy w/ them. Lots of fun.,0
1,251b6a6766,"[good, idea, about, putting, them, ice, cream]",Good,1
2,c9e8d1ef1c,"[says, good, should, say, bad, afternoon, http...",says good (or should i say bad?) afternoon!,0
3,f14f087215,"[dont, think, you, can, vote, anymore, tried]",i dont think you can vote anymore!,-1
4,bf7473b12d,"[haha, better, drunken, tweeting, you, mean]",better,1
...,...,...,...,...
31016,2f8444db6c,"[_shannon1234, shannie, sorry, didnt, mean, up...",,-1
31017,11de8c0456,"[slowing, tweetscuase, lost, phone]",,-1
31018,08f6036add,"[invasion, the, old, ladies, has, just, ended,...",,0
31019,27d6472b81,"[nose, stud, fell, out, and, cant, find, looks...",,-1


In [0]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
data['text'] = data['text'].apply(lambda x: [stemmer.stem(y) for y in x])
data

Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,"[spent, the, entir, morn, meet, vendor, and, b...",my boss was not happy w/ them. Lots of fun.,0
1,251b6a6766,"[good, idea, about, put, them, ice, cream]",Good,1
2,c9e8d1ef1c,"[say, good, should, say, bad, afternoon, httpp...",says good (or should i say bad?) afternoon!,0
3,f14f087215,"[dont, think, you, can, vote, anymor, tri]",i dont think you can vote anymore!,-1
4,bf7473b12d,"[haha, better, drunken, tweet, you, mean]",better,1
...,...,...,...,...
31016,2f8444db6c,"[_shannon1234, shanni, sorri, didnt, mean, ups...",,-1
31017,11de8c0456,"[slow, tweetscuas, lost, phone]",,-1
31018,08f6036add,"[invas, the, old, ladi, ha, just, end, still, ...",,0
31019,27d6472b81,"[nose, stud, fell, out, and, cant, find, look,...",,-1


In [0]:
from sklearn.feature_extraction.text import CountVectorizer

data['text'] = data['text'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
counts = count_vect.fit_transform(data['text'])
counts

<31021x26878 sparse matrix of type '<class 'numpy.int64'>'
	with 287594 stored elements in Compressed Sparse Row format>

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(counts, data['sentiment'], test_size=3535, shuffle=False, stratify=None)

In [0]:
#RandomForest
classifier = RandomForestClassifier(n_estimators=200)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
import numpy as np

predicted = classifier.predict(X_test)

print(np.mean(predicted == y_test))

0.7190947666195191


In [0]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, predicted))

[[ 646  316   39]
 [ 164 1079  187]
 [  34  253  817]]


In [0]:
from sklearn.metrics import f1_score

print(f1_score(y_test, predicted, average='micro'))
print(f1_score(y_test, predicted, average='macro'))
print(f1_score(y_test, predicted, average='weighted'))

0.7190947666195191
0.7208125209993322
0.7195935672515612


In [0]:
from sklearn.metrics import jaccard_score

print(jaccard_score(y_test, predicted, average='micro'))
print(jaccard_score(y_test, predicted, average='macro'))
print(jaccard_score(y_test, predicted, average='weighted'))

0.5613957597173145
0.5642793059423393
0.5627619419962075


In [0]:
#Decision Tree
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [0]:
import numpy as np

predicted = model.predict(X_test)

print(np.mean(predicted == y_test))

0.6695898161244696


In [0]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, predicted))

[[640 293  68]
 [266 944 220]
 [ 69 252 783]]


In [0]:
from sklearn.metrics import f1_score

print(f1_score(y_test, predicted, average='micro'))
print(f1_score(y_test, predicted, average='macro'))
print(f1_score(y_test, predicted, average='weighted'))

0.6695898161244696
0.6715233758626414
0.6699350906955094


In [0]:
from sklearn.metrics import jaccard_score

print(jaccard_score(y_test, predicted, average='micro'))
print(jaccard_score(y_test, predicted, average='macro'))
print(jaccard_score(y_test, predicted, average='weighted'))

0.5032957686583032
0.5065055332373228
0.5046746126031646
