In [27]:
# general
import pandas as pd
import numpy as np
import re
import joblib
import pickle

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline


# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# sklearn
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB,BernoulliNB, GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Fetch the NLTK stopword corpus used by the text-cleaning cell below.
nltk.download(info_or_id='stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kushidhar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Column names are supplied explicitly through `names=`.
DATASET_COLUMNS = ["label", "ID", "date", "flag", "user", "text"]
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding ="ISO-8859-1",names=DATASET_COLUMNS)
# Shuffle with a fixed seed: the later cells take positional slices of this
# frame for train/test/validation, so an unseeded shuffle would change every
# split (and every reported accuracy) on each run.
df = shuffle(df, random_state=42)
df.head()

Unnamed: 0,label,ID,date,flag,user,text
26174,0,1558639524,Sun Apr 19 08:58:50 PDT 2009,NO_QUERY,Gracebryant,Recovering after an 11 day Easter break in the...
589466,0,2217002338,Wed Jun 17 20:15:27 PDT 2009,NO_QUERY,Clarinetluva321,wahhh the north american house hippo isn't rea...
1526826,4,2177096755,Mon Jun 15 05:26:32 PDT 2009,NO_QUERY,king4ever90,"Campeonato de BattleBall, saiba mais em: tkblo..."
293096,0,1996212998,Mon Jun 01 14:42:51 PDT 2009,NO_QUERY,ashleystover,@Jolenedann i miss you too. how have you been?
123989,0,1834021075,Mon May 18 01:19:29 PDT 2009,NO_QUERY,mlhannah,just got home. i am dead tired


In [13]:
# Take the first 40000 rows of the shuffled frame as the working sample.
X = list(df[:40000]["text"].values)
y = list(df[:40000]["label"].values)
# Class balance is printed with the original label values (before remapping).
print(np.unique(y,return_counts=True))
# Remap label 4 to 1 so the target is binary 0/1; other values are kept as-is.
y = [1 if label == 4 else label for label in y]

(array([0, 4]), array([19901, 20099]))


In [14]:
# TEXT CLEANING
# Raw string so the `\S` escapes reach the regex engine verbatim instead of
# being treated as (invalid) Python string escapes. The pattern text itself is
# unchanged. Alternatives: @mentions, http/https links, and any run of
# characters that are not ASCII letters or digits.
# NOTE(review): the third alternative `http?:\S` looks like a typo/duplicate of
# the link pattern before it; left untouched to keep cleaning results identical.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")
# Set form of the stopword list for fast membership tests in preprocess().
sws = set(stop_words)
# Negation words are kept in the text (i.e. removed from the stopword set).
negative_stopwords = ["against","aren", "aren't","couldn't","didn't","doesn",
    "doesn't","don","don't","hadn","hadn't","haven","haven't",
    "isn","isn't", "mightn","mightn't","mustn","mustn't","needn",
    "needn't","no","nor","not", "shan","shan't", "shouldn","shouldn't",
    "wasn","wasn't","weren","weren't","wouldn","wouldn't"]
# difference_update ignores words that are absent from the set, so this cell
# does not raise KeyError if the installed stopword list lacks one of them.
sws.difference_update(negative_stopwords)

In [15]:
def preprocess(text, stem=False):
    """Clean one piece of text and return it as a space-joined token string.

    The input is converted with ``str()``, lower-cased, and every match of
    ``TEXT_CLEANING_RE`` is replaced by a single space. The result is split on
    whitespace and tokens found in ``sws`` are dropped.

    Parameters
    ----------
    text : object
        Text to clean; non-string values are passed through ``str()`` first.
    stem : bool, default False
        When true, each kept token is passed through ``stemmer.stem``.

    Returns
    -------
    str
        The kept tokens joined by single spaces (empty string if none remain).
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in sws)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [16]:
# Run every sampled text through preprocess() (no stemming), keeping order.
X_pre = [preprocess(line) for line in X]

In [17]:
cv = CountVectorizer()
# Keep the document-term matrix in the sparse form returned by fit_transform:
# a dense copy of (number of texts x vocabulary size) counts is almost entirely
# zeros and can exhaust memory. Row slicing and MultinomialNB fit/predict both
# accept the sparse matrix directly.
# NOTE(review): the vocabulary is learned from all of X_pre, including the rows
# later used as test/validation data — consider fitting on training rows only.
x_vec = cv.fit_transform(X_pre)

In [36]:
# Positional split of the vectorized sample:
#   rows [0, 25000)      -> train
#   rows [25000, 30000)  -> test
#   rows [30000, 35000)  -> validation
# Rows from 35000 onward are not assigned to any split here.
X_train, y_train = x_vec[:25000], y[:25000]
X_test, y_test = x_vec[25000:30000], y[25000:30000]
X_valid, y_valid = x_vec[30000:35000], y[30000:35000]

In [19]:
# Fit a multinomial Naive Bayes classifier on the training split
# (fit returns the estimator itself, so it can be bound in one step).
mb = MultinomialNB().fit(X_train, y_train)
mb

MultinomialNB()

In [34]:
# Predicted labels for the training rows and for the test rows;
# the accuracies themselves are computed where they are reported.
y_pred_train, y_pred = mb.predict(X_train), mb.predict(X_test)

In [33]:
# Report: test-set confusion matrix, then train and test accuracy in percent.
rule = "-----------------------------------------------------------------------"
print(rule)
print("Confusion matrix : \n", confusion_matrix(y_test, y_pred))
print(rule)
# '\033[1m' is the ANSI "bold on" escape; it stays active until reset.
print('\033[1m'+"Train Accuracy : {:.2f}%".format(accuracy_score(y_train,y_pred_train)*100))
print("Test Accuracy  : {:.2f}%".format(accuracy_score(y_test,y_pred)*100))
# '\033[0m' resets the ANSI attributes so the bold style does not leak into
# whatever is written to the same stream after this report.
print(rule + '\033[0m')

-----------------------------------------------------------------------
Confusion matrix : 
 [[1908  543]
 [ 786 1763]]
-----------------------------------------------------------------------
[1mTrain Accuracy : 87.08%
Test Accuracy  : 73.42%
-----------------------------------------------------------------------


In [41]:
# Accuracy on the validation split, as a fraction in [0, 1].
# Uses accuracy_score like the train/test report; the manual
# sum(a == b)/len(a) form only works when one operand is a NumPy array.
y_valid_pred = mb.predict(X_valid)
accuracy_score(y_valid, y_valid_pred)

0.7386

In [49]:
# Smoke test on two hand-written sentences.
# preprocess() calls str() on its argument, so it must receive the sentence
# itself: passing a one-element list would clean the list's repr (brackets,
# quotes, escape sequences) rather than the text.
raw_text1 = "It's a wonderful day"
raw_text2 = "It is a bad day"
text1 = preprocess(raw_text1)
text2 = preprocess(raw_text2)
text = [text1,text2]
# Reuse the already-fitted vectorizer (transform, not fit_transform).
text_vec = cv.transform(text).toarray()
mb.predict(text_vec)

# 1 : positive
# 0 : negative

array([1, 0])

In [50]:
# Persist the fitted classifier to 'model.pkl' in the working directory.
joblib.dump(value=mb, filename='model.pkl')

['model.pkl']

In [51]:
# Persist the fitted vectorizer next to the model. The context manager closes
# the file handle (and flushes the data) even if pickling fails.
with open("vector.pickel", "wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)