In [1]:
import pandas as pd
import numpy as np

In [2]:
# load the dataset
df = pd.read_csv('data/Sentiment-Analysis-Dataset.zip',compression='zip',error_bad_lines = False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [3]:
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [4]:
#define our preprocessor to convert characters to their meanings, like '&lt' to '<'
from html import unescape
def preprocessor(doc):
    return unescape(doc).lower()

In [5]:
#lets load the english natural lang processor and disable some functions to make it faster
import spacy
from spacy.lang.en import STOP_WORDS
nlp = spacy.load('en_core_web_sm',disable=['rer','parser','tagger'])

In [6]:
#define a lemmatizer function
def lemmatizer(doc):
    return [word.lemma_ for word in nlp(doc)]

In [7]:
#lets create our stop words lemma
STOP_WORDS_lemma = [word.lemma_ for word in nlp(" ".join(list(STOP_WORDS)))]
#Add ',','.'and ';' to stop words
STOP_WORDS_lemma = set(STOP_WORDS_lemma).union(['.',';',','])

In [8]:
#lets build our model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer

In [9]:
# vectorizer = TfidfVectorizer(preprocessor=preprocessor,
#                             tokenizer=lemmatizer,
#                             ngram_range=(1,2),
#                             stop_words=STOP_WORDS_lemma)
vectorizer = HashingVectorizer(preprocessor = preprocessor,
#                             tokenizer=lemmatizer,
                               alternate_sign = False,
#                             ngram_range=(1,2),
                            stop_words=STOP_WORDS)
clf = MultinomialNB()
model = Pipeline([('vectorizer',vectorizer),
                 ('classifier',clf)])

In [10]:
#lets split our data into train and test
X = df['SentimentText']
y = df['Sentiment']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

In [11]:
#lets train our model
model.fit(X_train,y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(steps=[('vectorizer',
                 HashingVectorizer(alternate_sign=False,
                                   preprocessor=<function preprocessor at 0x7efd53f1e0e0>,
                                   stop_words={"'d", "'ll", "'m", "'re", "'s",
                                               "'ve", 'a', 'about', 'above',
                                               'across', 'after', 'afterwards',
                                               'again', 'against', 'all',
                                               'almost', 'alone', 'along',
                                               'already', 'also', 'although',
                                               'always', 'am', 'among',
                                               'amongst', 'amount', 'an', 'and',
                                               'another', 'any', ...})),
                ('classifier', MultinomialNB())])

In [12]:
#Check model accuracy on training data
model.score(X_train,y_train)

0.8073322358497065

In [13]:
#check model accuracy on test data
model.score(X_test,y_test)

0.7699090658583632

In [14]:
import gzip
import dill

with gzip.open('SentimentModel.dill.gz','wb') as f:
    dill.dump(model,f,recurse=True)#recurse = True to make sure all the parameters are saved

In [15]:
import gzip
import dill

with gzip.open('SentimentModel.dill.gz','rb') as f:
    sentiment_model = dill.load(f)

In [16]:
sentiment_model.score(X_test,y_test)

  'stop_words.' % sorted(inconsistent))


0.7699090658583632

In [3]:
!ls -ahl

total 141M
drwxrwxrwx 1 root root 4.0K Dec 12 11:34  .
drwxrwxrwx 1 root root 4.0K Dec  1 11:54  ..
drwxrwxrwx 1 root root    0 Dec  2 05:22  data
drwxrwxrwx 1 root root 4.0K Dec 12 11:54  .idea
drwxrwxrwx 1 root root    0 Dec  1 11:55  .ipynb_checkpoints
drwxrwxrwx 1 root root    0 Dec 12 11:34  __pycache__
-rwxrwxrwx 1 root root 9.7K Dec 11 21:12 'Sentiment Analysis web project.ipynb'
-rwxrwxrwx 1 root root 141M Dec 11 20:29  SentimentModel.dill.gz
-rwxrwxrwx 1 root root  597 Dec 12 11:28  SentimentWebApp.py
