In [44]:
from nltk.corpus import twitter_samples
from sklearn.model_selection import train_test_split
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from joblib import dump, load

In [34]:
## list the file ids (data sets) that ship with the nltk twitter_samples corpus

twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [35]:
## load the raw tweet texts for the negative and the positive sample files
negTweetsSet, posTweetsSet = (
    twitter_samples.strings(fileName)
    for fileName in ('negative_tweets.json', 'positive_tweets.json')
)

In [36]:
## generate a training (75%) and development (25%) set for tweets
## both lists go through one train_test_split call, so the same shuffled
## indices are applied to each; this requires the two lists to have the same
## length
## the fixed seed keeps the split, and every result derived from it,
## reproducible across kernel restarts

RANDOM_SEED = 42

trainPos, devPos, trainNeg, devNeg = train_test_split(
    posTweetsSet, negTweetsSet, train_size=.75, random_state=RANDOM_SEED
)

In [37]:
## build one text list per partition (positive tweets first, then negative)
## and the matching label arrays: 1 = positive, 0 = negative

trainXStrings = trainPos + trainNeg
devXStrings = devPos + devNeg

trainY = np.array([1] * len(trainPos) + [0] * len(trainNeg))
devY = np.array([1] * len(devPos) + [0] * len(devNeg))

In [38]:
## tokenizer splits strings into word tokens (twitter handles are stripped)
## countvectorizer generates bag of words representation for tweets
## token_pattern=None: a custom tokenizer is supplied, so the default regex
## pattern would never be used; disabling it avoids the unused-parameter
## warning scikit-learn emits on fit
## NOTE(review): with lowercase=False the built-in 'english' stop word list
## presumably only removes lowercase tokens (e.g. "The" would be kept) --
## confirm this is intended
## NOTE(review): TweetTokenizer keeps emoticons as tokens; check whether they
## leak the class label in this corpus before trusting the dev score

tokenizer = TweetTokenizer(strip_handles=True)
cv = CountVectorizer(
    lowercase=False,
    stop_words='english',
    tokenizer=tokenizer.tokenize,
    token_pattern=None,
)

## the vocabulary is learned from the training tweets only; the development
## tweets are encoded with that same vocabulary
trainX = cv.fit_transform(trainXStrings)
devX = cv.transform(devXStrings)


In [39]:
## create a multinomial naive bayes classifier and train it on the
## bag of words training matrix; fit returns the fitted estimator itself

sentModel = MultinomialNB().fit(trainX, trainY)

sentModel

MultinomialNB()

In [40]:
## mean accuracy of the fitted model on the held-out development tweets

sentModel.score(X=devX, y=devY)

0.9968

In [45]:
## persist the fitted classifier and its vectorizer so the data intake
## pipeline can load and reuse them together
## NOTE(review): the vectorizer holds a reference to the nltk tokenizer, so
## loading it presumably requires nltk to be installed -- confirm

dump(value=sentModel, filename='sentimentAnalysisModel.joblib')
dump(value=cv, filename='countVectorizer.joblib')

['countVectorizer.joblib']