In [287]:
import nltk
import numpy as np
import pandas as pd
import re

In [288]:
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples,stopwords
import string 

In [289]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/harshkulkarni/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [290]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshkulkarni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [291]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [292]:
print(len(all_positive_tweets), " ", len(all_negative_tweets))


5000   5000


In [293]:
#train test split
x_test=all_positive_tweets[:4000]+all_negative_tweets[:4000]
x_train=all_positive_tweets[4000:]+all_negative_tweets[4000:]
y_train=np.append((np.ones((4000,1))),np.zeros((4000,1)))
y_test=np.append((np.ones((1000,1))),np.zeros((1000,1)))

In [294]:
def process_tweet(tweet):
    stemmer=PorterStemmer()
    tockeniser=TweetTokenizer(preserve_case=False,reduce_len=True,strip_handles=True)
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    stop_words_eng=stopwords.words('english')
    tockens=tockeniser.tokenize(tweet)
    tweet_clean=[]
    for tocken in tockens:
        if(tocken not in stop_words_eng and tocken not in string.punctuation):
            stemmer.stem(tocken)
            tweet_clean.append(tocken)
    return tweet_clean


In [295]:
def bulid_freq(tweets,target):
    ys=np.squeeze(target).tolist()
    freqs={}
    for tweet,y in zip(tweets,ys):
        for word in process_tweet(tweet):
            pair=(word,y)
            if(pair in freqs):
                freqs[pair]+=1
            else:
                freqs[pair]=1
    return freqs

In [296]:
test_freqs=bulid_freq(x_test,y_train)

In [297]:
def lookup(pair,freqs):
    if(pair not in freqs):
        return 0
    else:
        return freqs[pair]

In [298]:
def Train(freqs):
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    V_pos=0
    V_neg=0
    N_pos=0
    N_neg=0
    for pair in freqs.keys():
        if pair[1]==1:
            V_pos+=1
            N_pos+=freqs[pair]
        else:
            V_neg+=1
            N_neg+=freqs[pair]
    loglikelihood={}
    for word in vocab:
        x1 = lookup((word,1),freqs)
        x2=lookup((word,0),freqs)
        pos=np.log((x1+1)/(N_pos+V_pos))
        neg=np.log((x2+1)/(N_neg+V_neg))
        loglikelihood[word]=pos-neg
    logprior=np.log(len(all_positive_tweets)/len(x_train))-np.log(len(all_negative_tweets)/len(x_train))

    return loglikelihood,logprior

In [299]:
loglikelihood,logprior=Train(test_freqs)

In [300]:
def test(test_data):
    y_predict=[]
    print(len(test_data))
    for tweet in test_data:
        sum=0
        for word in process_tweet(tweet):
            if(word in loglikelihood):
                sum+=loglikelihood[word]
        sentiment=logprior+sum
        if(sentiment >0):
            y_predict.append(1)
        else:
            y_predict.append(0)
    return y_predict 
            

In [301]:
def predict_accuracy(predict_y, test_y):
    sum = 0
    for i in range(len(predict_y)):
        sum += (predict_y[i] == test_y[i])
    return (sum/len(test_y))

In [302]:
y_predict=test(x_train)
len(y_predict)

2000


2000

In [303]:
predict_accuracy(y_predict,y_test)

0.9945

In [311]:
my_tweet = ['I am happy because I am learning :)','I hate this']
test(my_tweet)

2


[1, 0]