In [352]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import re
import nltk

In [353]:
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


In [354]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/harshkulkarni/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [355]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets=twitter_samples.strings('negative_tweets.json')

In [356]:
len(all_positive_tweets)

5000

In [357]:
all_positive_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [358]:
#train test split
x_train=all_positive_tweets[:4000]+all_negative_tweets[:4000]
x_test=all_positive_tweets[4000:]+all_negative_tweets[4000:]

In [359]:
y_train=np.append(np.ones((4000,1)),np.zeros((4000,1)),axis=0)
y_test=np.append(np.ones((1000,1)),np.zeros((1000,1)),axis=0)

In [360]:
import string

In [361]:
def process_tweet(tweet):
    stemmer=PorterStemmer()
    tockeniser=TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    #tockenising
    tockens=tockeniser.tokenize(tweet)
    stopwords_english=stopwords.words('english')
    tweet_clean=[]
    for word in tockens:
        if(word not in stopwords_english and word not in string.punctuation):
            #stemming
            word=stemmer.stem(word)
            tweet_clean.append(word)
    return tweet_clean


In [362]:
#Calculating Setniment Analysis
def buid_freq(tweets,target):
    freq={}
    ys=np.squeeze(target).tolist()
    for tweet,y in zip(tweets,ys):
        for word in process_tweet(tweet):
            pair=(word,y)
            if(pair in freq):
                freq[pair]+=1
            else:
                freq[pair]=1
    return freq;


In [363]:
freqs=buid_freq(x_train,y_train)

In [364]:
def extract_features(tweet, frequencies):
    words=process_tweet(tweet)
    x = np.zeros((1, 3)) 
    x[0,0]=1
    for word in words:
        x[0][1]+=frequencies.get((word,1),0)
        x[0][2]+=frequencies.get((word,0),0)
    return x

In [365]:
pre_processed=np.zeros((len(x_train),3))
for i in range(len(x_train)):
    tweet=x_train[i]
    features=extract_features(tweet,freqs)
    pre_processed[i][0]=features[0][0]
    pre_processed[i][1]=features[0][1]
    pre_processed[i][2]=features[0][2]
    

In [366]:
pre_processed

array([[1.000e+00, 3.020e+03, 6.100e+01],
       [1.000e+00, 3.573e+03, 4.440e+02],
       [1.000e+00, 3.005e+03, 1.150e+02],
       ...,
       [1.000e+00, 1.440e+02, 7.830e+02],
       [1.000e+00, 2.050e+02, 3.890e+03],
       [1.000e+00, 1.890e+02, 3.974e+03]])

In [367]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score 

In [368]:
model=LogisticRegression(random_state=0).fit(pre_processed,y_train)

  y = column_or_1d(y, warn=True)


In [369]:
test=np.zeros((len(x_test),3))
for i in range(len(x_test)):
    tweet=x_test[i]
    features=extract_features(tweet,freqs)
    test[i][0]=features[0][0]
    test[i][1]=features[0][1]
    test[i][2]=features[0][2]



In [370]:
y_predict=model.predict(test)

In [371]:
acc_score=accuracy_score(y_predict,y_test)
acc_score*100


99.15

In [372]:
confusion_matrix(y_predict,y_test)

array([[988,   5],
       [ 12, 995]])

In [373]:

##predicting sentiment of your won tweet 
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
my_freq={}
print( process_tweet(my_tweet))

for word in process_tweet(my_tweet):
    pair=(word,0)
    if(pair in my_freq):
        my_freq[pair]+=1
    else:
        my_freq[pair]=1
my_test=np.zeros((1,3))
my_features=extract_features(my_tweet,freqs)
my_features

['ridicul', 'bright', 'movi', 'plot', 'terribl', 'sad', 'end']


array([[  1.,  38., 170.]])

In [374]:
y_p = model.predict(my_features)
y_p
# acc_score = accuracy_score(y_predict, [0])
# acc_scor

array([0.])