In [329]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import re
import nltk

In [330]:
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


In [331]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/harshkulkarni/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [332]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets=twitter_samples.strings('negative_tweets.json')

In [333]:
len(all_positive_tweets)

5000

In [334]:
all_positive_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [335]:
#train test split
x_train=all_positive_tweets[:4000]+all_negative_tweets[:4000]
x_test=all_positive_tweets[4000:]+all_negative_tweets[4000:]

In [336]:
y_train=np.append(np.ones((4000,1)),np.zeros((4000,1)),axis=0)
y_test=np.append(np.ones((1000,1)),np.zeros((1000,1)),axis=0)

In [337]:
import string

In [338]:
def process_tweet(tweet):
    stemmer=PorterStemmer()
    tockeniser=TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    #tockenising
    tockens=tockeniser.tokenize(tweet)
    stopwords_english=stopwords.words('english')
    tweet_clean=[]
    for word in tockens:
        if(word not in stopwords_english and word not in string.punctuation):
            #stemming
            word=stemmer.stem(word)
            tweet_clean.append(word)
    return tweet_clean


In [339]:
#Calculating Setniment Analysis
def buid_freq(tweets,target):
    freq={}
    ys=np.squeeze(target).tolist()
    for tweet,y in zip(tweets,ys):
        for word in process_tweet(tweet):
            pair=(word,y)
            if(pair in freq):
                freq[pair]+=1
            else:
                freq[pair]=1
    return freq;


In [340]:
freqs=buid_freq(x_train,y_train)
freqs

{('followfriday', 1.0): 23,
 ('top', 1.0): 30,
 ('engag', 1.0): 7,
 ('member', 1.0): 14,
 ('commun', 1.0): 27,
 ('week', 1.0): 72,
 (':)', 1.0): 2847,
 ('hey', 1.0): 60,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 80,
 ('call', 1.0): 27,
 ('contact', 1.0): 4,
 ('centr', 1.0): 1,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 6,
 ('assist', 1.0): 1,
 ('mani', 1.0): 28,
 ('thank', 1.0): 504,
 ('listen', 1.0): 14,
 ('last', 1.0): 39,
 ('night', 1.0): 55,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 41,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('congrat', 1.0): 15,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 57,
 ('blue', 1.0): 8,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 4,
 ('profil', 1.0): 2,
 ('15', 1.0): 4,
 ('day', 1.0): 187,
 ('one', 1.0): 90,
 ('irresist', 1.0): 2,
 ('flipkartfashionfriday', 1.0): 16,
 ('like', 1.0): 187,
 ('keep', 1.0): 55,
 ('love', 1.0): 336,
 

In [341]:
def extract_features(tweet, frequencies):
    words=process_tweet(tweet)
    x = np.zeros((1, 3)) 
    x[0,0]=1
    for word in words:
        x[0][1]+=frequencies.get((word,1),0)
        x[0][2]+=frequencies.get((word,0),0)
    return x

In [342]:
pre_processed=np.zeros((len(x_train),3))
for i in range(len(x_train)):
    tweet=x_train[i]
    features=extract_features(tweet,freqs)
    pre_processed[i][0]=features[0][0]
    pre_processed[i][1]=features[0][1]
    pre_processed[i][2]=features[0][2]
    

In [343]:
pre_processed

array([[1.000e+00, 3.020e+03, 6.100e+01],
       [1.000e+00, 3.573e+03, 4.440e+02],
       [1.000e+00, 3.005e+03, 1.150e+02],
       ...,
       [1.000e+00, 1.440e+02, 7.830e+02],
       [1.000e+00, 2.050e+02, 3.890e+03],
       [1.000e+00, 1.890e+02, 3.974e+03]])

In [344]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score 

In [345]:
model=LogisticRegression(random_state=0).fit(pre_processed,y_train)

  y = column_or_1d(y, warn=True)


In [346]:
test=np.zeros((len(x_test),3))
for i in range(len(x_test)):
    tweet=x_test[i]
    features=extract_features(tweet,freqs)
    test[i][0]=features[0][0]
    test[i][1]=features[0][1]
    test[i][2]=features[0][2]



In [347]:
y_predict=model.predict(test)

In [348]:
acc_score=accuracy_score(y_predict,y_test)
acc_score*100


99.15

In [349]:
confusion_matrix(y_predict,y_test)

array([[988,   5],
       [ 12, 995]])

In [350]:

##predicting sentiment of your won tweet 
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
my_freq={}
print( process_tweet(my_tweet))

for word in process_tweet(my_tweet):
    pair=(word,0)
    if(pair in my_freq):
        my_freq[pair]+=1
    else:
        my_freq[pair]=1
my_test=np.zeros((1,3))
my_features=extract_features(my_tweet,freqs)
my_features

['ridicul', 'bright', 'movi', 'plot', 'terribl', 'sad', 'end']


array([[  1.,  38., 170.]])

In [351]:
y_p = model.predict(my_features)
y_p
# acc_score = accuracy_score(y_predict, [0])
# acc_scor

array([0.])