In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re, string, random

from nltk import download
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import math

stop_words = stopwords.words('english')

In [2]:
# download('punkt')
# download('wordnet')
# download('averaged_perceptron_tagger')
# download('stopwords')
# download('twitter_samples')

In [3]:
def remove_noise(tweet_tokens, stop_words = stop_words):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        lowered_token = token.lower()
                 
        if len(token) > 0 and token not in string.punctuation and lowered_token not in stop_words:
            cleaned_tokens.append(lowered_token)
    return cleaned_tokens

In [4]:
def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token

In [5]:
def get_tweets_for_model(tokens_list):
    for tweet_tokens in tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [6]:
duck_tweets_df = pd.read_csv('data/Donald-Tweets!.csv')
duck_tweets_df.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [7]:
temp = duck_tweets_df['Tweet_Text'].apply(lambda row: word_tokenize(row))

In [8]:
temp = duck_tweets_df['Tweet_Text'].apply(lambda row: remove_noise(word_tokenize(row)))

In [9]:
temp[0]

['today',
 'express',
 'deepest',
 'gratitude',
 'serve',
 'armed',
 'force',
 'thankavet',
 'http',
 '//t.co/wpk7qwpk8z']

In [10]:
duck_tweets_df['Retweets'].value_counts()

671      8
1187     8
677      7
1044     7
678      7
        ..
17058    1
2719     1
6813     1
2711     1
4098     1
Name: Retweets, Length: 4742, dtype: int64

In [11]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

In [12]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens))

In [13]:
all_pos_words = get_all_words(positive_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [14]:
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

In [15]:
positive_dataset

[({'#followfriday': True,
   'top': True,
   'engage': True,
   'member': True,
   'community': True,
   'week': True,
   ':)': True},
  'Positive'),
 ({'hey': True,
   'james': True,
   'odd': True,
   ':/': True,
   'please': True,
   'call': True,
   'contact': True,
   'centre': True,
   '02392441234': True,
   'able': True,
   'assist': True,
   ':)': True,
   'many': True,
   'thanks': True},
  'Positive'),
 ({'listen': True,
   'last': True,
   'night': True,
   ':)': True,
   'bleed': True,
   'amazing': True,
   'track': True,
   'scotland': True},
  'Positive'),
 ({'congrats': True, ':)': True}, 'Positive'),
 ({'yeaaaah': True,
   'yippppy': True,
   'accnt': True,
   'verify': True,
   'rqst': True,
   'succeed': True,
   'get': True,
   'blue': True,
   'tick': True,
   'mark': True,
   'fb': True,
   'profile': True,
   ':)': True,
   '15': True,
   'day': True},
  'Positive'),
 ({'one': True,
   'irresistible': True,
   ':)': True,
   '#flipkartfashionfriday': True},
  'P

In [16]:
positive_cutoff = int(math.floor(len(positive_dataset)*3/4))
negative_cutoff = int(math.floor(len(negative_dataset)*3/4))
train_data = positive_dataset[:positive_cutoff] + negative_dataset[:negative_cutoff]
test_data = positive_dataset[positive_cutoff:] + negative_dataset[negative_cutoff:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9916
Most Informative Features
                      :( = True           Negati : Positi =   2214.3 : 1.0
                      :) = True           Positi : Negati =   1073.8 : 1.0
                followed = True           Negati : Positi =     34.3 : 1.0
                follower = True           Positi : Negati =     25.8 : 1.0
                    glad = True           Positi : Negati =     25.7 : 1.0
                     x15 = True           Negati : Positi =     23.7 : 1.0
                  arrive = True           Positi : Negati =     22.2 : 1.0
                     sad = True           Negati : Positi =     21.7 : 1.0
                    sick = True           Negati : Positi =     19.7 : 1.0
               community = True           Positi : Negati =     16.3 : 1.0
None


In [17]:
def classify_tweet(tweet):
    custom_tokens = remove_noise(word_tokenize(tweet))
    return classifier.classify(dict([token, True] for token in custom_tokens))

In [18]:
duck_tweets_df['classifcation'] = duck_tweets_df.apply(lambda row: classify_tweet(row['Tweet_Text']), axis=1)

In [19]:
duck_tweets_df

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11,classifcation
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.970000e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,,Positive
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,,Positive
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,,Positive
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,,Negative
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7370,15-07-16,13:10:00,I loved firing goofball atheist Penn @pennjill...,text,,,6.220000e+17,https://twitter.com/realDonaldTrump/status/621...,953,431,,,Negative
7371,15-07-16,10:18:31,I hear @pennjillette show on Broadway is terri...,text,,,6.220000e+17,https://twitter.com/realDonaldTrump/status/621...,1175,1086,,,Negative
7372,15-07-16,10:10:17,Irrelevant clown @KarlRove sweats and shakes n...,text,,,6.220000e+17,https://twitter.com/realDonaldTrump/status/621...,1494,930,,,Negative
7373,15-07-16,9:44:07,"""@HoustonWelder: Donald Trump is one of the se...",text,,,6.220000e+17,https://twitter.com/realDonaldTrump/status/621...,1800,1738,,,Positive
