# Import NLTK and Download Data:

In [1]:
import nltk
# Make sure the twitter_samples corpus is available locally.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [2]:
from nltk.corpus import twitter_samples

In [3]:
# Load the tweets of each sample file as strings.
# NOTE(review): none of these three names is referenced again in the visible
# cells (later cells use twitter_samples.tokenized instead) — confirm whether
# they are still needed.
positive_tweeet = twitter_samples.strings('positive_tweets.json')
negative_tweeet = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [4]:
# Make sure the 'punkt' NLTK data package is available locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data Preprocessing:

# Tokenization

In [5]:
# Keep the token list of the first positive tweet as a running example.
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
# Show the first token of that tweet.
print(tweet_tokens[0])

#FollowFriday


# Normalization

In [6]:
# Debug step added while troubleshooting an error that appeared when
# downloading wordnet and averaged_perceptron_tagger: look up the address
# info for localhost:8080.
import socket
socket.getaddrinfo('localhost', 8080)

[(<AddressFamily.AF_INET6: 23>, 0, 0, '', ('::1', 8080, 0, 0)),
 (<AddressFamily.AF_INET: 2>, 0, 0, '', ('127.0.0.1', 8080))]

In [7]:
import nltk
# Make sure the data packages used by the lemmatizer (wordnet) and by
# pos_tag (averaged_perceptron_tagger) are available locally.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
from nltk.tag import pos_tag
# Print the (token, tag) pairs that pos_tag assigns to the example tweet.
print(pos_tag(tweet_tokens))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [10]:
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatize_tweet(tokens):
    """Return the WordNet lemma of every token in a tweet.

    The tokens are POS-tagged first; the tag selects the part of speech
    handed to the lemmatizer: tags starting with 'NN' map to 'n', tags
    starting with 'VB' map to 'v', and every other tag maps to 'a'.

    Args:
        tokens: Sequence of token strings for one tweet.

    Returns:
        List with one lemmatized token per input token, in input order.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Collapse the tagger's tag into the three classes used here.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [wordnet.lemmatize(word, wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]
# Try the lemmatizer on the example tweet.
print(lemmatize_tweet(tweet_tokens))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


# Removing Noise

In [11]:
import re,string
def removie_noise(tweet_tokens,stop_words = ()):
    """Clean and normalize the tokens of a single tweet.

    Every token is POS-tagged, stripped of URLs and @mentions, lemmatized
    with WordNet and lowercased. Tokens that end up empty, that are
    punctuation, or whose lowercase form is a stop word are dropped.

    Args:
        tweet_tokens: Sequence of token strings for one tweet.
        stop_words: Iterable of lowercase words to discard. Defaults to an
            empty tuple, i.e. no stop-word filtering.

    Returns:
        List of cleaned, lowercased tokens in their original order.
    """
    # Raw strings: the pattern contains regex escapes such as \( that are not
    # valid Python string escapes and trigger a warning in a normal literal.
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = r"(@[A-Za-z0-9_]+)"

    # Loop-invariant setup: one lemmatizer for the whole tweet and a set for
    # constant-time stop-word lookups (the caller typically passes a list).
    stop_set = frozenset(stop_words)
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    # Tagging happens on the untouched tokens, before URLs/mentions are removed.
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(mention_pattern, '', token)

        # Map the tagger's tag to the part of speech the lemmatizer expects;
        # anything that is neither a noun nor a verb is treated as 'a'.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        lowered = token.lower()
        # `in string.punctuation` is a substring test on the punctuation
        # string: single punctuation marks are dropped, while multi-character
        # tokens such as ':)' or '...' are kept (unless they happen to be a
        # contiguous run of that string). This is deliberate: emoticons are
        # strong sentiment features.
        if token and token not in string.punctuation and lowered not in stop_set:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [12]:
# Make sure the stopwords corpus is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords
# English stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words('english')
# Clean the example tweet, dropping stop words as well.
print(removie_noise(tweet_tokens,stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [14]:
# Tokenized tweets of each sentiment class.
positive_tweeet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweeet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Run every tweet through the cleaning step, with stop words removed.
positive_cleaned_tokens_list = [
    removie_noise(tweet, stop_words) for tweet in positive_tweeet_tokens
]
negative_cleaned_tokens_list = [
    removie_noise(tweet, stop_words) for tweet in negative_tweeet_tokens
]

In [16]:
# Compare one positive tweet before and after cleaning.
print(positive_tweeet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


# Word Density (Positive Tweets)

In [17]:
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from a list of token lists.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        Each token, tweet by tweet, in the order it appears.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet
# get_all_words is a generator function, so this object can be iterated
# only once; re-run this line before consuming it a second time.
all_positive_words = get_all_words(positive_cleaned_tokens_list)

In [18]:
from nltk import FreqDist
# Count how often each cleaned token occurs across the positive tweets.
freq_dist_positive = FreqDist(all_positive_words)
# Show the ten most frequent tokens with their counts.
print(freq_dist_positive.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


# Preparing Data and Model:

# Tokens to Dictionary

In [19]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily convert token lists into feature dictionaries.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        One dict per tweet that maps each of its tokens to True.
    """
    for tokens in cleaned_tokens_list:
        yield dict.fromkeys(tokens, True)
# Both are generators (get_tweets_for_model uses yield), so each can be
# consumed only once.
model_positive_tokens = get_tweets_for_model(positive_cleaned_tokens_list)
model_negative_tokens = get_tweets_for_model(negative_cleaned_tokens_list)

# Splitting the Dataset for Training and Testing

In [20]:
import random
# Fixed seed so the shuffle, and with it the train/test split and the
# accuracy reported below, is the same on every run.
SHUFFLE_SEED = 42
# Number of shuffled examples used for training; the rest is held out.
TRAIN_SIZE = 7000

# Attach the class label to every feature dictionary.
positive_dataset = [(tweet_dict,"Positive") for tweet_dict in model_positive_tokens]
negative_dataset = [(tweet_dict,"Negative") for tweet_dict in model_negative_tokens]
dataset = positive_dataset + negative_dataset

# Shuffle in place with a private, seeded generator so the two classes are
# mixed without touching the global random state.
random.Random(SHUFFLE_SEED).shuffle(dataset)
train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

# Build and Test the Model:

In [21]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Train a Naive Bayes classifier on the training split.
classifier = NaiveBayesClassifier.train(train_data)
# Evaluate it on the held-out test split.
print("Accuracy is:",classify.accuracy(classifier,test_data))
# show_most_informative_features prints its table itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
classifier.show_most_informative_features(10)

Accuracy is: 0.9936666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2068.7 : 1.0
                     sad = True           Negati : Positi =     55.9 : 1.0
                follower = True           Positi : Negati =     36.2 : 1.0
                     bam = True           Positi : Negati =     20.3 : 1.0
                  arrive = True           Positi : Negati =     18.1 : 1.0
                     x15 = True           Negati : Positi =     17.7 : 1.0
                followed = True           Negati : Positi =     14.7 : 1.0
                 welcome = True           Positi : Negati =     14.1 : 1.0
               community = True           Positi : Negati =     13.6 : 1.0
                    miss = True           Negati : Positi =     11.8 : 1.0
None


In [22]:
from nltk.tokenize import word_tokenize
# Classify a hand-written tweet with the trained model.
# NOTE(review): the training tokens came from twitter_samples.tokenized,
# whereas this uses word_tokenize — confirm both split text the same way.
customer_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
customer_tokens = removie_noise(word_tokenize(customer_tweet))
print(classifier.classify({token: True for token in customer_tokens}))

Negative


In [23]:
from nltk.tokenize import word_tokenize
# Second hand-written example, this one containing hashtags and an emoticon.
# NOTE(review): the training tokens came from twitter_samples.tokenized,
# whereas this uses word_tokenize — confirm hashtags and ':)' are kept as
# single tokens here too.
customer_tweet1 = "Congrats #SportStar on your 7th best goal from last season winning goal of the year :) #Baller #Topbin #oneofmanyworldies"
customer_tokens1 = removie_noise(word_tokenize(customer_tweet1))
print(classifier.classify(dict.fromkeys(customer_tokens1, True)))

Positive


In [24]:
from nltk.tokenize import word_tokenize
# Third hand-written example: negative wording combined with a ':)' emoticon.
# NOTE(review): the training tokens came from twitter_samples.tokenized,
# whereas this uses word_tokenize — confirm ':)' survives as one token,
# otherwise the emoticon cannot influence the prediction.
customer_tweet3 = "i feel sick today :) #sickness #sick leave "
customer_tokens3 = removie_noise(word_tokenize(customer_tweet3))
print(classifier.classify({token: True for token in customer_tokens3}))

Negative
