In [28]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from nltk.classify import ClassifierI
import numpy as np

## Importing the data

In [29]:
# Load the prepared sentiment dataset; the first CSV column is used as the index.
nyt = pd.read_csv(
    '../3. Sentiment Analysis preparation/sentiment_prepared.csv',
    index_col=0,
)
nyt.head(5)

Unnamed: 0,articleWordCount,headline,snippet,tokens,label_headline
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"['leadership', 'poised', 'topple', 'obama', 'p...",Positive
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"['fractured', 'world', 'tested', 'hope', 'youn...",Positive
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"['little', 'troublemakerschuck', 'deodene', 'p...",Neutral
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","['angela', 'merkel', 'russia', 'next', 'target...",Negative
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"['boots', 'stranger', 'buswitnessing', 'act', ...",Neutral


## New dataframe without neutral headlines

In [30]:
# Keep only the rows whose headline label is not 'Neutral' and renumber them from 0.
new_nyt = nyt.loc[nyt['label_headline'].ne('Neutral')].reset_index(drop=True).copy()

In [31]:
# Shorten the sentiment labels. The replacement is limited to the label column so that a
# headline or snippet whose whole text is 'Positive' or 'Negative' is left untouched.
new_nyt = new_nyt.assign(
    label_headline=new_nyt['label_headline'].replace({'Positive': 'pos', 'Negative': 'neg'})
)

In [32]:
# Preview the first five rows of the filtered frame.
new_nyt.head(5)

Unnamed: 0,articleWordCount,headline,snippet,tokens,label_headline
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"['leadership', 'poised', 'topple', 'obama', 'p...",pos
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"['fractured', 'world', 'tested', 'hope', 'youn...",pos
2,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","['angela', 'merkel', 'russia', 'next', 'target...",neg
3,557,Sprint and Mr. Trump’s Fictional Jobs,"The emerging, and dangerous, new form of crony...","['sprint', 'mr', 'trump', 'fictional', 'jobsth...",neg
4,1109,"Fighting Diabetes, and Leading by Example","Eric L. Adams, the Brooklyn borough president,...","['fighting', 'diabetes', 'leading', 'exampleer...",pos


## Functions

The first function turns the `tokens` column back into a list of individual words. The tokenisation was already done in the previous Jupyter Notebook, but saving to csv turned each list into its string representation.

The second step gathers all the words and their frequencies in a dictionary.

Finally, we obtain the most frequent words with a predefined function from the nltk library, and select the 3,000 most common.

In [33]:
def clean_list_str(text):
    """Turn the string form of a token list back into a list of words.

    The ``tokens`` column holds text such as ``"['boots', 'stranger']"`` after
    the round trip through csv. The text is parsed as a Python literal, so
    words containing commas or quotes survive intact and ``"[]"`` yields an
    empty list instead of a single empty word.

    Parameters
    ----------
    text : str
        String representation of a list of words.

    Returns
    -------
    list of str
        The individual words, in their original order.
    """
    import ast

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a valid literal (e.g. a truncated cell); fall back to manual splitting below.
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(word) for word in parsed]

    pieces = text.strip('[]').split(",")
    cleaned_text = [word.strip("'\" ") for word in pieces]
    # Drop the empty strings produced by an empty list or stray commas.
    return [word for word in cleaned_text if word]

In [34]:
# Tally how many times each word occurs across every article's token list.
all_words = {}

for raw_tokens in new_nyt.tokens:
    for term in clean_list_str(raw_tokens):
        all_words[term] = all_words.get(term, 0) + 1

In [35]:
# Wrap the raw counts in an nltk FreqDist; note that this rebinds `all_words`.
all_words = nltk.FreqDist(all_words)
# Keep the 3000 most common (word, count) pairs.
most_common_freq = all_words.most_common(3000)

In [36]:
# Drop the counts and keep just the words, in the same order.
most_common_words = [pair[0] for pair in most_common_freq]

## Find features

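Each element of this list is a tuple made of a feature dictionary and the label stating whether the piece of news has a positive or a negative approach. The dictionary maps every word of the article to whether it is among the most common words selected above.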

In [37]:
featuresets = []

# Membership tests against a set are constant-time; looking words up in the list
# rescanned up to 3000 entries for every single token.
common_vocabulary = set(most_common_words)

# Walk the two columns in parallel instead of indexing the frame row by row.
for tokens, label in zip(new_nyt['tokens'], new_nyt['label_headline']):
    # Transform str to list
    cleaned_tokens = clean_list_str(tokens)
    # Each token of the article maps to whether it belongs to the common vocabulary
    token_dict = {word: word in common_vocabulary for word in cleaned_tokens}
    # Save the (features, label) pair
    featuresets.append((token_dict, label))

Here we calculate 80% of the number of feature sets, which is the point where they are split into training and testing.

In [53]:
# 80% of the examples: the size of the training portion.
0.8 * len(featuresets)

4223.2

In [54]:
# Derive the cut point from the data rather than hard-coding it: the first 80% of the
# examples (rounded down) are used for training and the remainder for testing.
split_index = int(len(featuresets) * 0.8)
training_set = featuresets[:split_index]
testing_set = featuresets[split_index:]

## Naïve Bayes classifier

Training and result of the classifier

In [55]:
# Fit NLTK's Naive Bayes classifier on the training examples.
classifier = nltk.NaiveBayesClassifier.train(labeled_featuresets=training_set)

In [56]:
# Report the share of held-out examples classified correctly, as a percentage.
print(
    "Naive Bayes accuracy percent:",
    100 * nltk.classify.accuracy(classifier, testing_set),
)

Naive Bayes accuracy percent: 79.0719696969697


## Most informative features

These are the most informative features of the classifier. For each of them, the ratio shows how many times more likely the word is to appear under one sentiment label than under the other.

In [57]:
# List the twelve features with the most lopsided label likelihood ratios.
classifier.show_most_informative_features(n=12)

Most Informative Features
                  sexual = True              neg : pos    =     19.4 : 1.0
                favorite = True              pos : neg    =     19.2 : 1.0
                problems = True              neg : pos    =     17.3 : 1.0
                  attack = True              neg : pos    =     16.6 : 1.0
                 variety = True              pos : neg    =     16.1 : 1.0
              harassment = True              neg : pos    =     15.9 : 1.0
                  crisis = True              neg : pos    =     15.0 : 1.0
                   super = True              pos : neg    =     14.2 : 1.0
                 success = True              pos : neg    =     14.2 : 1.0
                   ready = True              pos : neg    =     12.9 : 1.0
                   death = True              neg : pos    =     11.6 : 1.0
                 scandal = True              neg : pos    =     11.6 : 1.0
