In [173]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.sentiment.util import *
from nltk.corpus import opinion_lexicon

In [174]:
nyt = pd.read_csv('../2. Data cleaning/nyt.csv')
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.


With a Lexikon we can label the sentiment of each headline and snippet of text. With Hu and Liu Sentiment Lexicon here a label is set to every headline, so it will be possible to train the model.

In [175]:
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.


## Tokenize data

In order to create a frequency list of the most used words, words need to be arranged in such a manner that we can get rid of those that convey no meaning.

In the first place 

In [176]:
def clean_up(s):
    s = re.sub('[^A-Za-z]+', ' ', s)
    return s.lower().strip()

In [177]:
def tokenize(s):
    s = s.split(' ')
    return s


In [178]:
def remove_stopwords(sent):
    filtered_words = []
    for word in sent:
        if word not in stopwords.words('english'):
            filtered_words.append(word)
    return filtered_words

In [179]:
def remove_one_character(sent):
    longer_words = []
    for word in sent:
        if len(word) != 1:
            longer_words.append(word)
    return longer_words

In [180]:
nyt['headline_token'] = nyt['headline'].apply(clean_up)

In [181]:
nyt['headline_token'] = nyt['headline_token'].apply(tokenize)

In [182]:
nyt['headline_token'] = nyt['headline_token'].apply(remove_stopwords)

In [183]:
nyt['headline_token'] = nyt['headline_token'].apply(remove_one_character)

In [184]:
nyt.head(10)

Unnamed: 0,articleWordCount,headline,snippet,headline_token
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillars]"
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid..."
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakers]"
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, target]"
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, bus]"
5,2180,"Molder of Navajo Youth, Where a Game Is Sacred","In a place of poverty, social ills and fractur...","[molder, navajo, youth, game, sacred]"
6,1146,"‘The Affair’ Season 3, Episode 6: Noah Goes Home","As usual, Noah’s story this week was dour.","[affair, season, episode, noah, goes, home]"
7,557,Sprint and Mr. Trump’s Fictional Jobs,"The emerging, and dangerous, new form of crony...","[sprint, mr, trump, fictional, jobs]"
8,784,America Becomes a Stan,Rule of law is for the little people.,"[america, becomes, stan]"
9,1109,"Fighting Diabetes, and Leading by Example","Eric L. Adams, the Brooklyn borough president,...","[fighting, diabetes, leading, example]"


In [185]:
nyt['snippet_token'] = nyt['snippet'].apply(clean_up)

In [186]:
nyt['snippet_token'] = nyt['snippet_token'].apply(tokenize)

In [187]:
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet,headline_token,snippet_token
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillars]","[the, most, powerful, and, ambitious, republic..."
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid...","[a, strategy, that, went, from, a, good, war, ..."
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakers]","[chuck, deodene, puts, us, in, a, bubbly, mood]"
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, target]","[with, a, friend, entering, the, white, house,..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, bus]","[witnessing, an, act, of, generosity, on, a, r..."


In [188]:
nyt['snippet_token'] = nyt['snippet_token'].apply(remove_stopwords)

In [189]:
nyt['snippet_token'] = nyt['snippet_token'].apply(remove_one_character)

In [190]:
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet,headline_token,snippet_token
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillars]","[powerful, ambitious, republican, led, congres..."
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid...","[strategy, went, good, war, shorthand, afghan,..."
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakers]","[chuck, deodene, puts, us, bubbly, mood]"
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, target]","[friend, entering, white, house, vladimir, put..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, bus]","[witnessing, act, generosity, rainy, day]"


In [192]:
nyt['tokens'] = nyt['headline_token'] + nyt['snippet_token']
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet,headline_token,snippet_token,tokens
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillars]","[powerful, ambitious, republican, led, congres...","[leadership, poised, topple, obama, pillars, p..."
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid...","[strategy, went, good, war, shorthand, afghan,...","[fractured, world, tested, hope, young, presid..."
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakers]","[chuck, deodene, puts, us, bubbly, mood]","[little, troublemakers, chuck, deodene, puts, ..."
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, target]","[friend, entering, white, house, vladimir, put...","[angela, merkel, russia, next, target, friend,..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, bus]","[witnessing, act, generosity, rainy, day]","[boots, stranger, bus, witnessing, act, genero..."


In [193]:
nyt.drop(columns=['headline_token', 'snippet_token'])

Unnamed: 0,articleWordCount,headline,snippet,tokens
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillars, p..."
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid..."
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakers, chuck, deodene, puts, ..."
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, target, friend,..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, bus, witnessing, act, genero..."
...,...,...,...,...
7663,785,Dear Match Book: Fiction that Takes Us Back to...,"From the bones of 15th-century monarchs, to Ne...","[dear, match, book, fiction, takes, us, back, ..."
7664,1004,This Common Question Reinforces the Gender Pay...,Several states and cities have ordered employe...,"[common, question, reinforces, gender, pay, ga..."
7665,1043,"Anna, Llama and Me","The beginning, middle and end of a picture boo...","[anna, llama, beginning, middle, end, picture,..."
7666,659,Gen. Michael Hayden Has One Regret: Russia,"The former N.S.A. and C.I.A. chief on Trump, S...","[gen, michael, hayden, one, regret, russia, fo..."


## Labeling

With a Lexikon we can label the sentiment of each headline and snippet of text. With Hu and Liu Sentiment Lexicon here a label is set to every headline, so it will be possible to train the model.

In [111]:
def simple_sentiment(text):
    #tokens = [word.lower() for word in word_tokenize(text)]
    pos_words = 0
    neg_words = 0
    
    for word in text:
        if word in opinion_lexicon.positive():
            pos_words += 1
            #y.append(1) # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            #y.append(-1) # negative
        #else:
            #y.append(0) # neutral

    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    elif pos_words == neg_words:
        return 'Neutral'

In [114]:
nyt['label_headline'] = nyt['tokens'].apply(simple_sentiment)

nyt.head(15)

KeyboardInterrupt: 

In [None]:
def stem_and_lemmatize(sent):
    stemmed_l = []
    lemmatized_l = []
    for word in sent:
        stemmed_l.append(ps.stem(word))
    for word in stemmed_l:
        lemmatized_l.append(lemmatizer.lemmatize(word))
    return lemmatized_l

In [None]:
nyt['headline_token'] = nyt['headline_token'].apply(stem_and_lemmatize)