In [211]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.sentiment.util import *
from nltk.corpus import opinion_lexicon

In [212]:
nyt = pd.read_csv('../2. Data cleaning/nyt.csv')
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.


With a Lexikon we can label the sentiment of each headline and snippet of text. With Hu and Liu Sentiment Lexicon here a label is set to every headline, so it will be possible to train the model.

In [213]:
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.


## Tokenize data

In order to create a frequency list of the most used words, words need to be arranged in such a manner that we can get rid of those that convey no meaning.

In the first place 

In [214]:
def clean_up(s):
    s = re.sub('[^A-Za-z]+', ' ', s)
    return s.lower().strip()

In [215]:
def tokenize(s):
    s = s.split(' ')
    return s


In [216]:
def remove_stopwords(sent):
    filtered_words = []
    for word in sent:
        if word not in stopwords.words('english'):
            filtered_words.append(word)
    return filtered_words

In [217]:
def remove_one_character(sent):
    longer_words = []
    for word in sent:
        if len(word) != 1:
            longer_words.append(word)
    return longer_words

In [218]:
nyt['headline_token'] = nyt['headline'].apply(clean_up)

In [219]:
nyt['snippet_token'] = nyt['snippet'].apply(clean_up)

In [220]:
nyt['tokens'] = nyt['headline_token'] + nyt['snippet_token']
nyt.head()

Unnamed: 0,articleWordCount,headline,snippet,headline_token,snippet_token,tokens
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,g o p leadership poised to topple obama s pillars,the most powerful and ambitious republican led...,g o p leadership poised to topple obama s pill...
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,fractured world tested the hope of a young pre...,a strategy that went from a good war to the sh...,fractured world tested the hope of a young pre...
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,little troublemakers,chuck deodene puts us in a bubbly mood,little troublemakerschuck deodene puts us in a...
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...",angela merkel russia s next target,with a friend entering the white house vladimi...,angela merkel russia s next targetwith a frien...
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,boots for a stranger on a bus,witnessing an act of generosity on a rainy day,boots for a stranger on a buswitnessing an act...


In [221]:
nyt.drop(columns=['headline_token', 'snippet_token'], inplace= True)

In [222]:
nyt['tokens'] = nyt['tokens'].apply(tokenize)

In [223]:
nyt['tokens'] = nyt['tokens'].apply(remove_stopwords)

In [224]:
nyt['tokens'] = nyt['tokens'].apply(remove_one_character)

In [225]:
nyt.head(10)

Unnamed: 0,articleWordCount,headline,snippet,tokens
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillarsthe..."
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid..."
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakerschuck, deodene, puts, us..."
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, targetwith, fri..."
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, buswitnessing, act, generosi..."
5,2180,"Molder of Navajo Youth, Where a Game Is Sacred","In a place of poverty, social ills and fractur...","[molder, navajo, youth, game, sacredin, place,..."
6,1146,"‘The Affair’ Season 3, Episode 6: Noah Goes Home","As usual, Noah’s story this week was dour.","[affair, season, episode, noah, goes, homeas, ..."
7,557,Sprint and Mr. Trump’s Fictional Jobs,"The emerging, and dangerous, new form of crony...","[sprint, mr, trump, fictional, jobsthe, emergi..."
8,784,America Becomes a Stan,Rule of law is for the little people.,"[america, becomes, stanrule, law, little, people]"
9,1109,"Fighting Diabetes, and Leading by Example","Eric L. Adams, the Brooklyn borough president,...","[fighting, diabetes, leading, exampleeric, ada..."


## Labeling

With a Lexikon we can label the sentiment of each headline and snippet of text. With Hu and Liu Sentiment Lexicon here a label is set to every headline, so it will be possible to train the model.

In [233]:
def simple_sentiment(text):
    #tokens = [word.lower() for word in word_tokenize(text)]
    pos_words = 0
    neg_words = 0
    
    for word in text:
        if word in opinion_lexicon.positive():
            pos_words += 1
            #y.append(1) # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            #y.append(-1) # negative
        #else:
            #y.append(0) # neutral

    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    elif pos_words == neg_words:
        return 'Neutral'

In [234]:
nyt['label_headline'] = nyt['tokens'].apply(simple_sentiment)

nyt.head(15)

Unnamed: 0,articleWordCount,headline,snippet,tokens,snippet_token,label_headline
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillarsthe...","[powerful, ambitious, republican, led, congres...",Positive
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid...","[strategy, went, good, war, shorthand, afghan,...",Positive
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakerschuck, deodene, puts, us...","[chuck, deodene, puts, us, bubbly, mood]",Neutral
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, targetwith, fri...","[friend, entering, white, house, vladimir, put...",Negative
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, buswitnessing, act, generosi...","[witnessing, act, generosity, rainy, day]",Neutral
5,2180,"Molder of Navajo Youth, Where a Game Is Sacred","In a place of poverty, social ills and fractur...","[molder, navajo, youth, game, sacredin, place,...","[place, poverty, social, ills, fractured, fami...",Neutral
6,1146,"‘The Affair’ Season 3, Episode 6: Noah Goes Home","As usual, Noah’s story this week was dour.","[affair, season, episode, noah, goes, homeas, ...","[usual, noah, story, week, dour]",Neutral
7,557,Sprint and Mr. Trump’s Fictional Jobs,"The emerging, and dangerous, new form of crony...","[sprint, mr, trump, fictional, jobsthe, emergi...","[emerging, dangerous, new, form, crony, capita...",Negative
8,784,America Becomes a Stan,Rule of law is for the little people.,"[america, becomes, stanrule, law, little, people]","[rule, law, little, people]",Neutral
9,1109,"Fighting Diabetes, and Leading by Example","Eric L. Adams, the Brooklyn borough president,...","[fighting, diabetes, leading, exampleeric, ada...","[eric, adams, brooklyn, borough, president, sa...",Positive


In [235]:
nyt_clean = nyt.drop(columns=['snippet_token'])
nyt_clean.head()

Unnamed: 0,articleWordCount,headline,snippet,tokens,label_headline
0,1324,G.O.P. Leadership Poised to Topple Obama’s Pi...,The most powerful and ambitious Republican-led...,"[leadership, poised, topple, obama, pillarsthe...",Positive
1,2836,Fractured World Tested the Hope of a Young Pre...,A strategy that went from a “good war” to the ...,"[fractured, world, tested, hope, young, presid...",Positive
2,445,Little Troublemakers,Chuck Deodene puts us in a bubbly mood.,"[little, troublemakerschuck, deodene, puts, us...",Neutral
3,864,"Angela Merkel, Russia’s Next Target","With a friend entering the White House, Vladim...","[angela, merkel, russia, next, targetwith, fri...",Negative
4,309,Boots for a Stranger on a Bus,Witnessing an act of generosity on a rainy day.,"[boots, stranger, buswitnessing, act, generosi...",Neutral


In [237]:
nyt_clean.to_csv('./sentiment_prepared.csv')