In [107]:
# --- Standard library ---
from datetime import datetime

# --- Third-party ---
import nltk
import pandas as pd
from afinn import Afinn
from nltk import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment import SentimentIntensityAnalyzer

# --- NLTK resources ---
# Fetch the lexicons/models used by the NLTK components imported above.
# NLTK reports a package as "already up-to-date" when it is installed locally.
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ange/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/ange/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ange/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [108]:
# Load the preprocessed news-article table; the path is relative to the
# notebook's working directory.
df = pd.read_csv("news_data/news_data_preprocessed.csv")

In [109]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242020 entries, 0 to 242019
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   abstract          241962 non-null  object
 1   web_url           242020 non-null  object
 2   snippet           241691 non-null  object
 3   pub_date          242020 non-null  object
 4   document_type     242020 non-null  object
 5   news_desk         233939 non-null  object
 6   type_of_material  239295 non-null  object
 7   word_count        242020 non-null  int64 
 8   headline.main     242007 non-null  object
 9   headline_lemmas   242020 non-null  object
 10  snippet_lemmas    242020 non-null  object
dtypes: int64(1), object(10)
memory usage: 20.3+ MB


In [110]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,abstract,web_url,snippet,pub_date,document_type,news_desk,type_of_material,word_count,headline.main,headline_lemmas,snippet_lemmas
0,President Trump’s aides were delighted that ne...,https://www.nytimes.com/2019/11/30/us/politics...,President Trump’s aides were delighted that ne...,2019-12-01T00:15:08+0000,article,Washington,News,1341,A Leak-Prone White House Finally Manages to Ke...,"['a', 'white', 'house', 'finally', 'manages', ...","['president', 'trump', 'aide', 'delighted', 'n..."
1,The Buckeyes quarterback threw one of his four...,https://www.nytimes.com/2019/11/30/sports/ncaa...,The Buckeyes quarterback threw one of his four...,2019-12-01T00:33:57+0000,article,Sports,News,727,Justin Fields Has ‘Heisman Moment’ in Ohio Sta...,"['justin', 'field', 'ha', 'heisman', 'moment',...","['the', 'buckeye', 'quarterback', 'threw', 'on..."
2,A personal loss has prompted Eileen Shiffrin t...,https://www.nytimes.com/2019/11/30/sports/skii...,A personal loss has prompted Eileen Shiffrin t...,2019-12-01T00:37:15+0000,article,Sports,News,762,Mikaela Shiffrin Learns a New Way to Win: With...,"['mikaela', 'shiffrin', 'learns', 'new', 'way'...","['a', 'personal', 'loss', 'prompted', 'eileen'..."
3,The actor and environmentalist released a stat...,https://www.nytimes.com/2019/11/30/world/ameri...,The actor and environmentalist released a stat...,2019-12-01T01:03:22+0000,article,Express,News,540,Leonardo DiCaprio Responds to Brazil’s Preside...,"['leonardo', 'dicaprio', 'responds', 'brazil',...","['the', 'actor', 'environmentalist', 'released..."
4,A blocked pipe caused sewage to back up severa...,https://www.nytimes.com/2019/11/30/nyregion/Qu...,A blocked pipe caused sewage to back up severa...,2019-12-01T01:47:35+0000,article,Express,News,781,Cooking Grease Down a Drain Eyed in Sewage Flo...,"['cooking', 'grease', 'down', 'drain', 'eyed',...","['a', 'blocked', 'pipe', 'caused', 'sewage', '..."


# VADER

In [111]:
# Single VADER analyzer instance, reused for every text scored below.
sia = SentimentIntensityAnalyzer()

In [112]:
# VADER compound score for every snippet.
# astype(str) stringifies missing snippets before they reach the analyzer,
# so every row receives a score.
df['snippet_VADER'] = [
    sia.polarity_scores(text)['compound']
    for text in df['snippet'].astype(str)
]

In [113]:
# VADER compound score for every headline.
# astype(str) stringifies missing headlines before they reach the analyzer,
# so every row receives a score.
df['headline_VADER'] = [
    sia.polarity_scores(text)['compound']
    for text in df['headline.main'].astype(str)
]

# AFINN

In [93]:
afinn = Afinn()


def _afinn_total(lemmas):
    """Return the summed AFINN valence for one row of a lemma column.

    Parameters
    ----------
    lemmas : str or iterable of str
        Either the textual form of a lemma list (what a CSV round-trip
        produces, e.g. "['a', 'white', 'house']") or an actual list of lemmas.

    Returns
    -------
    float
        Total AFINN score for the row; 0.0 for an empty list.
    """
    if isinstance(lemmas, str):
        # Series.explode() leaves plain strings untouched, so the previous
        # explode -> score -> groupby-sum pipeline already scored this text
        # as one unit; do that directly instead of through no-op steps.
        # NOTE(review): relies on Afinn.score tokenizing the text itself — confirm.
        return afinn.score(lemmas)
    # Real list of lemmas: score word by word and add the results up.
    return float(sum(afinn.score(word) for word in lemmas))


df["snippet_AFINN"] = df["snippet_lemmas"].map(_afinn_total)
df["headline_AFINN"] = df["headline_lemmas"].map(_afinn_total)

In [94]:
# Summary statistics of the numeric columns (word count and sentiment scores).
df.describe()

Unnamed: 0,word_count,snippet_VADER,headline_VADER,snippet_AFINN,headline_AFINN
count,242020.0,242020.0,242020.0,242020.0,242020.0
mean,817.467734,0.029682,-0.023386,-0.05331,-0.191034
std,676.706438,0.424378,0.33094,2.694871,1.949935
min,0.0,-0.9847,-0.9776,-18.0,-13.0
25%,396.0,-0.25,-0.0772,-1.0,-1.0
50%,791.0,0.0,0.0,0.0,0.0
75%,1130.0,0.3612,0.0,1.0,0.0
max,27976.0,0.9938,0.9432,33.0,14.0


# Aggregate

In [114]:
# Parse the publication timestamps into timezone-aware datetimes.
# The result is stored as a regular column; the frame's index is left unchanged.
df["datetime"] = pd.to_datetime(df["pub_date"])

In [97]:
# Daily mean and standard deviation of article length and of each sentiment score.
#
# The timestamps live in the 'datetime' column rather than in the index, so the
# column is passed through `on=`; resampling on the frame's integer index would
# raise "TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex".
# Days without any article are kept and come out as NaN rows.
df_agg = (
    df.resample('D', on='datetime')
      .agg({
          'word_count': ['mean', 'std'],
          'snippet_VADER': ['mean', 'std'],
          'headline_VADER': ['mean', 'std'],
          'snippet_AFINN': ['mean', 'std'],
          'headline_AFINN': ['mean', 'std'],
      })
      .reset_index()
)

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
# The timestamp column has an empty second level and therefore becomes "datetime_".
df_agg.columns = [f"{col[0]}_{col[1]}" for col in df_agg.columns]

In [101]:
# Display the daily aggregate table.
df_agg

Unnamed: 0,datetime_,word_count_mean,word_count_std,snippet_VADER_mean,snippet_VADER_std,headline_VADER_mean,headline_VADER_std,snippet_AFINN_mean,snippet_AFINN_std,headline_AFINN_mean,headline_AFINN_std
0,2007-07-07 00:00:00+00:00,163.500000,155.532240,0.072180,0.152487,0.089240,0.190568,0.400000,0.966092,0.300000,0.674949
1,2007-07-08 00:00:00+00:00,,,,,,,,,,
2,2007-07-09 00:00:00+00:00,,,,,,,,,,
3,2007-07-10 00:00:00+00:00,,,,,,,,,,
4,2007-07-11 00:00:00+00:00,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
4556,2019-12-27 00:00:00+00:00,921.179487,656.869950,0.092650,0.411484,0.010347,0.393647,0.589744,2.639729,-0.059829,2.248718
4557,2019-12-28 00:00:00+00:00,974.980392,649.728666,0.028182,0.452458,-0.088896,0.355121,-0.470588,2.886887,-0.666667,2.268627
4558,2019-12-29 00:00:00+00:00,944.100000,669.962804,-0.077767,0.460476,-0.038853,0.377736,-0.683333,2.965860,-0.416667,1.924786
4559,2019-12-30 00:00:00+00:00,988.185185,699.355540,-0.025621,0.467592,-0.066853,0.365315,-0.814815,3.309991,-0.370370,2.198524


In [115]:
# Inspect the parsed timestamps (values and dtype).
df["datetime"]

0        2019-12-01 00:15:08+00:00
1        2019-12-01 00:33:57+00:00
2        2019-12-01 00:37:15+00:00
3        2019-12-01 01:03:22+00:00
4        2019-12-01 01:47:35+00:00
                    ...           
242015   2016-01-31 23:09:07+00:00
242016   2016-01-31 23:25:58+00:00
242017   2016-01-31 23:35:00+00:00
242018   2016-01-31 23:53:20+00:00
242019   2016-01-31 23:55:29+00:00
Name: datetime, Length: 242020, dtype: datetime64[ns, UTC]

In [116]:
# Order the rows chronologically, modifying `df` in place.
# The index is not reset, so rows keep their original labels.
df.sort_values(by='datetime', inplace=True)


In [117]:
# Display the frame after sorting (the notebook shows a truncated view).
df

Unnamed: 0,abstract,web_url,snippet,pub_date,document_type,news_desk,type_of_material,word_count,headline.main,headline_lemmas,snippet_lemmas,snippet_VADER,headline_VADER,datetime
137554,It was a long antipodean night. While there’s ...,https://artsbeat.blogs.nytimes.com/2007/07/07/...,,2007-07-07T14:46:20+0000,article,Culture,News,266,Live Earth: Earnest Crooners and a “Supersonic...,"['live', 'earth', 'earnest', 'crooner', 'super...",['nan'],0.0000,0.5106,2007-07-07 14:46:20+00:00
137555,,https://artsbeat.blogs.nytimes.com/2007/07/07/...,,2007-07-07T14:46:54+0000,article,Culture,News,1,"Live Earth: Shakira, Shakira","['live', 'earth', 'shakira', 'shakira']",['nan'],0.0000,0.0000,2007-07-07 14:46:54+00:00
137556,A record-breaking heat wave in the Southwest i...,https://www.nytimes.com/slideshow/2007/07/07/u...,A record-breaking heat wave in the Southwest i...,2007-07-07T15:06:00+0000,multimedia,U.S.,Slideshow,0,Heat Wave,"['heat', 'wave']","['a', 'heat', 'wave', 'southwest', 'pushing', ...",0.0000,0.0000,2007-07-07 15:06:00+00:00
137557,,https://artsbeat.blogs.nytimes.com/2007/07/07/...,,2007-07-07T15:12:39+0000,article,Culture,News,1,Live Earth: Rocking Out Around the World,"['live', 'earth', 'rocking', 'out', 'around', ...",['nan'],0.0000,0.0000,2007-07-07 15:12:39+00:00
137558,The Lewis Hamilton effect is felt everywhere. ...,https://rendezvous.blogs.nytimes.com/2007/07/0...,,2007-07-07T15:48:45+0000,article,,News,391,"Hamilton, Hamilton, Hamilton","['hamilton', 'hamilton', 'hamilton']",['nan'],0.0000,0.0000,2007-07-07 15:48:45+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3980,Chief Justice John Roberts’s year-end report o...,https://www.nytimes.com/2019/12/31/us/john-rob...,Chief Justice John Roberts’s year-end report o...,2019-12-31T23:00:07+0000,article,Washington,News,931,"Impeachment Trial Looming, Chief Justice Refle...","['impeachment', 'trial', 'looming', 'chief', '...","['chief', 'justice', 'john', 'robert', 'report...",0.5781,0.4404,2019-12-31 23:00:07+00:00
3981,"Zaosong Zheng, a promising cancer researcher, ...",https://www.nytimes.com/2019/12/31/us/chinese-...,"Zaosong Zheng, a promising cancer researcher, ...",2019-12-31T23:01:57+0000,article,National,News,872,Stolen Research: Chinese Scientist Is Accused ...,"['stolen', 'research', 'chinese', 'scientist',...","['zaosong', 'zheng', 'promising', 'cancer', 'r...",-0.7096,-0.8176,2019-12-31 23:01:57+00:00
3982,"The 2010s, reviewed.",https://www.nytimes.com/2019/12/31/opinion/dec...,"The 2010s, reviewed.",2019-12-31T23:05:04+0000,article,OpEd,Op-Ed,2116,"The Extremely Online Decade of ‘Get Out,’ Fake...","['the', 'extremely', 'online', 'decade', 'get'...","['the', 'reviewed']",0.0000,-0.4767,2019-12-31 23:05:04+00:00
3983,"After receiving presidential clemency, Edward ...",https://www.nytimes.com/2019/12/31/us/navy-sea...,"After receiving presidential clemency, Edward ...",2019-12-31T23:17:09+0000,article,National,News,1285,"From the Brig to Mar-a-Lago, Former Navy SEAL ...","['from', 'brig', 'former', 'navy', 'seal', 'ca...","['after', 'receiving', 'presidential', 'clemen...",0.0000,0.4404,2019-12-31 23:17:09+00:00
