In [17]:
import pandas as pd
import gensim
from tqdm import tqdm
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
tqdm.pandas()
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/ange/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ange/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import Data

In [18]:
# Load the raw article archive (one row per article).
news_df = pd.read_csv("news_data/archive_data.csv")

# Missing text has to become an empty string *before* the cast: `astype(str)` on a
# NaN cell produces the literal string "nan", which is alphabetic and not a stop
# word, so it would otherwise come out of tokenization as a bogus "nan" lemma and
# also end up inside the concatenated per-day text.
for text_col in ('headline.main', 'snippet'):
    news_df[text_col] = news_df[text_col].fillna('').astype(str)
news_df

Unnamed: 0,abstract,web_url,snippet,pub_date,document_type,news_desk,type_of_material,word_count,headline.main
0,President Trump’s aides were delighted that ne...,https://www.nytimes.com/2019/11/30/us/politics...,President Trump’s aides were delighted that ne...,2019-12-01T00:15:08+0000,article,Washington,News,1341,A Leak-Prone White House Finally Manages to Ke...
1,The Buckeyes quarterback threw one of his four...,https://www.nytimes.com/2019/11/30/sports/ncaa...,The Buckeyes quarterback threw one of his four...,2019-12-01T00:33:57+0000,article,Sports,News,727,Justin Fields Has ‘Heisman Moment’ in Ohio Sta...
2,A personal loss has prompted Eileen Shiffrin t...,https://www.nytimes.com/2019/11/30/sports/skii...,A personal loss has prompted Eileen Shiffrin t...,2019-12-01T00:37:15+0000,article,Sports,News,762,Mikaela Shiffrin Learns a New Way to Win: With...
3,The actor and environmentalist released a stat...,https://www.nytimes.com/2019/11/30/world/ameri...,The actor and environmentalist released a stat...,2019-12-01T01:03:22+0000,article,Express,News,540,Leonardo DiCaprio Responds to Brazil’s Preside...
4,A blocked pipe caused sewage to back up severa...,https://www.nytimes.com/2019/11/30/nyregion/Qu...,A blocked pipe caused sewage to back up severa...,2019-12-01T01:47:35+0000,article,Express,News,781,Cooking Grease Down a Drain Eyed in Sewage Flo...
...,...,...,...,...,...,...,...,...,...
242015,"Anna Wintour, Condé Nast’s artistic director, ...",https://www.nytimes.com/2016/02/01/business/me...,"Anna Wintour, Condé Nast’s artistic director, ...",2016-01-31T23:09:07+0000,article,Business,News,1348,"Condé Nast Adapts to New Forces, Leaving Some ..."
242016,Microsoft sank a data center on the ocean floo...,https://www.nytimes.com/2016/02/01/technology/...,Microsoft sank a data center on the ocean floo...,2016-01-31T23:25:58+0000,article,Business,News,1223,Microsoft Plumbs Ocean’s Depths to Test Underw...
242017,"David Eisenhauer, a freshman charged with abdu...",https://www.nytimes.com/2016/02/01/us/two-virg...,"David Eisenhauer, a freshman charged with abdu...",2016-01-31T23:35:00+0000,article,National,News,459,Two Virginia Tech Students Arrested in Girl’s ...
242018,A time-capsule look at how the show as markete...,https://artsbeat.blogs.nytimes.com/2016/01/31/...,A time-capsule look at how the show as markete...,2016-01-31T23:53:20+0000,article,Culture,News,161,Theater Flashback: ‘Grease’ on Broadway


# Preprocessing
* lowercasing
* removing stop words
* keeping only purely alphabetic tokens (numbers, punctuation, and hyphenated tokens such as "E-Bikes" are dropped)
* lemmatizing

In [19]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used below; NLTK just reports "already up-to-date"
# for packages that are present locally.
for resource in ('wordnet', 'punkt', 'stopwords'):
    nltk.download(resource)

# Single lemmatizer instance shared by every tokenize_and_lemmatize call
lemmatizer = WordNetLemmatizer()

_STOP_WORDS = None  # lazily-built cache of the English stop-word set


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def tokenize_and_lemmatize(summary):
    """Turn a piece of text into a list of lowercase lemmas.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphabetic or whose lowercase form is an English stop word are
    discarded, and the remaining tokens are lowercased and lemmatized with the
    module-level ``lemmatizer``.

    Parameters
    ----------
    summary : str
        Text to process. Any non-string value is treated as "no text".

    Returns
    -------
    list of str
        Lemmas in their original order; ``[]`` for non-string input.
    """
    # Guard against NaN / None cells that reach this function from a DataFrame
    if not isinstance(summary, str):
        return []

    # The stop-word list is loop-invariant: fetch the cached set instead of
    # re-reading the corpus for every single row.
    stop_words = _english_stop_words()

    lemmas = []
    for token in nltk.word_tokenize(summary):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            lemmas.append(lemmatizer.lemmatize(lowered))
    return lemmas

# Derive a lemma-list column from each raw text column
for source_col, lemma_col in (
    ('headline.main', 'headline_lemmas'),
    ('snippet', 'snippet_lemmas'),
):
    news_df[lemma_col] = news_df[source_col].apply(tokenize_and_lemmatize)


[nltk_data] Downloading package wordnet to /Users/ange/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ange/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ange/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Parse the timestamps and keep only the calendar date (year-month-day), so that
# every article published on the same day shares one group key.
news_df['pub_date'] = pd.to_datetime(news_df['pub_date']).dt.date


def _flatten(token_lists):
    """Concatenate an iterable of token lists into one flat list.

    Replaces ``sum(lists, [])``, which copies the growing accumulator on every
    addition and is therefore quadratic in the number of tokens per group.
    """
    return [token for tokens in token_lists for token in tokens]


# One row per publication date: text columns are joined with a space, lemma
# columns are flattened into a single list per day.
news = news_df.groupby('pub_date').agg({
    'headline.main': lambda x: ' '.join(x),  # Concatenate all headlines of the day
    'snippet': lambda x: ' '.join(x),        # Concatenate all snippets of the day
    'headline_lemmas': _flatten,             # Concatenate all headline lemmas of the day
    'snippet_lemmas': _flatten,              # Concatenate all snippet lemmas of the day
}).reset_index()

In [31]:
# `news` has already been built by the groupby/agg in the preceding cell; this cell
# used to repeat that identical aggregation, which only redid the same work.
# Check the invariants of the aggregated frame instead of recomputing it.
assert news['pub_date'].is_unique, "expected exactly one aggregated row per publication date"
assert {'headline.main', 'snippet', 'headline_lemmas', 'snippet_lemmas'} <= set(news.columns), \
    "aggregated frame is missing expected columns"

In [33]:
# The grouping key was a plain `date` object; convert it back to a datetime dtype
# so the `.dt` accessor is available.
news['pub_date'] = pd.to_datetime(news['pub_date'])

# Keep only days whose year lies in 2016..2019 (both ends inclusive)
pub_year = news['pub_date'].dt.year
in_year_range = (pub_year >= 2016) & (pub_year <= 2019)
news = news[in_year_range]

# Export

In [34]:
# Show the per-day aggregated frame (pandas abbreviates the middle rows in the rendering)
news

Unnamed: 0,pub_date,headline.main,snippet,headline_lemmas,snippet_lemmas
1,2016-01-01,No More Statutes of Limitations for Rape Fredd...,Bill Cosby came close to escaping sexual assau...,"[statute, limitation, rape, freddie, gray, com...","[bill, cosby, came, close, escaping, sexual, a..."
2,2016-01-02,Wishes for the New Year A Smarter Plan to Make...,"From Times Square to the web, we asked people ...","[wish, new, year, smarter, plan, make, retirem...","[time, square, web, asked, people, tell, u, wi..."
3,2016-01-03,"For ‘Game of Thrones’ Readers, New HBO Season ...",The author George R.R. Martin said he had miss...,"[game, throne, reader, new, hbo, season, may, ...","[author, george, martin, said, missed, deadlin..."
4,2016-01-04,"Auto Sales, the Fed’s December Meeting and Emp...","This week, automakers are expected to post big...","[auto, sale, fed, december, meeting, employmen...","[week, automaker, expected, post, big, gain, f..."
5,2016-01-05,Saudi Arabia’s Dangerous Sectarian Game G.M. C...,The Saudi royals think that stoking hatred of ...,"[saudi, arabia, dangerous, sectarian, game, ch...","[saudi, royal, think, stoking, hatred, shiite,..."
...,...,...,...,...,...
1456,2019-12-27,E-Bikes and E-Scooter Rentals Won’t Be Allowed...,Gov. Andrew Cuomo vetoed a bill that would hav...,"[rental, allowed, anytime, soon, william, grei...","[gov, andrew, cuomo, vetoed, bill, would, lega..."
1457,2019-12-28,The Patriarchy of Alcoholics Anonymous Arthur ...,Women who drink too much need help. But we don...,"[patriarchy, alcoholic, anonymous, arthur, sin...","[woman, drink, much, need, help, need, give, p..."
1458,2019-12-29,"Fred P. Graham, Legal Affairs Reporter and Cou...",Mr. Graham covered the Supreme Court for The N...,"[fred, graham, legal, affair, reporter, court,...","[graham, covered, supreme, court, new, york, t..."
1459,2019-12-30,Bill Barr Thinks America Is Going to Hell John...,And he’s on a mission to use the “authority” o...,"[bill, barr, think, america, going, hell, john...","[mission, use, authority, executive, branch, s..."


In [35]:
# Preview the first rows of the per-day aggregated frame
news.head()

Unnamed: 0,pub_date,headline.main,snippet,headline_lemmas,snippet_lemmas
1,2016-01-01,No More Statutes of Limitations for Rape Fredd...,Bill Cosby came close to escaping sexual assau...,"[statute, limitation, rape, freddie, gray, com...","[bill, cosby, came, close, escaping, sexual, a..."
2,2016-01-02,Wishes for the New Year A Smarter Plan to Make...,"From Times Square to the web, we asked people ...","[wish, new, year, smarter, plan, make, retirem...","[time, square, web, asked, people, tell, u, wi..."
3,2016-01-03,"For ‘Game of Thrones’ Readers, New HBO Season ...",The author George R.R. Martin said he had miss...,"[game, throne, reader, new, hbo, season, may, ...","[author, george, martin, said, missed, deadlin..."
4,2016-01-04,"Auto Sales, the Fed’s December Meeting and Emp...","This week, automakers are expected to post big...","[auto, sale, fed, december, meeting, employmen...","[week, automaker, expected, post, big, gain, f..."
5,2016-01-05,Saudi Arabia’s Dangerous Sectarian Game G.M. C...,The Saudi royals think that stoking hatred of ...,"[saudi, arabia, dangerous, sectarian, game, ch...","[saudi, royal, think, stoking, hatred, shiite,..."


In [36]:
# Summarise row count, dtypes and non-null counts of the aggregated frame
news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pub_date         1460 non-null   datetime64[ns]
 1   headline.main    1460 non-null   object        
 2   snippet          1460 non-null   object        
 3   headline_lemmas  1460 non-null   object        
 4   snippet_lemmas   1460 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 68.4+ KB


In [37]:
# List the columns of the aggregated frame
news.columns

Index(['pub_date', 'headline.main', 'snippet', 'headline_lemmas',
       'snippet_lemmas'],
      dtype='object')

In [38]:
# Write the per-day frame to disk without the index column.
# NOTE(review): the two *_lemmas columns hold Python lists, so they are written out
# as text; whoever reads this CSV back presumably has to parse them into lists again.
preprocessed_path = "news_data/news_data_preprocessed.csv"
news.to_csv(preprocessed_path, index=False)