In [1]:
import pandas as pd
import gensim
from tqdm import tqdm
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
tqdm.pandas()
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/ange/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ange/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import Data

In [4]:
news_df = pd.read_csv("news_data/archive_data.csv")
news_df['headline.main'] = news_df['headline.main'].astype(str)
news_df['snippet'] = news_df['snippet'].astype(str)
news_df

Unnamed: 0,abstract,web_url,snippet,pub_date,document_type,news_desk,type_of_material,word_count,headline.main
0,President Trump’s aides were delighted that ne...,https://www.nytimes.com/2019/11/30/us/politics...,President Trump’s aides were delighted that ne...,2019-12-01T00:15:08+0000,article,Washington,News,1341,A Leak-Prone White House Finally Manages to Ke...
1,The Buckeyes quarterback threw one of his four...,https://www.nytimes.com/2019/11/30/sports/ncaa...,The Buckeyes quarterback threw one of his four...,2019-12-01T00:33:57+0000,article,Sports,News,727,Justin Fields Has ‘Heisman Moment’ in Ohio Sta...
2,A personal loss has prompted Eileen Shiffrin t...,https://www.nytimes.com/2019/11/30/sports/skii...,A personal loss has prompted Eileen Shiffrin t...,2019-12-01T00:37:15+0000,article,Sports,News,762,Mikaela Shiffrin Learns a New Way to Win: With...
3,The actor and environmentalist released a stat...,https://www.nytimes.com/2019/11/30/world/ameri...,The actor and environmentalist released a stat...,2019-12-01T01:03:22+0000,article,Express,News,540,Leonardo DiCaprio Responds to Brazil’s Preside...
4,A blocked pipe caused sewage to back up severa...,https://www.nytimes.com/2019/11/30/nyregion/Qu...,A blocked pipe caused sewage to back up severa...,2019-12-01T01:47:35+0000,article,Express,News,781,Cooking Grease Down a Drain Eyed in Sewage Flo...
...,...,...,...,...,...,...,...,...,...
242015,"Anna Wintour, Condé Nast’s artistic director, ...",https://www.nytimes.com/2016/02/01/business/me...,"Anna Wintour, Condé Nast’s artistic director, ...",2016-01-31T23:09:07+0000,article,Business,News,1348,"Condé Nast Adapts to New Forces, Leaving Some ..."
242016,Microsoft sank a data center on the ocean floo...,https://www.nytimes.com/2016/02/01/technology/...,Microsoft sank a data center on the ocean floo...,2016-01-31T23:25:58+0000,article,Business,News,1223,Microsoft Plumbs Ocean’s Depths to Test Underw...
242017,"David Eisenhauer, a freshman charged with abdu...",https://www.nytimes.com/2016/02/01/us/two-virg...,"David Eisenhauer, a freshman charged with abdu...",2016-01-31T23:35:00+0000,article,National,News,459,Two Virginia Tech Students Arrested in Girl’s ...
242018,A time-capsule look at how the show as markete...,https://artsbeat.blogs.nytimes.com/2016/01/31/...,A time-capsule look at how the show as markete...,2016-01-31T23:53:20+0000,article,Culture,News,161,Theater Flashback: ‘Grease’ on Broadway


# Preprocessing
* lower casing
* removing all the stop words
* only considering the alphabetic characters
* lemmatizing

In [5]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(summary):
    stop = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(summary)
    # Apply lowercasing and stop word removal from PS2
    tokens = [token.lower() for token in tokens if token.isalpha() and token not in stop]
    # Lemmatize the tokens
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmas

# nltk.download('wordnet')

news_df['headline_lemmas'] = news_df['headline.main'].apply(tokenize_and_lemmatize)
news_df['snippet_lemmas'] = news_df['snippet'].apply(tokenize_and_lemmatize)

# Export

In [9]:
news_df

Unnamed: 0,abstract,web_url,snippet,pub_date,document_type,news_desk,type_of_material,word_count,headline.main,headline_lemmas,snippet_lemmas
0,President Trump’s aides were delighted that ne...,https://www.nytimes.com/2019/11/30/us/politics...,President Trump’s aides were delighted that ne...,2019-12-01T00:15:08+0000,article,Washington,News,1341,A Leak-Prone White House Finally Manages to Ke...,"[a, white, house, finally, manages, keep, secret]","[president, trump, aide, delighted, news, than..."
1,The Buckeyes quarterback threw one of his four...,https://www.nytimes.com/2019/11/30/sports/ncaa...,The Buckeyes quarterback threw one of his four...,2019-12-01T00:33:57+0000,article,Sports,News,727,Justin Fields Has ‘Heisman Moment’ in Ohio Sta...,"[justin, field, ha, heisman, moment, ohio, sta...","[the, buckeye, quarterback, threw, one, four, ..."
2,A personal loss has prompted Eileen Shiffrin t...,https://www.nytimes.com/2019/11/30/sports/skii...,A personal loss has prompted Eileen Shiffrin t...,2019-12-01T00:37:15+0000,article,Sports,News,762,Mikaela Shiffrin Learns a New Way to Win: With...,"[mikaela, shiffrin, learns, new, way, win, wit...","[a, personal, loss, prompted, eileen, shiffrin..."
3,The actor and environmentalist released a stat...,https://www.nytimes.com/2019/11/30/world/ameri...,The actor and environmentalist released a stat...,2019-12-01T01:03:22+0000,article,Express,News,540,Leonardo DiCaprio Responds to Brazil’s Preside...,"[leonardo, dicaprio, responds, brazil, preside...","[the, actor, environmentalist, released, state..."
4,A blocked pipe caused sewage to back up severa...,https://www.nytimes.com/2019/11/30/nyregion/Qu...,A blocked pipe caused sewage to back up severa...,2019-12-01T01:47:35+0000,article,Express,News,781,Cooking Grease Down a Drain Eyed in Sewage Flo...,"[cooking, grease, down, drain, eyed, sewage, f...","[a, blocked, pipe, caused, sewage, back, sever..."
...,...,...,...,...,...,...,...,...,...,...,...
242015,"Anna Wintour, Condé Nast’s artistic director, ...",https://www.nytimes.com/2016/02/01/business/me...,"Anna Wintour, Condé Nast’s artistic director, ...",2016-01-31T23:09:07+0000,article,Business,News,1348,"Condé Nast Adapts to New Forces, Leaving Some ...","[condé, nast, adapts, new, force, leaving, som...","[anna, wintour, condé, nast, artistic, directo..."
242016,Microsoft sank a data center on the ocean floo...,https://www.nytimes.com/2016/02/01/technology/...,Microsoft sank a data center on the ocean floo...,2016-01-31T23:25:58+0000,article,Business,News,1223,Microsoft Plumbs Ocean’s Depths to Test Underw...,"[microsoft, plumb, ocean, depth, test, underwa...","[microsoft, sank, data, center, ocean, floor, ..."
242017,"David Eisenhauer, a freshman charged with abdu...",https://www.nytimes.com/2016/02/01/us/two-virg...,"David Eisenhauer, a freshman charged with abdu...",2016-01-31T23:35:00+0000,article,National,News,459,Two Virginia Tech Students Arrested in Girl’s ...,"[two, virginia, tech, student, arrested, girl,...","[david, eisenhauer, freshman, charged, abducti..."
242018,A time-capsule look at how the show as markete...,https://artsbeat.blogs.nytimes.com/2016/01/31/...,A time-capsule look at how the show as markete...,2016-01-31T23:53:20+0000,article,Culture,News,161,Theater Flashback: ‘Grease’ on Broadway,"[theater, flashback, grease, broadway]","[a, look, show, marketed, first, opened, broad..."


In [10]:
news_df.head()

Unnamed: 0,abstract,web_url,snippet,pub_date,document_type,news_desk,type_of_material,word_count,headline.main,headline_lemmas,snippet_lemmas
0,President Trump’s aides were delighted that ne...,https://www.nytimes.com/2019/11/30/us/politics...,President Trump’s aides were delighted that ne...,2019-12-01T00:15:08+0000,article,Washington,News,1341,A Leak-Prone White House Finally Manages to Ke...,"[a, white, house, finally, manages, keep, secret]","[president, trump, aide, delighted, news, than..."
1,The Buckeyes quarterback threw one of his four...,https://www.nytimes.com/2019/11/30/sports/ncaa...,The Buckeyes quarterback threw one of his four...,2019-12-01T00:33:57+0000,article,Sports,News,727,Justin Fields Has ‘Heisman Moment’ in Ohio Sta...,"[justin, field, ha, heisman, moment, ohio, sta...","[the, buckeye, quarterback, threw, one, four, ..."
2,A personal loss has prompted Eileen Shiffrin t...,https://www.nytimes.com/2019/11/30/sports/skii...,A personal loss has prompted Eileen Shiffrin t...,2019-12-01T00:37:15+0000,article,Sports,News,762,Mikaela Shiffrin Learns a New Way to Win: With...,"[mikaela, shiffrin, learns, new, way, win, wit...","[a, personal, loss, prompted, eileen, shiffrin..."
3,The actor and environmentalist released a stat...,https://www.nytimes.com/2019/11/30/world/ameri...,The actor and environmentalist released a stat...,2019-12-01T01:03:22+0000,article,Express,News,540,Leonardo DiCaprio Responds to Brazil’s Preside...,"[leonardo, dicaprio, responds, brazil, preside...","[the, actor, environmentalist, released, state..."
4,A blocked pipe caused sewage to back up severa...,https://www.nytimes.com/2019/11/30/nyregion/Qu...,A blocked pipe caused sewage to back up severa...,2019-12-01T01:47:35+0000,article,Express,News,781,Cooking Grease Down a Drain Eyed in Sewage Flo...,"[cooking, grease, down, drain, eyed, sewage, f...","[a, blocked, pipe, caused, sewage, back, sever..."


In [11]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242020 entries, 0 to 242019
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   abstract          241962 non-null  object
 1   web_url           242020 non-null  object
 2   snippet           242020 non-null  object
 3   pub_date          242020 non-null  object
 4   document_type     242020 non-null  object
 5   news_desk         233939 non-null  object
 6   type_of_material  239295 non-null  object
 7   word_count        242020 non-null  int64 
 8   headline.main     242020 non-null  object
 9   headline_lemmas   242020 non-null  object
 10  snippet_lemmas    242020 non-null  object
dtypes: int64(1), object(10)
memory usage: 20.3+ MB


In [12]:
news_df.columns

Index(['abstract', 'web_url', 'snippet', 'pub_date', 'document_type',
       'news_desk', 'type_of_material', 'word_count', 'headline.main',
       'headline_lemmas', 'snippet_lemmas'],
      dtype='object')

In [13]:
news_df.to_csv("news_data/news_data_preprocessed.csv", index=False)