This tests the code for TfidfVectorizer, just to make sure it works. This uses a much smaller dataset (not one we actually care about) 

In [32]:
import pandas as pd

In [57]:
df = pd.read_csv("News_ds_cleaned.csv")

In [58]:
temp = df.head(100)

In [59]:
temp.groupby("category").count().sort_values(by=["link"])

Unnamed: 0_level_0,Unnamed: 0,link,headline,short_description,authors,date,headline_cleaned,short_description_cleaned
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
COMEDY,1,1,1,1,1,1,1,1
EDUCATION,1,1,1,1,1,1,1,1
PARENTING,1,1,1,1,1,1,1,1
TECH,2,2,2,2,2,2,2,2
WEIRD NEWS,2,2,2,2,1,2,2,2
CULTURE & ARTS,5,5,5,5,5,5,5,5
ENVIRONMENT,5,5,5,5,5,5,5,5
SPORTS,7,7,7,7,5,7,7,7
POLITICS,16,16,16,16,15,16,16,16
ENTERTAINMENT,17,17,17,17,15,17,17,17


In [80]:
## create new df with only categories we want
df = temp.loc[temp['category'].isin(["U.S. NEWS", "WORLD NEWS", "ENTERTAINMENT", "POLITICS", "SPORTS"])]

In [81]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned
0,0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,4 million american roll sleeve omicron target ...,health expert say early predict whether demand...
1,1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,american airline flyer charge ban life punch f...,subdue passenger crew flee back aircraft confr...


In [68]:
## NB!! reset the indices! 
## Need this in order to accurately match the rows in this df with their if-tdf vectors
df = df.reset_index(drop=True)

## drop the second column of indices
df.drop('Unnamed: 0', axis=1, inplace=True)

In [69]:
df.head(2)

Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,4 million american roll sleeve omicron target ...,health expert say early predict whether demand...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,american airline flyer charge ban life punch f...,subdue passenger crew flee back aircraft confr...


In [70]:
## create new column with two cleaned cells concatenated
df['cleaned_words'] = df['headline_cleaned'] + ' ' + df['short_description_cleaned']

In [71]:
## rename single word columns in df 
## (it causes problems if one of the words from text is same as another column name)
df = df.rename(columns={"link":"link_col", 
                        "headline":"headline_col", 
                        "category":"category_col",
                        "authors":"authors_col", 
                        "date":"date_col"})

In [72]:
## Train_test_split
from sklearn.model_selection import train_test_split

news_train, news_test = train_test_split(df,
                                         random_state = 546,
                                         shuffle = True,
                                         test_size = .2,
                                         stratify = df['category_col'])

In [73]:
## NB!! reset the indices! 
## Need this in order to accurately match the rows in this df with their if-tdf vectors
news_train = news_train.reset_index(drop=True)
news_test = news_test.reset_index(drop=True)

## TF-IDF for training data

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
## Calcuate TF-IDF and create vectors
v = TfidfVectorizer()
x = v.fit_transform(news_train['cleaned_words'])

In [75]:
## Save the list of vectors as a df, along with the words each point corresponds to
vectors = pd.DataFrame(x.toarray(), columns=v.get_feature_names_out())


## concatenate the two dfs on the vertical axis
train_df = pd.concat([news_train, vectors], axis = 1)

In [83]:
train_df.head()

Unnamed: 0,link_col,headline_col,category_col,short_description,authors_col,date_col,headline_cleaned,short_description_cleaned,cleaned_words,000,...,worker,world,worldwide,worsen,would,writer,year,young,youtube,zaporizhzhia
0,https://www.huffpost.com/entry/kim-kardashian-...,Kim Kardashian's Next Boyfriend Could Be A Bio...,ENTERTAINMENT,The reality star told James Corden she's branc...,Ron Dicker,2022-09-15,kim kardashian next boyfriend could biochemist,reality star tell james corden branch `` clear...,kim kardashian next boyfriend could biochemist...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://www.huffpost.com/entry/2022-emmys-rati...,Emmy Awards Viewership Dips To A Record-Low As...,ENTERTAINMENT,The television awards ceremony lost roughly 1....,"David Bauder, AP",2022-09-14,emmy award viewership dip record low audience ...,television award ceremony lose roughly 1.5 mil...,emmy award viewership dip record low audience ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://www.huffpost.com/entry/cardi-b-bronx-s...,"Cardi B Donates $100,000 To Her Old Middle Sch...",ENTERTAINMENT,The Grammy-winning rapper surprised star-struc...,Marco Margaritoff,2022-09-14,"cardi b donate 100,000 old middle school bronx",grammy win rapper surprise star struck child a...,"cardi b donate 100,000 old middle school bronx...",0.214723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://www.huffpost.com/entry/russia-ukraine-...,Last Reactor At Ukraine's Zaporizhzhia Nuclear...,WORLD NEWS,"The plant, one of the 10 biggest atomic power ...","Karl Ritter, AP",2022-09-11,last reactor ukraine zaporizhzhia nuclear plan...,plant one 10 big atomic power station world oc...,last reactor ukraine zaporizhzhia nuclear plan...,0.0,...,0.0,0.162945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196038
4,https://www.huffpost.com/entry/seth-magaziner-...,Democrats Nominate Seth Magaziner In Key Rhode...,POLITICS,The state's general treasurer is slated to fac...,Daniel Marans,2022-09-14,democrat nominate seth magaziner key rhode isl...,state general treasurer slat face former crans...,democrat nominate seth magaziner key rhode isl...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
#train_df.to_csv('./news_train.csv') 

## TF-IDF for test data

In [76]:
## Calcuate TF-IDF and create vectors
v2 = TfidfVectorizer()
x2 = v2.fit_transform(news_test['cleaned_words'])

In [77]:
## Save the list of vectors as a df, along with the words each point corresponds to
vectors2 = pd.DataFrame(x2.toarray(), columns=v2.get_feature_names_out())

## concatenate the two dfs on the vertical axis
test_df = pd.concat([news_test, vectors2], axis = 1)

In [82]:
test_df.head()

Unnamed: 0,link_col,headline_col,category_col,short_description,authors_col,date_col,headline_cleaned,short_description_cleaned,cleaned_words,100k,...,weekend,whether,white,will,win,world,would,wreath,year,york
0,https://www.huffpost.com/entry/newspaper-front...,'Our Hearts Are Broken': Historic Front Pages ...,WORLD NEWS,Both British and international newspapers hono...,Kate Nicholson,2022-09-09,'our heart break historic front page mark quee...,british international newspaper honor passing ...,'our heart break historic front page mark quee...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://www.huffpost.com/entry/jackson-water-p...,Mississippi Governor Says Water Pressure Is No...,U.S. NEWS,"The city remains under a boil water notice, wh...",Marita Vlachou,2022-09-06,mississippi governor say water pressure 'solid...,city remain boil water notice first go effect ...,mississippi governor say water pressure 'solid...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://www.huffpost.com/entry/europe-britain-...,Biden Says Queen's Death Left 'Giant Hole' For...,POLITICS,"U.S. President Joe Biden, in London for the fu...","Darlene Superville, AP",2022-09-18,biden say queen death leave 'giant hole royal ...,u.s. president joe biden london funeral queen ...,biden say queen death leave 'giant hole royal ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://www.huffpost.com/entry/sept-11-anniver...,"Biden Honors 9/11 Victims, Vows Commitment To ...",POLITICS,President Joe Biden has marked the 21st annive...,"COLLEEN LONG and AAMER MADHANI, AP",2022-09-11,biden honor 9/11 victim vow commitment thwart ...,president joe biden mark 21st anniversary sept...,biden honor 9/11 victim vow commitment thwart ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188185,0.0,0.0
4,https://www.huffpost.com/entry/podcasters-offe...,Podcasters Offer $100K Reward For Information ...,ENTERTAINMENT,The Las Vegas podcasters are hoping to find ou...,Jazmin Tolliver,2022-09-15,podcasters offer 100k reward information 2pac ...,la vega podcasters hop find shoot rap icon 26 ...,podcasters offer 100k reward information 2pac ...,0.224947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196419,0.0


In [85]:
#test_df.to_csv('./news_train.csv') 