In [1]:
import pandas as pd

In [2]:
## Load the cleaned news dataset from the working directory
df = pd.read_csv(filepath_or_buffer="News_ds_cleaned.csv")

In [3]:
## Work on the first 100 rows only
temp = df.iloc[:100]

In [4]:
## Non-null counts per category for every column, ordered by the 'link' count (ascending)
temp.groupby(by="category").count().sort_values("link")

Unnamed: 0_level_0,Unnamed: 0,link,headline,short_description,authors,date,headline_cleaned,short_description_cleaned
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
COMEDY,1,1,1,1,1,1,1,1
EDUCATION,1,1,1,1,1,1,1,1
PARENTING,1,1,1,1,1,1,1,1
TECH,2,2,2,2,2,2,2,2
WEIRD NEWS,2,2,2,2,1,2,2,2
CULTURE & ARTS,5,5,5,5,5,5,5,5
ENVIRONMENT,5,5,5,5,5,5,5,5
SPORTS,7,7,7,7,5,7,7,7
POLITICS,16,16,16,16,15,16,16,16
ENTERTAINMENT,17,17,17,17,15,17,17,17


In [10]:
## Keep only the rows of `temp` whose category is one of the five we want.
## NOTE(review): this rebinds `df` (previously the full CSV) to a subset of the 100-row sample.
df = temp[
    temp['category'].isin(["U.S. NEWS", "WORLD NEWS", "ENTERTAINMENT", "POLITICS", "SPORTS"])
]

In [14]:
## NB!! reset the indices! 
## Need this in order to accurately match the rows in this df with their TF-IDF vectors
## (the vectors are later joined to df column-wise, which pairs rows by index label).
##
## Also drop the leftover 'Unnamed: 0' index column that came from the CSV.
## The drop is done by reassignment with errors='ignore' so this cell can be
## re-run safely: an in-place drop raises KeyError once the column is gone.
df = (
    df
    .reset_index(drop=True)
    .drop(columns='Unnamed: 0', errors='ignore')
)

In [23]:
## Peek at the first two rows.
## NOTE(review): the saved output of this cell already shows the renamed '*_col' columns
## and 'cleaned_words', which are only created by cells further down — it was last run out of order.
df.iloc[:2]

Unnamed: 0,link_col,headline_col,category_col,short_description,authors_col,date_col,headline_cleaned,short_description_cleaned,cleaned_words
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...,Over 4 Million Americans Roll Up Sleeves For O...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...,"American Airlines Flyer Charged, Banned For Li..."


In [17]:
## create new column with two cleaned cells concatenated
## Missing text is treated as '' first: str + NaN gives NaN, and a NaN document
## makes TfidfVectorizer raise instead of vectorizing the row.
df['cleaned_words'] = (
    df['headline_cleaned'].fillna('')
    + ' '
    + df['short_description_cleaned'].fillna('')
)

## TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
## Calculate TF-IDF and create vectors
## v: TfidfVectorizer with default settings, fitted on df['cleaned_words']
## x: the resulting TF-IDF matrix, one row per row of df (same order)
## NOTE(review): the vectorizer is fitted on all rows before the train/test split further down,
## so vocabulary and IDF weights are learned from rows that end up in the test set — confirm this is intended.
v = TfidfVectorizer()
x = v.fit_transform(df['cleaned_words'])

In [21]:
## Save the list of vectors as a df, along with the words each point corresponds to.
## The frame reuses df's index: the concat below pairs rows by index label, so this
## keeps every row next to its own vector even if df's index is not 0..n-1.
df_vectors = pd.DataFrame(
    x.toarray(),
    columns=v.get_feature_names_out(),
    index=df.index,
)

## rename columns in original df 
## (it causes problems if one of the words from text is same as another column name)
df = df.rename(columns={"link":"link_col", 
                        "headline":"headline_col", 
                        "category":"category_col",
                        "authors":"authors_col", 
                        "date":"date_col"})

## concatenate the two dfs side by side (axis=1 appends the TF-IDF columns to each row)
whole_df = pd.concat([df, df_vectors], axis = 1)

In [24]:
## Preview the first five rows of the combined metadata + TF-IDF frame
whole_df.head(5)

Unnamed: 0,link_col,headline_col,category_col,short_description,authors_col,date_col,headline_cleaned,short_description_cleaned,cleaned_words,000,...,would,wreath,writer,year,years,york,you,youngest,youtube,zaporizhzhia
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...,Over 4 Million Americans Roll Up Sleeves For O...,0.0,...,0.152814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...,"American Airlines Flyer Charged, Banned For Li...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,Woman Who Called Cops On Black Bird-Watcher Lo...,Amy Cooper accused investment firm Franklin Te...,Woman Who Called Cops On Black Bird-Watcher Lo...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://www.huffpost.com/entry/belk-worker-fou...,Cleaner Was Dead In Belk Bathroom For 4 Days B...,U.S. NEWS,The 63-year-old woman was seen working at the ...,,2022-09-22,Cleaner Was Dead In Belk Bathroom For 4 Days B...,The 63-year-old woman was seen working at the ...,Cleaner Was Dead In Belk Bathroom For 4 Days B...,0.0,...,0.0,0.0,0.0,0.124343,0.0,0.0,0.0,0.0,0.0,0.0
4,https://www.huffpost.com/entry/reporter-gets-a...,Reporter Gets Adorable Surprise From Her Boyfr...,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor...",Elyse Wanshel,2022-09-22,Reporter Gets Adorable Surprise From Her Boyfr...,"""Who's that behind you?"" an anchor for New Yor...",Reporter Gets Adorable Surprise From Her Boyfr...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.187868,0.205451,0.0,0.0,0.0


## train_test_split 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
## Split whole_df 80/20 into train and test, shuffled with a fixed seed
## and stratified on the category column
news_train, news_test = train_test_split(
    whole_df,
    test_size=0.2,
    shuffle=True,
    stratify=whole_df['category_col'],
    random_state=546,
)

In [None]:
## Write both splits next to the notebook as CSV (the row index is written as the first column)
news_train.to_csv(path_or_buf='./news_train.csv')
news_test.to_csv(path_or_buf='./news_test.csv')