In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [39]:
og_df = pd.read_csv("News_ds_cleaned.csv")

## Making a df with relevant entries

In [3]:
## Conjoin categories which are essentially the same, just different names; specifically
## "PARENTS" and "PARENTING"  |  "WELLNESS" and "HEALTY LIVING"
# og_df.loc[og_df.category == "PARENTS", 'category'] = "PARENTING"
# og_df.loc[og_df.category == "HEALTHY LIVING", 'category'] = "WELLNESS"

In [88]:
## create new df with only categories we want
curated_cats = ["POLITICS", "WELLNESS", "ENTERTAINMENT"] # "TRAVEL", "PARENTING", , "STYLE & BEAUTY"
                #"QUEER VOICES", "FOOD & DRINK", "BUSINESS", # "SPORTS", "BLACK VOICES", "WORLD NEWS"]

df = og_df.loc[og_df['category'].isin(curated_cats)]

In [89]:
## drop entries where headline/short_description ends up null
df = df[df['short_description_cleaned'].notna()]
df = df[df['headline_cleaned'].notna()]

In [90]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned
20,20,https://www.huffpost.com/entry/golden-globes-r...,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,"For the past 18 months, Hollywood has effectiv...",,2022-09-20,golden globe return nbc january year air,past 18 month hollywood effectively boycott gl...
21,21,https://www.huffpost.com/entry/biden-us-forces...,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19,biden say u.s force would defend taiwan china ...,president issue vow tension china rise


In [91]:
## reset the indices
## Need this in order to accurately match the rows in this df with their if-tdf vectors
df = df.reset_index(drop=True)

## drop the second column of indices
df.drop('Unnamed: 0', axis=1, inplace=True)

In [92]:
## create new column with two cleaned cells concatenated
df['cleaned_words'] = df['headline_cleaned'] + ' ' + df['short_description_cleaned']

In [93]:
df.head(2)

Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned,cleaned_words
0,https://www.huffpost.com/entry/golden-globes-r...,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,"For the past 18 months, Hollywood has effectiv...",,2022-09-20,golden globe return nbc january year air,past 18 month hollywood effectively boycott gl...,golden globe return nbc january year air past ...
1,https://www.huffpost.com/entry/biden-us-forces...,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19,biden say u.s force would defend taiwan china ...,president issue vow tension china rise,biden say u.s force would defend taiwan china ...


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65078 entries, 0 to 65077
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   link                       65078 non-null  object
 1   headline                   65078 non-null  object
 2   category                   65078 non-null  object
 3   short_description          65078 non-null  object
 4   authors                    56031 non-null  object
 5   date                       65078 non-null  object
 6   headline_cleaned           65078 non-null  object
 7   short_description_cleaned  65078 non-null  object
 8   cleaned_words              65078 non-null  object
dtypes: object(9)
memory usage: 4.5+ MB


In [95]:
 df.groupby("category").count().sort_values(by=["link"])

Unnamed: 0_level_0,link,headline,short_description,authors,date,headline_cleaned,short_description_cleaned,cleaned_words
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENTERTAINMENT,14754,14754,14754,13443,14754,14754,14754,14754
WELLNESS,17922,17922,17922,12927,17922,17922,17922,17922
POLITICS,32402,32402,32402,29661,32402,32402,32402,32402


## Calculating TF-IDF

In [102]:
## Calcuate TF-IDF and create vectors
v = TfidfVectorizer(min_df=8)
x = v.fit_transform(df['cleaned_words'])

In [103]:
## Save the list of vectors as a df, along with the words each point corresponds to
df_vectors = pd.DataFrame(x.toarray(), columns=v.get_feature_names_out())

## rename columns in original df 
## (it causes problems if one of the words from text is same as another column name)
df = df.rename(columns={"link":"link_col", 
                        "headline":"headline_col", 
                        "category":"category_col",
                        "authors":"authors_col", 
                        "date":"date_col"})

## concatenate the two dfs on the vertical axis
whole_df = pd.concat([df, df_vectors], axis = 1)

MemoryError: Unable to allocate 5.11 GiB for an array with shape (65078, 10536) and data type float64

## Do train_test_split, save results to csv

In [None]:
## Make the split
news_train, news_test = train_test_split(whole_df,
                                            random_state = 546,
                                            shuffle = True,
                                            test_size = .15,
                                            stratify = whole_df['category_col'])

In [None]:
## save to csv
news_train.to_csv('./news_train.csv') 
news_test.to_csv('./news_test.csv') 