In [25]:
import pandas as pd
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess

In [26]:
og_df = pd.read_pickle("News_ds_cleaned.pkl")

## Making a df with relevant entries

In [28]:
## create new df with only categories we want
curated_cats = ["POLITICS", "WELLNESS", "ENTERTAINMENT", "TRAVEL", "PARENTING", "STYLE & BEAUTY",
                "QUEER VOICES", "FOOD & DRINK", "BUSINESS", "SPORTS", "BLACK VOICES", "WORLD NEWS"]

df = og_df.loc[og_df['category'].isin(curated_cats)]

In [29]:
## drop entries where headline/short_description ends up null
df = df[df['short_description_cleaned'].notna()]
df = df[df['headline_cleaned'].notna()]

In [31]:
## reset the indices
## Need this in order to accurately match the rows in this df with their if-tdf vectors
df = df.reset_index(drop=True)

In [32]:
## create new column with two cleaned cells concatenated
df['cleaned_words'] = df['headline_cleaned'] + ' ' + df['short_description_cleaned']
df=df[df['cleaned_words']!=' ']

In [33]:
df.head(2)

Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned,cleaned_words
0,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,funny tweet parent week sept. 17 23,accidentally put grow toothpaste toddler tooth...,funny tweet parent week sept. 17 23 accidental...
1,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22,puerto ricans desperate water hurricane fiona ...,half million people remain without water servi...,puerto ricans desperate water hurricane fiona ...


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131019 entries, 0 to 131022
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   link                       131019 non-null  object        
 1   headline                   131019 non-null  object        
 2   category                   131019 non-null  object        
 3   short_description          131019 non-null  object        
 4   authors                    131019 non-null  object        
 5   date                       131019 non-null  datetime64[ns]
 6   headline_cleaned           131019 non-null  object        
 7   short_description_cleaned  131019 non-null  object        
 8   cleaned_words              131019 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 10.0+ MB


In [35]:
 df.groupby("category").count().sort_values(by=["link"])

Unnamed: 0_level_0,link,headline,short_description,authors,date,headline_cleaned,short_description_cleaned,cleaned_words
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
WORLD NEWS,3299,3299,3299,3299,3299,3299,3299,3299
BLACK VOICES,4583,4583,4583,4583,4583,4583,4583,4583
SPORTS,5077,5077,5077,5077,5077,5077,5077,5077
BUSINESS,5990,5990,5990,5990,5990,5990,5990,5990
FOOD & DRINK,6340,6340,6340,6340,6340,6340,6340,6340
QUEER VOICES,6344,6344,6344,6344,6344,6344,6344,6344
PARENTING,8790,8790,8790,8790,8790,8790,8790,8790
STYLE & BEAUTY,9807,9807,9807,9807,9807,9807,9807,9807
TRAVEL,9900,9900,9900,9900,9900,9900,9900,9900
ENTERTAINMENT,17361,17361,17361,17361,17361,17361,17361,17361


In [36]:
## rename single word columns in df 
## (it causes problems if one of the words from text is same as another column name)
df = df.rename(columns={"link":"link_col", 
                        "headline":"headline_col", 
                        "category":"category_col",
                        "authors":"authors_col", 
                        "date":"date_col"})

In [37]:
## Train_test_split
from sklearn.model_selection import train_test_split

news_train, news_test = train_test_split(df,
                                         random_state = 546,
                                         shuffle = True,
                                         test_size = .2,
                                         stratify = df['category_col'])

In [38]:
## reset the indices
## Need this in order to accurately match the rows in this df with their if-tdf vectors
news_train = news_train.reset_index(drop=True)
news_test = news_test.reset_index(drop=True)

In [39]:
## Split the train set again into another train/validation split
news_ttrain, news_tvalid = train_test_split(news_train,
                                         random_state = 305,
                                         shuffle = True,
                                         test_size = .2,
                                         stratify = news_train['category_col'])

In [40]:
## Save datasets to output files
news_ttrain.to_pickle('./news_ttrain.pkl')
news_tvalid.to_pickle('./news_tvalid.pkl')
news_train.to_pickle('./news_train.pkl') 
news_test.to_pickle('./news_test.pkl') 