In [23]:
import pandas as pd
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess

In [2]:
## Load the pre-cleaned news dataset from the working directory
og_df = pd.read_csv(filepath_or_buffer="News_ds_cleaned.csv")

## Making a df with relevant entries

In [23]:
## Merge categories that are essentially the same but carry different names; specifically
## "PARENTS" and "PARENTING"  |  "WELLNESS" and "HEALTHY LIVING"
# og_df.loc[og_df.category == "PARENTS", 'category'] = "PARENTING"
# og_df.loc[og_df.category == "HEALTHY LIVING", 'category'] = "WELLNESS"

In [5]:
## Keep only the articles whose category is one of the twelve we want to model
curated_cats = [
    "POLITICS", "WELLNESS", "ENTERTAINMENT", "TRAVEL",
    "PARENTING", "STYLE & BEAUTY", "QUEER VOICES", "FOOD & DRINK",
    "BUSINESS", "SPORTS", "BLACK VOICES", "WORLD NEWS",
]

df = og_df[og_df['category'].isin(curated_cats)]

In [6]:
## discard rows whose cleaned short description or cleaned headline is missing
df = df.dropna(subset=['short_description_cleaned', 'headline_cleaned'])

In [7]:
## peek at the first two rows of the filtered frame
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned
3,3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,funny tweet parent week sept. 17 23,`` accidentally put grow toothpaste toddler to...
7,7,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22,puerto ricans desperate water hurricane fiona ...,half million people remain without water servi...


In [8]:
## NOTE: the row index is intentionally NOT reset here; the train/test frames are
## re-indexed after the split so that their rows line up with their TF-IDF vectors
#df = df.reset_index(drop=True)

## drop the second column of indices
## (re-assign instead of mutating in place, and tolerate an already-missing column,
##  so that re-running this cell does not raise a KeyError)
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
## build one text field per article: cleaned headline, a single space, cleaned short description
df['cleaned_words'] = df['headline_cleaned'].str.cat(df['short_description_cleaned'], sep=' ')

In [10]:
## peek at the first two rows, now including the combined cleaned_words column
df.head(n=2)

Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned,cleaned_words
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,funny tweet parent week sept. 17 23,`` accidentally put grow toothpaste toddler to...,funny tweet parent week sept. 17 23 `` acciden...
7,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22,puerto ricans desperate water hurricane fiona ...,half million people remain without water servi...,puerto ricans desperate water hurricane fiona ...


In [11]:
## summarise the column dtypes and non-null counts of the curated frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122005 entries, 3 to 209485
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   link                       122005 non-null  object
 1   headline                   122005 non-null  object
 2   category                   122005 non-null  object
 3   short_description          122005 non-null  object
 4   authors                    101022 non-null  object
 5   date                       122005 non-null  object
 6   headline_cleaned           122005 non-null  object
 7   short_description_cleaned  122005 non-null  object
 8   cleaned_words              122005 non-null  object
dtypes: object(9)
memory usage: 9.3+ MB


In [12]:
 # non-null count of every column per category, ordered by ascending number of links
 df.groupby("category").count().sort_values(by=["link"])

Unnamed: 0_level_0,link,headline,short_description,authors,date,headline_cleaned,short_description_cleaned,cleaned_words
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
WORLD NEWS,3297,3297,3297,2678,3297,3297,3297,3297
BLACK VOICES,4176,4176,4176,3312,4176,4176,4176,4176
SPORTS,4406,4406,4406,3601,4406,4406,4406,4406
BUSINESS,5130,5130,5130,4372,5130,5130,5130,5130
QUEER VOICES,5596,5596,5596,4695,5596,5596,5596,5596
FOOD & DRINK,6329,6329,6329,4526,6329,6329,6329,6329
PARENTING,8781,8781,8781,6512,8781,8781,8781,8781
TRAVEL,9419,9419,9419,8022,9419,9419,9419,9419
STYLE & BEAUTY,9793,9793,9793,7273,9793,9793,9793,9793
ENTERTAINMENT,14754,14754,14754,13443,14754,14754,14754,14754


In [13]:
## Give every single-word column a "_col" suffix
## (a column name that is identical to a word from the text causes problems later on)
df = df.rename(columns={
    name: name + "_col"
    for name in ("link", "headline", "category", "authors", "date")
})

In [14]:
## Shuffle and hold out 20% of the articles as a test set, stratified on category_col;
## the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split

news_train, news_test = train_test_split(
    df,
    test_size=.2,
    stratify=df['category_col'],
    shuffle=True,
    random_state=546,
)

In [15]:
## Renumber the rows of both splits from 0
## (needed so that row i of each frame can be matched with its i-th TF-IDF vector)
news_train, news_test = (
    split.reset_index(drop=True) for split in (news_train, news_test)
)

In [None]:
## save the train/test splits to csv
## (the frames are named news_train / news_test; train_df / test_df were never defined
##  in this notebook, so the previous version of this cell raised a NameError)
news_train.to_csv('./news_train.csv')
news_test.to_csv('./news_test.csv')

## TF-IDF for training data

In [20]:
import numpy as np
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [24]:
## Tokenize every training document, then grow the vocabulary while encoding each
## document as a bag-of-words (allow_update=True adds unseen tokens to train_dict)
train_tokenized = news_train['cleaned_words'].map(simple_preprocess).tolist()
train_dict = corpora.Dictionary()
train_corpus = [
    train_dict.doc2bow(tokens, allow_update=True)
    for tokens in train_tokenized
]

In [25]:
## Fit the TF-IDF model on the training corpus only, using the SMART scheme 'ntc'
train_tfidf = TfidfModel(corpus=train_corpus, smartirs="ntc")

In [26]:
 # apply the fitted TF-IDF model to the bag-of-words of the first training document
vector = train_tfidf[train_corpus[0]] 

# the other documents in the corpus are transformed the same way

In [27]:
## show the (token id, TF-IDF weight) pairs computed for the first training document
vector

[(0, 0.21744169315184333),
 (1, 0.3618448148680886),
 (2, 0.21802640014757815),
 (3, 0.2238970949090863),
 (4, 0.19056966589632973),
 (5, 0.25140181801266853),
 (6, 0.13174885342746356),
 (7, 0.26769462667343497),
 (8, 0.17802583109395193),
 (9, 0.17648047015395238),
 (10, 0.27287366056311535),
 (11, 0.3298001145213803),
 (12, 0.2366594716946781),
 (13, 0.28469782194419974),
 (14, 0.3904573166949825)]

## TF-IDF for test data


In [29]:
## Tokenize the test documents exactly as the training documents were tokenized
test_tokenized = [simple_preprocess(line) for line in news_test['cleaned_words']]

## Vocabulary of the test set on its own. Kept only so the name stays defined;
## its token ids are NOT compatible with train_tfidf and must not be used with it.
test_dict = corpora.Dictionary(test_tokenized)

## Encode the test documents with the TRAINING dictionary, without updating it, so every
## token id means the same word that train_tfidf learned its IDF weight for.
## Words that never occurred in the training data have no id and are left out.
test_corpus = [train_dict.doc2bow(line) for line in test_tokenized]

In [28]:
## Need to do TF-IDF for the test data, BUT we don't want to use an IDF computed from the test data - the relevant IDF is the training IDF