This notebook explores what the output of Gensim's TF-IDF model looks like. It uses a much smaller dataset (not one we actually care about).

In [1]:
import pandas as pd
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess

In [2]:
## load the cleaned news dataset from the working directory
news_csv_path = "News_ds_cleaned.csv"
df = pd.read_csv(news_csv_path)

In [3]:
## work on just the first 100 rows of the file
temp = df.head(n=100)

In [4]:
## non-null counts per category for every column, smallest categories first
rows_per_category = temp.groupby("category").count()
rows_per_category.sort_values(by="link")

Unnamed: 0_level_0,Unnamed: 0,link,headline,short_description,authors,date,headline_cleaned,short_description_cleaned
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
COMEDY,1,1,1,1,1,1,1,1
EDUCATION,1,1,1,1,1,1,1,1
PARENTING,1,1,1,1,1,1,1,1
TECH,2,2,2,2,2,2,2,2
WEIRD NEWS,2,2,2,2,1,2,2,2
CULTURE & ARTS,5,5,5,5,5,5,5,5
ENVIRONMENT,5,5,5,5,5,5,5,5
SPORTS,7,7,7,7,5,7,7,7
POLITICS,16,16,16,16,15,16,16,16
ENTERTAINMENT,17,17,17,17,15,17,17,17


In [5]:
## keep only the rows whose category is one we want
wanted_categories = ["U.S. NEWS", "WORLD NEWS", "ENTERTAINMENT", "POLITICS", "SPORTS"]
is_wanted = temp['category'].isin(wanted_categories)
df = temp.loc[is_wanted]

In [6]:
## NB!! reset the indices!
## Need this in order to accurately match the rows in this df with their tf-idf vectors
##
## Also drop the second column of indices ('Unnamed: 0').
## Done without inplace=True and with errors="ignore" so the cell can be re-run:
## the original in-place drop raised KeyError on a second run because the column was already gone.
df = (
    df.reset_index(drop=True)
      .drop(columns="Unnamed: 0", errors="ignore")
)

In [7]:
## quick look at the first two rows after filtering and re-indexing
df.head(n=2)

Unnamed: 0,link,headline,category,short_description,authors,date,headline_cleaned,short_description_cleaned
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,4 million american roll sleeve omicron target ...,health expert say early predict whether demand...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,american airline flyer charge ban life punch f...,subdue passenger crew flee back aircraft confr...


In [8]:
## create new column with the two cleaned cells concatenated
## A missing cleaned cell (NaN) is treated as empty text first: NaN + ' ' + str
## would make the whole row NaN, and simple_preprocess cannot tokenize a float.
headline_text = df['headline_cleaned'].fillna('')
description_text = df['short_description_cleaned'].fillna('')
df['cleaned_words'] = headline_text + ' ' + description_text

In [11]:
## give the single-word columns a "_col" suffix
## (a word from the text that is the same as a column name causes problems)
column_renames = {
    "link": "link_col",
    "headline": "headline_col",
    "category": "category_col",
    "authors": "authors_col",
    "date": "date_col",
}
df = df.rename(columns=column_renames)

In [12]:
## Train/test split: 80% train, 20% test
from sklearn.model_selection import train_test_split

## options gathered in one place; stratify is given the category column,
## random_state fixes the shuffle so the split is repeatable
split_options = dict(
    test_size=.2,
    shuffle=True,
    random_state=546,
    stratify=df['category_col'],
)
news_train, news_test = train_test_split(df, **split_options)

## TF-IDF for training data

In [None]:
import numpy as np
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [13]:
## tokenize every training document into a list of lowercase word tokens
train_tokenized = news_train['cleaned_words'].map(simple_preprocess).tolist()

## start from an empty dictionary and let it grow while each document is
## converted to its bag-of-words form (allow_update=True adds unseen words)
train_dict = corpora.Dictionary()
train_corpus = []
for tokens in train_tokenized:
    train_corpus.append(train_dict.doc2bow(tokens, allow_update=True))

In [14]:
## fit the TF-IDF model on the training bag-of-words corpus
## smartirs "ntc" selects the SMART weighting scheme — see the gensim TfidfModel docs for the letter codes
train_tfidf = TfidfModel(corpus=train_corpus, smartirs="ntc")

In [22]:
## print each training document as [word, weight] pairs, weights rounded to 2 decimals
for weighted_doc in train_tfidf[train_corpus]:
    readable = [
        [train_dict[token_id], np.around(weight, decimals=2)]
        for token_id, weight in weighted_doc
    ]
    print(readable)

[['biochemist', 0.29], ['boyfriend', 0.29], ['branch', 0.29], ['clearly', 0.29], ['corden', 0.29], ['could', 0.19], ['james', 0.24], ['kardashian', 0.29], ['kim', 0.29], ['next', 0.24], ['reality', 0.29], ['star', 0.21], ['tell', 0.19], ['whatever', 0.29], ['work', 0.16]]
[['audience', 0.21], ['award', 0.41], ['ceremony', 0.21], ['compare', 0.25], ['continue', 0.25], ['dip', 0.25], ['drop', 0.25], ['emmy', 0.25], ['lose', 0.18], ['low', 0.21], ['million', 0.16], ['program', 0.25], ['record', 0.21], ['roughly', 0.25], ['television', 0.25], ['viewer', 0.25], ['viewership', 0.25]]
[['star', 0.17], ['alexander', 0.23], ['announcement', 0.23], ['bronx', 0.23], ['cardi', 0.23], ['child', 0.2], ['donate', 0.23], ['grammy', 0.23], ['height', 0.23], ['macomb', 0.23], ['make', 0.13], ['middle', 0.23], ['morris', 0.23], ['old', 0.16], ['rapper', 0.23], ['school', 0.39], ['struck', 0.23], ['surprise', 0.23], ['win', 0.17]]
[['atomic', 0.2], ['big', 0.24], ['day', 0.13], ['early', 0.24], ['force', 

In [23]:
## apply the fitted model to the first document of the training corpus
first_doc_bow = train_corpus[0]
vector = train_tfidf[first_doc_bow]

In [24]:
## display the (token id, tf-idf weight) pairs of the first training document
vector

[(0, 0.28666861522523085),
 (1, 0.28666861522523085),
 (2, 0.28666861522523085),
 (3, 0.28666861522523085),
 (4, 0.28666861522523085),
 (5, 0.19215348159476153),
 (6, 0.2394110484099962),
 (7, 0.28666861522523085),
 (8, 0.28666861522523085),
 (9, 0.2394110484099962),
 (10, 0.28666861522523085),
 (11, 0.21176714394775942),
 (12, 0.19215348159476153),
 (13, 0.28666861522523085),
 (14, 0.16450957713252476)]

## TF-IDF for test corpus

In [25]:
## Encode the test documents with the dictionary learned from the training split.
## A separate dictionary built on the test split would give the same word a different id
## than in training, so test vectors would not line up with the training vectors.
test_tokenized = [simple_preprocess(line) for line in news_test['cleaned_words']]

## kept under its old name so later lookups through test_dict still work
test_dict = train_dict

## no allow_update here: words never seen in training are left out instead of
## being added to the vocabulary
test_corpus = [test_dict.doc2bow(line) for line in test_tokenized]

In [26]:
## fit a TF-IDF model on the test bag-of-words corpus
## NOTE(review): this learns its weights from the test documents themselves rather than
## reusing train_tfidf — confirm that is intended if these vectors are compared with
## or fed to anything built on the training vectors
test_tfidf = TfidfModel(corpus=test_corpus, smartirs="ntc")