## TF-IDF for training data

In [1]:
import numpy as np
import pandas as pd
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim import corpora

In [2]:
# Load the training split; later cells read its 'cleaned_words' column.
train_csv_path = "news_train.csv"
news_train = pd.read_csv(train_csv_path)

In [3]:
# Tokenize every training document.
train_tokenized = list(map(simple_preprocess, news_train['cleaned_words']))

# Start from an empty dictionary and let doc2bow grow it (allow_update=True)
# while each document is converted to its bag-of-words form.
train_dict = corpora.Dictionary()
train_corpus = [
    train_dict.doc2bow(tokens, allow_update=True)
    for tokens in train_tokenized
]

In [4]:
# Fit TF-IDF on the training corpus.
# SMART scheme 'ntc': raw term frequency, idf document weighting, cosine normalisation.
train_tfidf = TfidfModel(corpus=train_corpus, smartirs="ntc")

In [10]:
 # apply model to the first corpus document
first_doc_bow = train_corpus[0]
vector = train_tfidf[first_doc_bow]

# Any other document is weighted the same way: train_tfidf[train_corpus[k]]

In [9]:
# Show the TF-IDF weights of the first training document as (token_id, weight) pairs.
vector

[(0, 0.3899757375336991),
 (1, 0.45716592324545857),
 (2, 0.14915494661219156),
 (3, 0.3265268032796511),
 (4, 0.2140317550860369),
 (5, 0.2959838858806),
 (6, 0.1991798419941692),
 (7, 0.13450994338932348),
 (8, 0.16011292944815195),
 (9, 0.4259774812878671),
 (10, 0.2145638580082504),
 (11, 0.2563963538770435)]

In [13]:
from gensim.matutils import corpus2dense

# Inspect the fitted model, then document 1 as raw counts and as TF-IDF weights.
print(train_tfidf)
print(train_corpus[1])
print(train_tfidf[train_corpus[1]])

# corpus2dense needs the number of distinct terms (one row per token id before
# the transpose). That is the vocabulary size, len(train_dict) -- NOT
# train_tfidf.num_nnz, which counts the non-zero entries of the whole corpus and
# would make the matrix far wider than the vocabulary, with the surplus columns
# all zero.
dense_vec = corpus2dense(
    train_tfidf[train_corpus[1:5]],
    num_terms=len(train_dict),
).transpose()  # -> one row per document, one column per term
dense_vec
# NOTE: for the full corpus a sparse representation (gensim.matutils.corpus2csc)
# is likely a better fit than a dense array -- not implemented here.

TfidfModel(num_docs=104651, num_nnz=1769515)
[(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)]
[(12, 0.19625959819721403), (13, 0.18114835462564435), (14, 0.1442343339923965), (15, 0.23020045613760776), (16, 0.1215821887009755), (17, 0.2899603875144407), (18, 0.3392770863731662), (19, 0.17358249142787363), (20, 0.25704246343193127), (21, 0.16477335472980023), (22, 0.4250038937385465), (23, 0.2115885821189835), (24, 0.17199755598460853), (25, 0.37272612481649886), (26, 0.26182342144952586), (27, 0.23472115669795413)]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [15]:
##### This cell should fit the pca in batches #####
from gensim.matutils import corpus2dense
from sklearn.decomposition import IncrementalPCA
pca = IncrementalPCA(n_components=3) ###n_components will determine the number of dimensions to keep after applying dimensionality reduction.
i=0
batch_size=10 ###If you set the batch size to 1, it will throw an error, since the slice will no longer be a list of lists
while True:
    #####Temp is a numpy array corresponding to the dense rows generated from the gensim tfidf
    temp = corpus2dense(train_tfidf[train_corpus[min(batch_size*i, len(train_corpus)-1):min(batch_size*(i+1), len(train_corpus)-1)]], num_terms=train_tfidf.num_nnz).transpose()
    if temp.shape[0]==0:
        break
    #####We incrementally fit the pca in batches
    pca.partial_fit(temp)
    i+=1
    
    #####This line should be removed once we want to run this code on the entire dataset
    if i==2:
        break

In [16]:
#####This cell should generate the transformed data in batches
i=0
batch_size=10
while True:
    temp = corpus2dense(train_tfidf[train_corpus[min(batch_size*i, len(train_corpus)-1):min(batch_size*(i+1), len(train_corpus)-1)]], num_terms=train_tfidf.num_nnz).transpose()
    if temp.shape[0]==0:
        break
        
    #####out is a matrix containing the dimension reduced columns. Output to a pandas df not yet implemented. 
    out=pca.transform(temp)
    print(out)
    i+=1
    
    #####This line should be removed once we want to run this code on the entire dataset
    if i==2:
        break

[[-0.40063031 -0.13960853  0.36688407]
 [-0.00932635  0.00512648 -0.31076145]
 [-0.41745011 -0.14478315  0.32948692]
 [-0.00874053  0.0051453  -0.29972531]
 [ 0.2729355  -0.19642445  0.06004003]
 [-0.11649877 -0.03346866 -0.131255  ]
 [ 0.31896489 -0.13724791  0.12150999]
 [ 0.14831346  0.62669759  0.20950098]
 [-0.00998653  0.00394718 -0.30063629]
 [ 0.18249696  0.60496079  0.22683801]]
[[-0.13822499 -0.00624933 -0.15842667]
 [ 0.00738285  0.00620786 -0.13732865]
 [-0.09583709  0.08742219 -0.04303658]
 [-0.07827064  0.08027199 -0.04700477]
 [-0.27413209 -0.09049419  0.20145991]
 [ 0.02588579 -0.05479543 -0.16933842]
 [-0.02287835 -0.00068318 -0.41699286]
 [ 0.48750941 -0.33047302  0.24693167]
 [ 0.32185933 -0.25251187  0.16115721]
 [-0.19337242 -0.03303966  0.09069722]]


## TF-IDF for test data


In [8]:
# Load the held-out test split; later cells read its 'cleaned_words' column.
test_csv_path = "news_test.csv"
news_test = pd.read_csv(test_csv_path)

In [29]:
test_tokenized = [simple_preprocess(line) for line in news_test['cleaned_words']]

# Vocabulary of the test split on its own. Kept so the name stays available, but
# its token ids are assigned independently and do NOT line up with train_dict.
test_dict = corpora.Dictionary(test_tokenized, prune_at=None)

# Encode the test documents with the TRAINING dictionary so every token id means
# the same word that train_tfidf saw when it was fitted. allow_update is left off
# on purpose: the training vocabulary must not change, and tokens that never
# appeared in training are simply dropped.
test_corpus = [train_dict.doc2bow(line) for line in test_tokenized]

In [28]:
## TODO: apply TF-IDF to the test data. Do NOT fit a new IDF on the test set - the relevant IDF weights are the ones learned from the training data.