## TF-IDF for training data

In [11]:
import numpy as np
import pandas as pd
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim import corpora

In [12]:
# Load the training split into a DataFrame; the tokenisation cell below reads
# its 'cleaned_words' column.
news_train = pd.read_csv("news_train.csv")

In [13]:
# Tokenise every article in the 'cleaned_words' column with gensim's default preprocessing.
train_tokenized = list(map(simple_preprocess, news_train['cleaned_words']))

# Start from an empty vocabulary and let it grow while the documents are encoded:
# allow_update=True registers unseen tokens in train_dict as each document is converted.
train_dict = corpora.Dictionary()
train_corpus = [
    train_dict.doc2bow(tokens, allow_update=True)
    for tokens in train_tokenized
]

In [14]:
# Fit the TF-IDF weighting on the training bag-of-words corpus.
# smartirs='ntc' is gensim's SMART notation; presumably raw term frequency,
# idf weighting and cosine normalisation -- confirm against the TfidfModel docs.
train_tfidf = TfidfModel(corpus=train_corpus, smartirs='ntc')

In [15]:
 # apply model to the first corpus document
vector = train_tfidf[train_corpus[0]] 

# Only document 0 is transformed here; index train_tfidf with any other
# document of train_corpus (or a slice of it) to weight the rest the same way.

In [None]:
# Display the TF-IDF weights computed for the first training document.
vector

In [16]:
from gensim.matutils import corpus2dense

# Sanity check: model summary, the raw bag-of-words of document 1, and the same
# document after TF-IDF weighting (one item per output line).
print(train_tfidf, train_corpus[1], train_tfidf[train_corpus[1]], sep="\n")

# Densify documents 1..4. corpus2dense is transposed here so that each row is
# one document and each column one vocabulary term.
dense_vec = corpus2dense(
    train_tfidf[train_corpus[1:5]],
    num_terms=len(train_dict),
).T
dense_vec
# Possible sparse alternative (not wired up yet): gensim.matutils.corpus2csc

TfidfModel(num_docs=104651, num_nnz=1769515)
[(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)]
[(12, 0.19625959819721403), (13, 0.18114835462564435), (14, 0.1442343339923965), (15, 0.23020045613760776), (16, 0.1215821887009755), (17, 0.2899603875144407), (18, 0.3392770863731662), (19, 0.17358249142787363), (20, 0.25704246343193127), (21, 0.16477335472980023), (22, 0.4250038937385465), (23, 0.2115885821189835), (24, 0.17199755598460853), (25, 0.37272612481649886), (26, 0.26182342144952586), (27, 0.23472115669795413)]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
%%time
##### This cell should fit the pca in batches #####
from gensim.matutils import corpus2dense
from sklearn.decomposition import IncrementalPCA
pca = IncrementalPCA(n_components=950) ###n_components will determine the number of dimensions to keep after applying dimensionality reduction.
i=0
batch_size=1000 ###If you set the batch size to 1, it will throw an error, since the slice will no longer be a list of lists
while True:
    print('Percent completion: ' + "{:10.2f}".format(100*i*batch_size/len(train_corpus)) + '%')
    #####Temp is a numpy array corresponding to the dense rows generated from the gensim tfidf
    temp = corpus2dense(train_tfidf[train_corpus[min(batch_size*i, len(train_corpus)-1):min(batch_size*(i+1), len(train_corpus)-1)]], num_terms=len(train_dict)).transpose()
    print(temp.shape)
    if temp.shape[0]==0:
        break
    #####We incrementally fit the pca in batches
    pca.partial_fit(temp)
    i+=1
    
    #####This line should be removed once we want to run this code on the entire dataset
    if i==3:
        break

Percent completion:       0.00%
(3000, 54415)


KeyboardInterrupt: 

In [20]:
# Inspect the explained variance ratio of each retained component.
# Requires `pca` to have been fitted by the partial_fit cell first.
pca.explained_variance_ratio_

array([0.00314384, 0.00254032, 0.00210664, 0.00182604, 0.00174032,
       0.00172537, 0.00167915, 0.00162576, 0.00160155, 0.00156684,
       0.00149198, 0.00148234, 0.00145228, 0.00141896, 0.00140582,
       0.00138152, 0.00135055, 0.00133839, 0.00132323, 0.0013175 ,
       0.00129023, 0.00128372, 0.0012599 , 0.00124494, 0.00123185,
       0.00122589, 0.00120789, 0.00120093, 0.00119938, 0.00117671,
       0.00117489, 0.00115578, 0.00115161, 0.0011495 , 0.00113882,
       0.00113278, 0.00112515, 0.00111606, 0.00111509, 0.00110743,
       0.00109876, 0.00109087, 0.00108623, 0.00108286, 0.00107521,
       0.00106363, 0.00105319, 0.00104927, 0.0010428 , 0.00104237,
       0.00103111, 0.00103059, 0.00102241, 0.0010196 , 0.00101462,
       0.00100866, 0.00100699, 0.00100584, 0.00099754, 0.00099129,
       0.00098093, 0.0009779 , 0.00097698, 0.00097197, 0.00096558,
       0.00096042, 0.00095521, 0.00095132, 0.00094671, 0.0009443 ,
       0.0009365 , 0.00093446, 0.00092947, 0.00092575, 0.00092

In [None]:
#####This cell should generate the transformed data in batches
i=0
batch_size=10
while True:
    print('Percent completion: ' + "{:10.2f}".format(100*i*batch_size/len(train_corpus)) + '%')
    temp = corpus2dense(train_tfidf[train_corpus[min(batch_size*i, len(train_corpus)-1):min(batch_size*(i+1), len(train_corpus)-1)]], num_terms=len(train_dict)).transpose()
    if temp.shape[0]==0:
        break
        
    #####out is a matrix containing the dimension reduced columns. Output to a pandas df not yet implemented. 
    out=pca.transform(temp)
    print(out)
    i+=1
    
    #####This line should be removed once we want to run this code on the entire dataset
    if i==2:
        break

## TF-IDF for test data


In [None]:
# Load the held-out test split; the cell below reads its 'cleaned_words' column.
news_test = pd.read_csv("news_test.csv")

In [None]:
# Tokenise the test articles exactly as the training articles were tokenised.
test_tokenized = [simple_preprocess(line) for line in news_test['cleaned_words']]

# Vocabulary of the test set on its own. Kept for inspection only: its token ids
# are assigned independently and do NOT line up with train_dict.
test_dict = corpora.Dictionary(test_tokenized)

# Encode the test documents with the *training* dictionary so token ids match the
# ones train_tfidf (and the PCA feature columns) were fitted on. allow_update is
# left off, so the training vocabulary is not modified and tokens unseen during
# training are simply dropped.
test_corpus = [train_dict.doc2bow(line) for line in test_tokenized]

In [None]:
## TODO: compute TF-IDF for the test data. Do NOT fit a new IDF on the test set - the relevant IDF is the one learned from the training data.