# TF-IDF Model training

In [4]:
import gc
import pickle as pkl

import cupy as cp
import numpy as np
import pandas as pd


from collections import Counter
from pathlib import Path

from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense



In [None]:
import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf", disable=["tok2vec", "ner"])

In [2]:
data_dir = Path(Path.cwd().parent, 'data/interim')
tfi_dir = Path(data_dir, 'tfidf')

In [None]:
def save_tfidf(data, file_name, file_content='corpus'):
    file = Path(tfi_dir, f"tfidf_{file_content}_{filename}")
    with open(file, 'wb') as handle:
        pkl.dump(terms, handle, protocol=pkl.HIGHEST_PROTOCOL)
    print(f'File {file} saved.')


def get_terms(string):
    terms = []
    for token in nlp(string):
        if not token.is_stop and not token.is_punct and not token.is_space\
        and not token.like_url and not token.like_email and not token.is_currency\
        and not token.like_num and token.pos_ != 'X' and not token.is_digit\
        and token.is_alpha:
            terms.append(f'{token}_{token.pos_}_{token.lemma_}')
    
    return terms


def preprocess_terms(data, file_name):
    file = Path(tfi_dir, f"terms_{file_name}")
    print(f'Obtaining terms in mode form_pos_lemma for file {file}')
    if file.is_file():
        print(f'File exists, loading terms')
        with open(file, 'rb') as handle:
            terms = pkl.load(handle)
    else:
        terms = [get_terms(message) for message in data]
        with open(file, 'wb') as handle:
            pkl.dump(terms, handle, protocol=pkl.HIGHEST_PROTOCOL)
    
    return terms
    

def get_tfidf(data, file_name):
    print('Converting terms to TF-IDF')
    dictionary = Dictionary(data)
    corpus = [dictionary.doc2bow(term) for term in data]
    num_docs = dictionary.num_docs
    num_terms = len(dictionary.keys())
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms, num_docs)
    cupy_corpus_tfidf = cp.array(corpus_tfidf_dense.T)
    save_tfidf(data, file_name)
    save_tfidf(data, file_name, 'matrix')
    del cupy_corpus_tfidf
    
    print('Done')

In [None]:
for path in sorted(data_dir.glob('parsed_emails*.pkl'))[1:]:
    df = pd.read_pickle(path)
    file_name = '_'.join(path.name.split('_')[2:])
    messages = df['Message'].dropna().str.lower().to_numpy()
    terms = preprocess_terms(messages, file_name)
    get_tfidf(terms, file_name)
    gc.collect()

In [None]:
for path in sorted(data_dir.glob('parsed_emails*.pkl'))[1:]:
    print(path)

In [5]:
for path in sorted(tfi_dir.glob('*')):
    print(path)

/home/miguel/Projects/tfm-nlp/data/interim/tfidf/terms_chains_gt_1.pkl
/home/miguel/Projects/tfm-nlp/data/interim/tfidf/terms_lemma_gt_1.pkl


In [None]:
df = pd.read_pickle('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_gt_1.pkl')
messages = df['Message'].dropna().str.lower().to_numpy()

In [None]:
messages

In [5]:
with open('/home/miguel/Projects/tfm-nlp/data/interim/tfidf/terms_chains_gt_1.pkl', 'rb') as handle:
    terms = pkl.load(handle)

In [7]:
len(terms)

47610

In [None]:
count_vect = [list(Counter(term).values()) for term in terms]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [8]:
dictionary = Dictionary(terms)
corpus = [dictionary.doc2bow(term) for term in terms]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [9]:
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [14]:
corpus_tfidf.save(str(Path(tfi_dir,'corpus_tfidf_terms_chains_gt_1.pkl')))

In [3]:
corpus_tfidf = pkl.load(str(Path(tfi_dir,'corpus_tfidf_terms_chains_gt_1.pkl')))

TypeError: file must have 'read' and 'readline' attributes

In [None]:
corpus[0]

In [None]:
[[(i, c) for i, c in doc if c > 1] for doc in corpus]

In [None]:
idx, freq = zip(*corpus[0])

In [None]:
freq

In [None]:
[(idx, freq) for zip(*doc) in corpus]

In [None]:
num_docs = len(terms)
num_terms = len(term_dict.keys())

In [None]:
terms_id = [[term_dict[term] for term in doc] for doc in terms]

In [6]:
terms_ =[' '.join(term) for term in terms]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer



In [8]:
vectorizer = TfidfVectorizer(min_df=2)
X = vectorizer.fit_transform(terms_)





In [9]:
vectorizer.get_feature_names_out()

array(['00', '000', '0000108806', ..., 'zydeco_propn_zydeco',
       'zyr7vf_propn_zyr7vf', 'zz_propn_zz'], dtype=object)

In [10]:
print(X.shape)

(47610, 53954)


In [11]:
X.shape[0] * X.shape[1]

2568749940

In [None]:
corpus = [list(Counter(term).items()) for term in terms_id]

In [None]:
tfidf = TfidfModel(corpus)

In [None]:
corpus_tfidf = tfidf[corpus]

In [None]:
corpus_tfidf

In [None]:
corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms, num_docs)

In [None]:
dictionary = Dictionary(terms)
corpus = [dictionary.doc2bow(term) for term in terms]
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms, num_docs)
cupy_corpus_tfidf = cp.array(corpus_tfidf_dense.T)

In [None]:
len(terms)