# TF-IDF Model training

In [1]:
import gc
import pickle as pkl
import re
from collections import Counter
from pathlib import Path

import cupy as cp
import numpy as np
import pandas as pd
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.metrics import pairwise_distances
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from gensim.models import TfidfModel

from dist_matrix.cuda_dist_matrix_full import dist_matrix as gpu_dist_matrix

In [2]:
# Interim data lives in <parent of the working directory>/data/interim;
# the TF-IDF artefacts are kept in its 'tfidf' subdirectory.
data_dir = Path.cwd().parent / 'data/interim'
tfi_dir = data_dir / 'tfidf'

In [3]:
def clean_string(msg):
    """Lower-case *msg* and keep only its digit-free word tokens.

    The message is lower-cased, stripped and split on whitespace. A token
    survives when every character is a word character that is not a digit
    (letters and underscores); the survivors are joined with single spaces.
    """
    words = msg.lower().strip().split()
    kept = (word for word in words if re.match(r'[^\W\d]*$', word))
    return ' '.join(kept)

In [5]:
# Print every entry of the interim data directory whose name starts with
# 'parsed_'.
for path in data_dir.glob('parsed_*'):
    print(path)
#     dataset = pd.read_pickle(path)
#     messages = dataset['Message'].dropna().apply(lambda s: clean_string(s))
#     tfidf_vectorizer = TfidfVectorizer(stop_words='english')
#     X = tfidf_vectorizer.fit_transform(messages)

#     if not Path(tfi_dir, f'tfidf_normal_{path.stem[14:]}.npy').is_file():
#         cp.save(Path(tfi_dir, f'tfidf_normal_{path.stem[14:]}.npy'), X)
        
#     metric = 'wmd'
#     weights = np.ones(X.shape, dtype=np.float64)

#     X = gpu_dist_matrix(X, V=X, U_weights=weights, V_weights=weights, metric='wasserstein')

#     if not Path(data_dir, metric, f'tfidf_{metric}_{path.stem[14:]}.npy').is_file():
#         np.save(Path(data_dir, metric, f'tfidf_{metric}_{path.stem[14:]}.npy'), X)



/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_eq_2.pkl
/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_ge_10.pkl
/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_ge_4_lt_10.pkl
/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_eq_3.pkl


In [None]:
# For every parsed dataset: fit a TF-IDF model on the cleaned messages, store
# the TF-IDF matrix, then store the pairwise distance matrix computed from it.
# Existing output files are never overwritten, and a dataset whose two outputs
# are already on disk is skipped before any expensive work is done.
#
# NOTE(review): confirm that cuml's pairwise_distances accepts metric='wmd'.
# NOTE(review): if the vectoriser / pairwise_distances return sparse or
# device arrays, cp.save / np.save may need an explicit conversion — confirm.
metric = 'wmd'
for path in data_dir.glob('parsed_*'):
    # 'parsed_emails_' is 14 characters long, so this is the dataset suffix
    # (e.g. 'chains_eq_2').
    suffix = path.stem[14:]
    tfidf_file = Path(tfi_dir, f'tfidf_normal_{suffix}.npy')
    dist_file = Path(data_dir, metric, f'tfidf_{metric}_{suffix}.npy')

    if tfidf_file.is_file() and dist_file.is_file():
        continue

    dataset = pd.read_pickle(path)
    messages = dataset['Message'].dropna().apply(clean_string)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    X = tfidf_vectorizer.fit_transform(messages)

    if not tfidf_file.is_file():
        cp.save(tfidf_file, X)

    if not dist_file.is_file():
        X = pairwise_distances(X, metric=metric)
        np.save(dist_file, X)



In [None]:
for path in data_dir.glob('parsed_*'):
    print(f'tfidf_normal_{path.stem[14:]}.pkl')

In [None]:
# Load every parsed dataset into memory, in the order glob yields them.
datasets = list(map(pd.read_pickle, data_dir.glob('parsed_*')))

In [None]:
messages = datasets[0]['Message'].dropna()

In [None]:
# Normalise every message with clean_string, element-wise.
cln_msg = messages.apply(clean_string)

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X = tfidf_vectorizer.fit_transform(cln_msg)

In [None]:
# Fit a 3-cluster k-means model on X and keep the fitted labels as a list.
# NOTE(review): KMeans is not imported in the visible part of this file —
# confirm which implementation (and import) is intended before running.
km = KMeans(n_clusters=3)
km.fit(X)
labels = km.labels_.tolist()

In [None]:
len(labels)

In [None]:
# Set up the spaCy pipeline that get_terms calls as `nlp`.
import spacy
# Ask spaCy to run on the GPU when one is available.
spacy.prefer_gpu()
# Load "en_core_web_trf" with the "tok2vec" and "ner" components disabled.
nlp = spacy.load("en_core_web_trf", disable=["tok2vec", "ner"])

In [None]:
def save_tfidf(data, file_name, file_content='corpus'):
    """Pickle *data* into the TF-IDF directory.

    Args:
        data: Object to serialise.
        file_name: Trailing part of the output file name.
        file_content: Tag inserted in the output file name to tell the
            stored artefacts apart (defaults to ``'corpus'``).

    The object is written to ``tfi_dir / f"tfidf_{file_content}_{file_name}"``
    with the highest pickle protocol, and a confirmation line is printed.
    """
    file = Path(tfi_dir, f"tfidf_{file_content}_{file_name}")
    with open(file, 'wb') as handle:
        # Serialise the argument itself, not whatever global happens to be
        # called `terms` in the notebook namespace.
        pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL)
    print(f'File {file} saved.')


def get_terms(string):
    """Return the content terms of *string* as ``form_POS_lemma`` strings.

    The text is run through the module-level ``nlp`` pipeline. A token is
    kept only when it is alphabetic, its part-of-speech tag is not ``'X'``
    and none of the stop-word, punctuation, whitespace, URL, e-mail,
    currency, number-like or digit flags is set.
    """
    def _is_term(token):
        # Positive requirements first, then the list of disqualifying flags.
        if not token.is_alpha or token.pos_ == 'X':
            return False
        disqualifying = (
            token.is_stop,
            token.is_punct,
            token.is_space,
            token.like_url,
            token.like_email,
            token.is_currency,
            token.like_num,
            token.is_digit,
        )
        return not any(disqualifying)

    return [
        f'{token}_{token.pos_}_{token.lemma_}'
        for token in nlp(string)
        if _is_term(token)
    ]


def preprocess_terms(data, file_name):
    """Return the term list of every message in *data*, cached on disk.

    The cache is the pickle ``tfi_dir / f"terms_{file_name}"``. When it
    exists it is loaded and returned as is; otherwise ``get_terms`` is run
    on every message, the result is pickled to that path and then returned.
    """
    cache_path = Path(tfi_dir, f"terms_{file_name}")
    print(f'Obtaining terms in mode form_pos_lemma for file {cache_path}')

    if not cache_path.is_file():
        extracted = list(map(get_terms, data))
        with open(cache_path, 'wb') as sink:
            pkl.dump(extracted, sink, protocol=pkl.HIGHEST_PROTOCOL)
        return extracted

    print('File exists, loading terms')
    with open(cache_path, 'rb') as source:
        return pkl.load(source)
    

def get_tfidf(data, file_name):
    """Compute the TF-IDF representation of *data* and pickle it.

    Args:
        data: Re-iterable collection of documents, each a list of term
            strings.
        file_name: Trailing part of the output file names, passed on to
            ``save_tfidf``.

    Two artefacts are written through ``save_tfidf``: the TF-IDF corpus as a
    list of sparse ``(term id, weight)`` vectors (tag ``'corpus'``) and the
    dense matrix as a CuPy array (tag ``'matrix'``). Nothing is returned.
    """
    print('Converting terms to TF-IDF')
    dictionary = Dictionary(data)
    corpus = [dictionary.doc2bow(doc) for doc in data]
    num_docs = dictionary.num_docs
    num_terms = len(dictionary)
    tfidf = TfidfModel(corpus)

    # Materialise the lazily transformed corpus once, so that the same
    # vectors are pickled and densified.
    corpus_tfidf = list(tfidf[corpus])
    save_tfidf(corpus_tfidf, file_name)

    # corpus2dense returns a (terms x documents) array; transpose it to get
    # one row per document before moving it to the GPU.
    corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms, num_docs)
    cupy_corpus_tfidf = cp.array(corpus_tfidf_dense.T)
    save_tfidf(cupy_corpus_tfidf, file_name, 'matrix')
    del cupy_corpus_tfidf

    print('Done')

In [None]:
# Run preprocess_terms and get_tfidf on every 'parsed_emails*.pkl' dataset
# except the first one in sorted order.
for path in sorted(data_dir.glob('parsed_emails*.pkl'))[1:]:
    df = pd.read_pickle(path)
    # Keep what follows the second underscore of the file name (extension
    # included); it is used to name the output files.
    file_name = '_'.join(path.name.split('_')[2:])
    messages = df['Message'].dropna().str.lower().to_numpy()
    terms = preprocess_terms(messages, file_name)
    get_tfidf(terms, file_name)
    # Trigger a garbage collection before moving on to the next dataset.
    gc.collect()

In [None]:
for path in sorted(data_dir.glob('parsed_emails*.pkl'))[1:]:
    print(path)

In [None]:
for path in sorted(tfi_dir.glob('*')):
    print(path)

In [None]:
# Load the 'chains_gt_1' dataset and lower-case its non-null messages. The
# file is resolved through data_dir instead of a machine-specific absolute
# path, so the cell also runs from other checkouts of the project.
df = pd.read_pickle(Path(data_dir, 'parsed_emails_chains_gt_1.pkl'))
messages = df['Message'].dropna().str.lower().to_numpy()

In [None]:
messages

In [None]:
# Load the cached term lists of the 'chains_gt_1' dataset. The file is
# resolved through tfi_dir (where preprocess_terms writes its 'terms_*'
# caches) instead of a machine-specific absolute path.
with open(Path(tfi_dir, 'terms_chains_gt_1.pkl'), 'rb') as handle:
    terms = pkl.load(handle)

In [None]:
len(terms)

In [None]:
count_vect = [list(Counter(term).values()) for term in terms]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
dictionary = Dictionary(terms)
corpus = [dictionary.doc2bow(term) for term in terms]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
corpus_tfidf.save(str(Path(tfi_dir,'corpus_tfidf_terms_chains_gt_1.pkl')))

In [None]:
# pickle.load expects an open binary file, not a path string. The file is
# written with the corpus object's own gensim `save` method, so read it back
# with the matching `load` class method.
from gensim.interfaces import TransformedCorpus

corpus_tfidf = TransformedCorpus.load(
    str(Path(tfi_dir, 'corpus_tfidf_terms_chains_gt_1.pkl'))
)

In [None]:
corpus[0]

In [None]:
[[(i, c) for i, c in doc if c > 1] for doc in corpus]

In [None]:
idx, freq = zip(*corpus[0])

In [None]:
freq

In [None]:
# Split every document's (term id, frequency) pairs into a tuple of ids and
# a tuple of frequencies; a document without terms yields an empty tuple.
[tuple(zip(*doc)) for doc in corpus]

In [None]:
# NOTE(review): term_dict is not defined in the visible part of this
# notebook; the token -> id mapping of the gensim dictionary is assumed to be
# what was meant — confirm.
term_dict = dictionary.token2id
num_docs = len(terms)
num_terms = len(term_dict)

In [None]:
terms_id = [[term_dict[term] for term in doc] for doc in terms]

In [None]:
terms_ =[' '.join(term) for term in terms]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer



In [None]:
vectorizer = TfidfVectorizer(min_df=2)
X = vectorizer.fit_transform(terms_)





In [None]:
vectorizer.get_feature_names_out()

In [None]:
print(X.shape)

In [None]:
X.shape[0] * X.shape[1]

In [None]:
corpus = [list(Counter(term).items()) for term in terms_id]

In [None]:
tfidf = TfidfModel(corpus)

In [None]:
corpus_tfidf = tfidf[corpus]

In [None]:
corpus_tfidf

In [None]:
corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms, num_docs)

In [None]:
# Rebuild the dense TF-IDF matrix from the term lists in a single cell.
dictionary = Dictionary(terms)
# One bag-of-words vector per document.
corpus = [dictionary.doc2bow(term) for term in terms]
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms, num_docs)
# The dense array is transposed before being copied into a CuPy array —
# presumably to get one row per document; confirm against corpus2dense.
cupy_corpus_tfidf = cp.array(corpus_tfidf_dense.T)

In [None]:
len(terms)