# Analisis de Similitud de SOPs

#### Importo las librerias 

In [45]:
import numpy as np
import pandas as pd
import os
import smart_open
import gensim
import matplotlib.pyplot as plt
import plotly.express as px
from gensim.models import Doc2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


#### Paso los documentos de pdf y docx a txt

In [46]:
# %run 1_transformar_formato.py

## Doc2Vec por documento

#### Generacion del Corpus

En read_corpus tokenizamos y etiquetamos los documentos

In [47]:
def read_corpus(fname, tokens_only=False):
    for document in os.listdir(fname):
        file_path = os.path.join(fname, document)
        with smart_open.open(file_path, encoding="iso-8859-1") as f:
            content = f.read()
            tokens = gensim.utils.simple_preprocess(content)
            if tokens_only:
                yield tokens    
            else:
                yield gensim.models.doc2vec.TaggedDocument(tokens, [document])



corpus = list(read_corpus("../SOPs txt"))

print(corpus [:2])
print("Documentos leidos: ", len(corpus))

[TaggedDocument(words=['objetivo', 'auditorã', 'as', 'de', 'buenas', 'prã', 'cticas', 'de', 'manufactura', 'gmp', 'buenas', 'prã', 'cticas', 'de', 'distribuciã³n', 'gdp', 'son', 'realizadas', 'frecuentemente', 'para', 'verificar', 'que', 'la', 'fabricaciã³n', 'asã', 'como', 'tambiã', 'el', 'almacenaje', 'distribuciã³n', 'de', 'materiales', 'productos', 'siguen', 'las', 'leyes', 'estã', 'ndares', 'las', 'aplicaciones', 'de', 'autorizaciã³n', 'del', 'mercado', 'las', 'regulaciones', 'de', 'bayer', 'este', 'documento', 'provee', 'los', 'requerimientos', 'mã', 'nimos', 'para', 'la', 'planificaciã³n', 'ejecuciã³n', 'documentaciã³n', 'de', 'las', 'auditorã', 'as', 'gmp', 'gdp', 'iso', 'la', 'auditorã', 'de', 'calidad', 'es', 'uno', 'de', 'los', 'elementos', 'clave', 'del', 'sistema', 'de', 'gestiã³n', 'de', 'cumplimiento', 'de', 'bayer', 'mediante', 'dichas', 'auditorã', 'as', 'se', 'determina', 'el', 'grado', 'de', 'cumplimiento', 'de', 'un', 'proveedor', 'fabricante', 'con', 'los', 'requer

Divido el corpus en un set de entranamiento y uno de testeo

In [48]:
train_corpus, test_corpus = train_test_split(corpus, test_size=0.2, random_state=42)
# print(len(train_corpus))

Guardo el corpus de entranamiento en un txt

In [49]:
with open("corpus.txt", 'w') as file:
    file.write(str(corpus))

#### Entreno el modelo

In [50]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [51]:
print(f"La palabra 'objetivo' aparece {model.wv.get_vecattr('objetivo', 'count')} veces en el corpus de entrenamiento.")

La palabra 'objetivo' aparece 1109 veces en el corpus de entrenamiento.


Genero los archivos de vectores y tensores para poder verlo de forma grafica en [TensorFlow](https://projector.tensorflow.org/)

In [53]:
model.save_word2vec_format('doc_tensor.w2v', doctag_vec=True, word_vec=False)
model.save('doc_tensor_model')
#%run 00_process_d2v_files.py -i doc_tensor.w2v


with open("metadata.tsv", 'r') as file:
    lineas = file.readlines()

lineas = [linea.replace("*dt_", "", 1) for linea in lineas]

with open("metadata.tsv", 'w') as file:
    file.writelines(lineas)

#### Testeo el modelo

In [54]:
y_true = []
y_pred = []

# print(len(test_corpus))

model = Doc2Vec.load('doc_tensor_model')
i=0
simil_list = []
for doc in corpus:
    vec = model.infer_vector(doc.words)
    simil = model.dv.most_similar([vec])
    # simil_elem = [[doc[1][0]]]
    # simil_elem[i].append(simil)
    # simil_list.append(simil_elem)
    print("\033[92m" + f"  ########################################{doc[1][0]}#########################################" + "\033[0m")
    for similar_doc in simil:
        print(f"  Document Corpus: {similar_doc[0]} | Input Document: {doc[1][0]} | Similarity: {similar_doc[1]:.4f}")

    predicted_label = simil[0][0]
    y_true.append(doc.tags[0])
    y_pred.append(predicted_label)

[92m  ########################################A0002.txt#########################################[0m
  Document Corpus: A0002.txt | Input Document: A0002.txt | Similarity: 0.9971
  Document Corpus: G0001.txt | Input Document: A0002.txt | Similarity: 0.9156
  Document Corpus: G0223.txt | Input Document: A0002.txt | Similarity: 0.8885
  Document Corpus: G0002.txt | Input Document: A0002.txt | Similarity: 0.8059
  Document Corpus: S0011.txt | Input Document: A0002.txt | Similarity: 0.8038
  Document Corpus: G0133.txt | Input Document: A0002.txt | Similarity: 0.7799
  Document Corpus: A0023.txt | Input Document: A0002.txt | Similarity: 0.7509
  Document Corpus: G0206.txt | Input Document: A0002.txt | Similarity: 0.7442
  Document Corpus: A0025.txt | Input Document: A0002.txt | Similarity: 0.7151
  Document Corpus: A0013.txt | Input Document: A0002.txt | Similarity: 0.7014
[92m  ########################################A0005.txt#########################################[0m
  Document Corpu

#### Graficamos

In [56]:
vectors = pd.read_csv('vectors.tsv', sep='\t', header=None)

metadata = pd.read_csv('metadata.tsv', sep='\t', header=None)

Debemos reducir la dimensionalidad

##### PCA

In [58]:
pca = PCA(n_components=2)

vectors_red = pca.fit_transform(vectors)

In [59]:
x = vectors_red[:, 0]
y = vectors_red[:, 1]

fig = px.scatter(vectors_red, x, y, hover_name=metadata[0], title='Reduccion por PCA')

fig.show()



##### T-SNE

In [60]:
tsne = TSNE(n_components=2, random_state=42)

vectors_red2 = tsne.fit_transform(vectors)

df = pd.DataFrame(vectors_red2)

df['Name'] = metadata[0]


Could not find the number of physical cores for the following reason:
found 0 physical cores < 1

  File "c:\Program Files\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [61]:
fig = px.scatter(df, x, y, hover_name='Name', title='Reduccion por T-SNE')

fig.show()