# NLP on ARES

### Packages

In [2]:
import spacy

In [3]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


## Bigram Tokenizer

In [11]:
# Define the function to tokenize a string into bigrams
def tokenize_bigrams(text):
    # Process the text using the spaCy pipeline
    doc = nlp(text)
    # Extract the lemmatized tokens from the processed text
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    # Combine adjacent pairs of tokens into bigrams
    bigrams = [f"{tokens[i]} {tokens[i+1]}" for i in range(len(tokens)-1)]
    return bigrams

print(tokenize_bigrams("Documentación proceso multipago bancario ajuste estructura archivo consignantes"))

['Documentación proceso', 'proceso multipago', 'multipago bancario', 'bancario ajuste', 'ajuste estructura', 'estructura archivo', 'archivo consignant']


## ARES Activities Dataset and Filtering

In [4]:
import pandas as pd

df = pd.read_csv("ARES2_EJECUCION_ACTIVIDADES.csv")
df.head(10)

Unnamed: 0,DESCRIPCION,CODIGO_ETAPA,DURACION_HORAS
0,Vacaciones,VAC,8.0
1,Vacaciones,VAC,8.0
2,Documentación proceso multipago bancario ajust...,COCOD,3.0
3,Documentación proceso acumulación pisos y paso...,COCOD,3.0
4,Documentación proceso acumulación pisos y cont...,COCOD,2.0
5,Vacaciones,VAC,8.0
6,Vacaciones,VAC,8.0
7,Vacaciones,VAC,8.0
8,Vacaciones,VAC,8.0
9,Vacaciones,VAC,8.0


In [5]:
df.info

<bound method DataFrame.info of                                              DESCRIPCION CODIGO_ETAPA  \
0                                             Vacaciones          VAC   
1                                             Vacaciones          VAC   
2      Documentación proceso multipago bancario ajust...        COCOD   
3      Documentación proceso acumulación pisos y paso...        COCOD   
4      Documentación proceso acumulación pisos y cont...        COCOD   
...                                                  ...          ...   
52851  Reunión daily, Documentando el código desarrol...        COCOD   
52852  Generando los test unitarios a los servicios y...        COCOD   
52853  Homologando las colecciones de postman, optimi...        COCOD   
52854  Validación de los datos de parametrización e i...        TRCON   
52855  Apoyo al equipo de trabajo con las tareas del ...        DIAPL   

       DURACION_HORAS  
0                 8.0  
1                 8.0  
2                 3

In [6]:
df['CODIGO_ETAPA'].nunique()

136

In [7]:
#filter the table in order to have only data from the some codes
codes = ['APSEG', 'PRSIS', 'ASEJE', 'COAJU']
df1=df[df["CODIGO_ETAPA"].isin(codes)]
df1.columns

Index(['DESCRIPCION', 'CODIGO_ETAPA', 'DURACION_HORAS'], dtype='object')

#### Load of Trained Language Model in Spanish

In [10]:
# Load the Spanish language model
!python -m spacy download es_core_news_sm
nlp = spacy.load("es_core_news_sm")


Collecting es-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0-py3-none-any.whl (12.9 MB)
     --------------------------------------- 12.9/12.9 MB 11.9 MB/s eta 0:00:00
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.3.0
✔ Download and installation successful
You can now load the package via spacy.load('es_core_news_sm')


## Jaccard Similarity per CÓDIGO_ETAPA

In [19]:
import spacy
import pandas as pd


# Define the function to tokenize a string into bigrams
def tokenize_bigrams(text):
    # Process the text using the spaCy pipeline
    doc = nlp(text)
    # Extract the lemmatized tokens from the processed text
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    # Combine adjacent pairs of tokens into bigrams
    bigrams = [f"{tokens[i]} {tokens[i+1]}" for i in range(len(tokens)-1)]
    return set(bigrams)


# Group the descriptions by unique code in 'CODIGO_ETAPA' and tokenize them into sets of bigrams
code_sets = {}
for code in df1['CODIGO_ETAPA'].unique():
    code_set = set()
    for description in df1.loc[df1['CODIGO_ETAPA'] == code, 'DESCRIPCION']:
        code_set.update(tokenize_bigrams(description))
    code_sets[code] = code_set

# Get the user input
prompt = input("Please enter a description: ")
prompt_set = tokenize_bigrams(prompt)

# Compute the Jaccard similarity between the sets and the prompt
similarities = {}
for code, code_set in code_sets.items():
    jaccard_sim = len(prompt_set.intersection(code_set)) / len(prompt_set.union(code_set))
    similarities[code] = jaccard_sim

# Print the similarity scores in descending order
for code, sim in sorted(similarities.items(), key=lambda x: x[1], reverse=True):
    print(f"CODIGO_ETAPA: {code} - Jaccard similarity: {sim:.2f}")
    


CODIGO_ETAPA: COAJU - Jaccard similarity: 0.00
CODIGO_ETAPA: PRSIS - Jaccard similarity: 0.00
CODIGO_ETAPA: APSEG - Jaccard similarity: 0.00
CODIGO_ETAPA: ASEJE - Jaccard similarity: 0.00


## Search Engine

Dataset is filtered by codes APSEG, PRSIS, ASEJE, COAJU


In [16]:
import spacy
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the function to tokenize a string into bigrams
def tokenize_bigrams(text):
    # Process the text using the spaCy pipeline
    doc = nlp(text)
    # Extract the lemmatized tokens from the processed text
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    # Combine adjacent pairs of tokens into bigrams
    bigrams = [f"{tokens[i]} {tokens[i+1]}" for i in range(len(tokens)-1)]
    return bigrams

# Tokenize the 'DESCRIPCION' column into bigrams and create a set of unique bigrams for each row
df1["bigrams"] = df1["DESCRIPCION"].apply(lambda x: set(tokenize_bigrams(x)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["bigrams"] = df1["DESCRIPCION"].apply(lambda x: set(tokenize_bigrams(x)))


### Lineas de prueba

* QA Ciclo 1
* - Pruebas de regresión y cierre de bugs
* HU módulo 
* Desarrollo actividades para ajustes segun archivo del

In [21]:
# Prompt the user for a description
user_description = input("Please enter a description: ")

# Tokenize the user description into bigrams
user_bigrams = set(tokenize_bigrams(user_description))

vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), lowercase=False, use_idf=True)
corpus = [" ".join(row) for row in df1["bigrams"].values]
X = vectorizer.fit_transform(corpus)
X = X.log1p()



# Compute the cosine similarity between the user bigrams and the bigrams in the dataset
user = vectorizer.transform([" ".join(user_bigrams)])
similarity_scores = cosine_similarity(user, X)

# Get the index of the row with the highest similarity score
most_similar_index = similarity_scores.argmax()

# Get the corresponding DESCRIPCION of the highest similarity score
mostSimilarDescripcion = df1.iloc[most_similar_index]["DESCRIPCION"]

# Get the corresponding value of the 'CODIGO_ETAPA' column
recommended_etapa = df1.iloc[most_similar_index]["CODIGO_ETAPA"]

# Print the most similar DESCRIPCION and the recommended CODIGO_ETAPA
print(f'User\'s input: {user_description}')
print(f"The most similar description for this input is: {mostSimilarDescripcion}")
print(f"The recommended CODIGO_ETAPA for this description is: {recommended_etapa}")

User's input: Desarrollo actividades para ajustes segun archivo del
The most similar description for this input is: Desarrollo actividades para ajustes segun archivo del cliente
The recommended CODIGO_ETAPA for this description is: COAJU
