In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from openai import OpenAI

  from tqdm.autonotebook import tqdm, trange


In [3]:
train_df = pd.read_csv('train_data.csv')
#test_df = pd.read_csv('test_data.csv')
test_df_met1 = pd.read_csv('test_data_met1.csv')

In [4]:
# Concatenar QuestionTitle y QuestionBody
train_df["QuestionText"] = train_df["QuestionTitle"] + " " + train_df["QuestionBody"]
#test_df["QuestionText"] = test_df["QuestionTitle"] + " " + test_df["QuestionBody"]
test_df_met1["QuestionText"] = test_df_met1["QuestionTitle"] + " " + test_df_met1["QuestionBody"]

In [9]:
# Inicializar el modelo con control de errores
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

def generate_embedding(text):
    try: 
        return model.encode(text).tolist()
    except Exception:
        print("Opps! Question: {}".format(text))
        return "embedding_error"



In [None]:
# Generar embeddings para cada pregunta
train_df['Embeddings'] = train_df['QuestionText'].apply(generate_embedding)

In [None]:
train_df.to_csv('train_data_embed.csv', index=False)

In [None]:
# Inicializar el modelo de parafraseo t5-large
paraphrase_model_name = 't5-large'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Mover el modelo a la GPU
paraphrase_model = T5ForConditionalGeneration.from_pretrained(paraphrase_model_name)
paraphrase_model.to(device)
paraphrase_tokenizer = T5Tokenizer.from_pretrained(paraphrase_model_name)

# Función para parafrasear texto
def paraphrase_text(text, model, tokenizer):
    encoding = tokenizer.encode_plus(text, return_tensors="pt", max_length=1024, truncation=True)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    # Mover los tensores a la GPU
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        num_beams=8,
        num_return_sequences=1,
        temperature=1.0,
    )

    paraphrased_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrased_text


# Filtrar para obtener textos con al menos 80 caracteres
filtered_df = test_df_met1[test_df_met1['QuestionText'].str.len() >= 200]

# Ordenar el DataFrame por la longitud de `QuestionText` de menor a mayor
sorted_df = filtered_df.sort_values(by='QuestionText', key=lambda x: x.str.len())

# Seleccionar los primeros `n` elementos
n = 3
subset_df = sorted_df.head(n)

# Aplicar la función de parafraseo solo a estos `n` elementos
subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text(x, paraphrase_model, paraphrase_tokenizer))



In [67]:
#Definir modelo OPENAI
client = OpenAI(api_key='')

def paraphrase_text_openai(text):
    response = client.chat.completions.create(
    model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Paraphrase following text in original language with significant changes in structure: {text}"}
            ],
    max_tokens=180,  # Ajusta el número máximo de tokens en la respuesta
    temperature=1.5,  # Controla la creatividad de la respuesta
    top_p=1
    )
    paraphrased_text = response.choices[0].message.content
    return paraphrased_text

# Seleccionar los primeros `n` elementos con al menos 100 caracteres
n = 1000
filtered_df = test_df_met1[test_df_met1['QuestionText'].str.len() >= 100]
sorted_df = filtered_df.sort_values(by='QuestionText', key=lambda x: x.str.len())
subset_df = sorted_df.head(n)

# Aplicar la función de parafraseo solo a estos `n` elementos
subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text_openai(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text_openai(x))


In [68]:
# Generar embeddings para cada pregunta
subset_df['Embeddings'] = subset_df['paraphrased'].apply(generate_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['Embeddings'] = subset_df['QuestionText'].apply(generate_embedding)


In [69]:
subset_df.to_csv('test_data_met1_embed.csv', index=False)

In [None]:
# Generar embeddings para cada pregunta
test_df['Embeddings'] = test_df['QuestionText'].apply(generate_embedding)

In [None]:

test_df.to_csv('test_data_embed.csv', index=False)


In [7]:
#Definir modelo OPENAI modificando parafraseo por plantear pregunta similar
client = OpenAI(api_key='')

def paraphrase_text_openai(text):
    response = client.chat.completions.create(
    model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},        
                {"role": "user", "content": f"Based on the following text, generate a new and different question that is still about the same topic: {text}"}            ],
    max_tokens=180,  # Ajusta el número máximo de tokens en la respuesta
    temperature=1.5,  # Controla la creatividad de la respuesta
    top_p=1
    )
    paraphrased_text = response.choices[0].message.content
    return paraphrased_text

# Seleccionar los primeros `n` elementos con al menos 100 caracteres
n = 1000
filtered_df = test_df_met1[test_df_met1['QuestionText'].str.len() >= 100]
sorted_df = filtered_df.sort_values(by='QuestionText', key=lambda x: x.str.len())
subset_df = sorted_df.head(n)

# Aplicar la función de parafraseo solo a estos `n` elementos
subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text_openai(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text_openai(x))


In [10]:
subset_df

Unnamed: 0,QuestionId,AcceptedAnswerId,QuestionBody,QuestionTitle,QuestionTags,QuestionPostTypeId,QuestionOwnerUserId,QuestionScore,AnswerId,AnswerBody,...,AnswerPostTypeId,AnswerOwnerUserId,AnswerScore,UserId,Reputation,UpVotes,DownVotes,Views,QuestionText,paraphrased
18316,47363,47364,What is the command to list all triggers in a ...,Show all triggers in a MySQL database,"['sql', 'mysql', 'database', 'triggers', 'sqlc...",1,4704.0,128,47364,The command for listing all triggers is:\nshow...,...,2,4704,200,4704,5636,130,5,122,Show all triggers in a MySQL database What is ...,How can I view the complete list of stored pro...
4448,142504,142520,What are some methods of utilising Eclipse for...,Eclipse: Dependency Management,"['java', 'eclipse', 'maven-2', 'eclipse-plugin...",1,4857.0,7,142520,I really like the The Maven Integration for Ec...,...,2,3636,14,3636,12615,161,0,253,Eclipse: Dependency Management What are some m...,What are the key features to consider when con...
18008,70878287,70878664,Add an bool column based array of string column\n,Adding a column in dataframe based on another ...,"['dataframe', 'scala', 'apache-spark', 'apache...",1,,0,70878664,Use exists function for Spark 2.4+:\nval df = ...,...,2,1386551,0,1386551,31870,1917,1381,9581,Adding a column in dataframe based on another ...,How can you utilize a numerical column to dyna...
7546,13421376,13421514,"How do I replace age with 31?\n[{""name""=>""Bob""...",Ruby / Replace value in array of hash,['ruby'],1,450837.0,5,13421514,"Another way, using find\n1.9.3p194 :007 > arra...",...,2,43365,10,43365,6780,809,2,382,Ruby / Replace value in array of hash How do I...,"What is the Ruby code to update the ""age"" in t..."
2422,919056,919067,What's the easiest way to do a case-insensitiv...,Case insensitive replace,"['python', 'string', 'case-insensitive']",1,79.0,245,919067,The string type doesn't support this. You're p...,...,2,1199,284,1199,236729,2138,26,7265,Case insensitive replace What's the easiest wa...,Sure! Here is a new reference question on the ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4294,59927258,59927585,I want the command that was used to invoke the...,Get the command that executed the python scrip...,"['python', 'python-3.x']",1,12789749.0,0,59927585,Use sys.executable and sys.argv:\nfrom __futur...,...,2,629836,2,629836,114,51,0,35,Get the command that executed the python scrip...,Sure! Here is a different question related to ...
14554,40414946,40415157,Why does the following result in an array with...,Strange result from String.Split(),['powershell'],1,2063755.0,3,40415157,It splits the string for each character in the...,...,2,332188,2,332188,2974,444,201,295,Strange result from String.Split() Why does th...,Why are there 7 elements in the array when usi...
11443,13311303,13312830,"I'm making an application with sql database,\n...",Result type of Linq expression,"['c#', 'linq']",1,1489627.0,0,13312830,You could use this as suggested:\nprivate void...,...,2,35165,0,35165,2695,105,8,446,Result type of Linq expression I'm making an a...,What is the proper method to access and utiliz...
18384,889054,889137,I have a C++ dll that I need to call from C#. ...,Calling DLL function with char* param from C#?,['c#'],1,110044.0,4,889137,Just using strings will work fine for input pa...,...,2,5354,2,5354,1364,347,3,166,Calling DLL function with char* param from C#?...,Another related question that you may have is:...


In [11]:
# Generar embeddings para cada pregunta
subset_df['Embeddings'] = subset_df['paraphrased'].apply(generate_embedding)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['Embeddings'] = subset_df['paraphrased'].apply(generate_embedding)


In [12]:
subset_df.to_csv('test_data_met1_embed_V2.csv', index=False)