In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from openai import OpenAI

In [None]:
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')
test_df_met1 = pd.read_csv('test_data_met1.csv')

In [None]:
# Concatenar QuestionTitle y QuestionBody
train_df["QuestionText"] = train_df["QuestionTitle"] + " " + train_df["QuestionBody"]
test_df["QuestionText"] = test_df["QuestionTitle"] + " " + test_df["QuestionBody"]
test_df_met1["QuestionText"] = test_df_met1["QuestionTitle"] + " " + test_df_met1["QuestionBody"]

In [None]:
# Inicializar el modelo con control de errores
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

def generate_embedding(text):
    try: 
        return model.encode(text).tolist()
    except Exception:
        print("Opps! Question: {}".format(text))
        return "embedding_error"

In [None]:
# Generar embeddings para cada pregunta
train_df['Embeddings'] = train_df['QuestionText'].apply(generate_embedding)

In [None]:
train_df.to_csv('test_data_embed.csv', index=False)

In [None]:
# Inicializar el modelo de parafraseo t5-large
paraphrase_model_name = 't5-large'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Mover el modelo a la GPU
paraphrase_model = T5ForConditionalGeneration.from_pretrained(paraphrase_model_name)
paraphrase_model.to(device)
paraphrase_tokenizer = T5Tokenizer.from_pretrained(paraphrase_model_name)

# Función para parafrasear texto
def paraphrase_text(text, model, tokenizer):
    encoding = tokenizer.encode_plus(text, return_tensors="pt", max_length=1024, truncation=True)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    # Mover los tensores a la GPU
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        num_beams=8,
        num_return_sequences=1,
        temperature=1.0,
    )

    paraphrased_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrased_text


# Filtrar para obtener textos con al menos 80 caracteres
filtered_df = test_df_met1[test_df_met1['QuestionText'].str.len() >= 200]

# Ordenar el DataFrame por la longitud de `QuestionText` de menor a mayor
sorted_df = filtered_df.sort_values(by='QuestionText', key=lambda x: x.str.len())

# Seleccionar los primeros `n` elementos
n = 3
subset_df = sorted_df.head(n)

# Aplicar la función de parafraseo solo a estos `n` elementos
subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text(x, paraphrase_model, paraphrase_tokenizer))



In [67]:
#Definir modelo OPENAI
client = OpenAI(api_key='')

def paraphrase_text_openai(text):
    response = client.chat.completions.create(
    model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Paraphrase following text in original language with significant changes in structure: {text}"}
            ],
    max_tokens=180,  # Ajusta el número máximo de tokens en la respuesta
    temperature=1.5,  # Controla la creatividad de la respuesta
    top_p=1
    )
    paraphrased_text = response.choices[0].message.content
    return paraphrased_text

# Seleccionar los primeros `n` elementos con al menos 100 caracteres
n = 1000
filtered_df = test_df_met1[test_df_met1['QuestionText'].str.len() >= 100]
sorted_df = filtered_df.sort_values(by='QuestionText', key=lambda x: x.str.len())
subset_df = sorted_df.head(n)

# Aplicar la función de parafraseo solo a estos `n` elementos
subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text_openai(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['paraphrased'] = subset_df['QuestionText'].apply(lambda x: paraphrase_text_openai(x))


In [68]:
# Generar embeddings para cada pregunta
subset_df['Embeddings'] = subset_df['QuestionText'].apply(generate_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['Embeddings'] = subset_df['QuestionText'].apply(generate_embedding)


In [69]:
subset_df.to_csv('test_data_met1_embed.csv', index=False)

In [None]:
# Generar embeddings para cada pregunta
test_df['Embeddings'] = test_df['QuestionText'].apply(generate_embedding)

In [None]:

test_df.to_csv('test_data_embed.csv', index=False)
