# In [None]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from openai import AzureOpenAI, RateLimitError
import concurrent.futures
import random
import time
import nltk
import string
from dotenv import load_dotenv
# Fetch the WordNet corpus; clean_text() below lemmatizes with WordNetLemmatizer.
nltk.download('wordnet')

# NOTE(review): presumably set to silence the Hugging Face tokenizers
# parallelism warning when worker threads are used later — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Load environment variables (such as the AZURE_* values read below) from a .env file.
load_dotenv()

# File used to cache the sentence embeddings between runs.
embeddings_file = 'embeddings.npy'
deployment_name='gpt4Test'  # Azure deployment name; also stored as "model_name" in parameters.txt
# Passed as max_tokens to every chat completion request.
max_tokens=300

# Maximum number of similar issues passed into the AI prompt.
max_similar_issues = 3

# Inclusive cosine-similarity range an issue must fall in to count as similar.
min_similarity_score = 0.9
max_similarity_score = 1.0

# Number of randomly picked prompts sent to the model in each iteration.
random_rows_number = 6

# How many times the script runs with the same parameters:
# model_name (deployment_name), max_tokens, max_similar_issues,
# min_similarity_score, max_similarity_score, random_rows_number.
iterations = 1

# Azure OpenAI client; key, API version and endpoint are read from the environment.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_API_KEY"),  
    api_version=os.getenv("AZURE_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_ENDPOINT")
)

def random_string(length=6):
    """Return a string of `length` randomly chosen lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def create_result_directory(max_tokens, min_similarity_score, max_similarity_score, deployment_name):
    """Create a uniquely named run folder under "ai_assistant_results".

    The folder name is built from the deployment name, the token limit and the
    similarity bounds, followed by a random suffix from random_string().

    Returns:
        The path of the newly created folder.

    Raises:
        FileExistsError: if a folder with the generated name already exists.
    """
    run_name = f"{deployment_name}_{max_tokens}_{min_similarity_score}_{max_similarity_score}_{random_string()}"
    target_path = os.path.join("ai_assistant_results", run_name)
    os.makedirs(target_path)
    return target_path

def save_parameters(folder_name, deployment_name, max_tokens, min_similarity_score, max_similarity_score):
    """Write the run parameters to `parameters.txt` inside `folder_name`.

    Each parameter is written on its own line as "name: value".
    """
    lines = [
        f"model_name: {deployment_name}\n",
        f"max_tokens: {max_tokens}\n",
        f"min_similarity_score: {min_similarity_score}\n",
        f"max_similarity_score: {max_similarity_score}\n",
    ]
    target = os.path.join(folder_name, "parameters.txt")
    with open(target, "w") as handle:
        handle.writelines(lines)

def save_results(folder_name, dataframe):
    """Write `dataframe` to `result.csv` inside `folder_name`, without the index column."""
    csv_path = os.path.join(folder_name, "result.csv")
    dataframe.to_csv(csv_path, index=False)

def get_random_rows_from_df(df, n=13):
    """Return up to `n` distinct rows of `df`, picked at random by position.

    When `n` exceeds the number of rows, every row is returned (in random order).
    """
    row_count = len(df)
    # Never ask for more rows than the frame holds.
    sample_size = row_count if row_count < n else n
    chosen_positions = random.sample(range(row_count), sample_size)
    return df.iloc[chosen_positions]

def generate_chat_gpt_query(prompt, retries=1, retry_delay=60):
    """Send `prompt` to the configured Azure OpenAI deployment and return the reply text.

    Args:
        prompt: Text sent as the single user message.
        retries: Number of additional attempts made after a rate-limit error.
            A negative value means no request is sent at all.
        retry_delay: Seconds to wait before retrying after a rate-limit error.

    Returns:
        The stripped content of the first choice. "No suggestion generated."
        is returned when the response has no choices or the first choice has
        no content, and "No suggestion generated" when every attempt hit the
        rate limit (or `retries` was negative).
    """
    attempts_left = retries
    while attempts_left >= 0:
        try:
            response = client.chat.completions.create(
                model=deployment_name,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=0,
                n=1,
                stop=None
            )
        except RateLimitError:
            if attempts_left == 0:
                # Last attempt failed: give up right away instead of sleeping
                # for a retry that will never happen.
                print("Rate limit exceeded. No retries left.")
                break
            print(f"Rate limit exceeded. Retrying after {retry_delay} seconds...")
            time.sleep(retry_delay)
            attempts_left -= 1
            continue

        if not response.choices:
            return "No suggestion generated."

        content = response.choices[0].message.content
        # The SDK types message content as optional, so guard before .strip().
        if content is None:
            return "No suggestion generated."
        return content.strip()

    return "No suggestion generated"

def generate_prompt(current_issue_description, similar_issues):
    """Build the estimation prompt for one issue.

    Args:
        current_issue_description: Text of the issue to be estimated.
        similar_issues: Iterable of (issue_key, similarity, estimation,
            description) tuples; `similarity` is a fraction and is rendered
            as a percentage with two decimals.

    Returns:
        The complete prompt text.
    """
    sections = [
        f"**Current Issue Description:**\n_{current_issue_description}_\n\n",
        "**Similar Issues and Their Estimations in Story Points:**\n",
    ]

    for issue_key, similarity, estimation, description in similar_issues:
        sections.append(
            f"Issue Key: {issue_key}\n"
            f"Similarity to Current Issue: {similarity * 100:.2f}%\n"
            f"Estimation: {estimation} Story Points\n"
            f"Description: {description}\n"
        )

    sections.append("\n**Prompt for Estimation and Effort Opinion:**\n")
    sections.append("Based on the similarities and differences between the current issue and the provided similar issues, please provide:\n")
    sections.append("1. **Estimation in Story Points:** _(Provide a numerical estimation)_\n")
    sections.append("2. **Effort Opinion:** _(Provide an opinion on the effort required to address the current issue, considering factors such as complexity, scope, and any notable similarities or differences with the similar issues)_")

    return "".join(sections)

def find_similar_issues(similarity_matrix, dataset, current_issue_index, min_similarity_score, max_similar_issues, max_similarity_score):
    """Return the issues most similar to the one at `current_issue_index`.

    Candidates are the positions whose similarity to the current issue lies in
    [min_similarity_score, max_similarity_score], excluding the current issue
    itself. They are ordered by descending similarity and truncated to
    `max_similar_issues` entries.

    Returns:
        A list of (issuekey, similarity, storypoint, text) tuples read from
        `dataset` by position.
    """
    scores = similarity_matrix[current_issue_index]
    within_range = (scores >= min_similarity_score) & (scores <= max_similarity_score)
    candidates = np.where(within_range)[0]
    # An issue is always maximally similar to itself; leave it out.
    candidates = candidates[candidates != current_issue_index]
    descending = np.argsort(scores[candidates])[::-1]
    best_matches = candidates[descending][:max_similar_issues]

    def describe(position):
        row = dataset.iloc[position]
        return (row['issuekey'], scores[position], row['storypoint'], row['text'])

    return [describe(position) for position in best_matches]

def generate_embeddings(model, dataset, batch_size=8):
    """Encode the `cleaned_text` column of `dataset` in batches.

    Each slice of `batch_size` texts is passed to `model.encode`, and the
    per-batch results are concatenated along the first axis.
    """
    encoded_batches = [
        model.encode(dataset['cleaned_text'].iloc[start:start + batch_size].values)
        for start in range(0, len(dataset), batch_size)
    ]
    return np.concatenate(encoded_batches)

def load_or_generate_embeddings(model, dataset, embeddings_file, batch_size=8):
    """Return embeddings for `dataset`, using `embeddings_file` as a cache.

    If the file exists its contents are loaded and returned. Otherwise the
    embeddings are generated with generate_embeddings(), saved to the file
    and returned.
    """
    if not os.path.exists(embeddings_file):
        print("Generating embeddings...")
        fresh = generate_embeddings(model, dataset, batch_size)
        np.save(embeddings_file, fresh)
        return fresh

    print("Loading embeddings from file...")
    return np.load(embeddings_file)

def clean_text(text):
    """Normalize raw issue text before it is embedded.

    Steps, in order: strip HTML markup, remove tokens starting with "http",
    lowercase, drop every character that is not a letter, digit or
    whitespace, remove English stop words, and lemmatize the remaining words.
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    without_urls = re.sub(r'http\S+', '', plain)
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', without_urls.lower())

    lemmatizer = WordNetLemmatizer()
    kept_words = (word for word in normalized.split() if word not in ENGLISH_STOP_WORDS)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

# Load the issues; the code below reads the 'storypoint', 'title' and
# 'description' columns, and later code also reads 'issuekey'.
dataset = pd.read_csv("dataset.csv")

# Keep issues estimated at 8 story points or fewer, and drop the estimate
# values listed here.
excluded_story_points = (0.1, 0.6, 1.5, 4.0, 7.0)
keep_rows = dataset['storypoint'] <= 8
for excluded_value in excluded_story_points:
    keep_rows &= dataset['storypoint'] != excluded_value
# .copy() gives an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
dataset = dataset.loc[keep_rows].copy()

# Issue text is the title, followed by the description when one is present.
dataset['text'] = dataset.apply(lambda row: row['title'] + ' ' + row['description'] if pd.notnull(row['description']) else row['title'], axis=1)
dataset['cleaned_text'] = dataset['text'].apply(clean_text)

model = SentenceTransformer('Collab-uniba/github-issues-preprocessed-mpnet-st-e10')
embeddings = load_or_generate_embeddings(model, dataset, embeddings_file)
# The embeddings may come from a cache file written for a different dataset
# or filter; rows are matched to embeddings by position, so the counts must agree.
if embeddings.shape[0] != len(dataset):
    raise ValueError(
        f"Embeddings in '{embeddings_file}' have {embeddings.shape[0]} rows but the "
        f"dataset has {len(dataset)}; delete the file to regenerate the embeddings."
    )
print("Embeddings shape:", embeddings.shape)

# Pairwise cosine similarity between all issue embeddings.
similarity_matrix = cosine_similarity(embeddings)

# Find similar issues for each issue in the dataset. The result depends only
# on the dataset and the similarity settings, so it is computed once instead
# of on every iteration.
all_similar_issues = {}
for i in range(len(dataset)):
    all_similar_issues[i] = find_similar_issues(similarity_matrix, dataset, i, min_similarity_score, max_similar_issues, max_similarity_score)

# Build one prompt for every issue that has at least one similar issue.
data = []
for i, similar_issues in all_similar_issues.items():
    if not similar_issues:
        continue
    current_issue = dataset.iloc[i]
    prompt = generate_prompt(current_issue['text'], similar_issues)
    data.append([current_issue['issuekey'], current_issue['text'], current_issue['storypoint'], prompt])

new_df = pd.DataFrame(data, columns=['issuekey', 'text', 'storypoint', 'prompt'])

# Number of prompts sent to the model concurrently.
batch_size = 2

for _ in range(iterations):
    # .copy() so the 'ai_response' column is added to an independent frame
    # rather than to a slice of new_df.
    rows_to_proceed = get_random_rows_from_df(new_df, random_rows_number).copy()

    batches = [rows_to_proceed['prompt'].iloc[i:i+batch_size] for i in range(0, len(rows_to_proceed), batch_size)]

    # Iterate over each batch and generate AI responses
    for batch in batches:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Executor.map returns results in the order of its inputs, so each
            # response stays aligned with the prompt (and row) it belongs to.
            # Collecting with as_completed would follow completion order and
            # could attach a response to the wrong row.
            responses = list(executor.map(generate_chat_gpt_query, batch.tolist()))

        # Update DataFrame with AI responses
        for row_label, response in zip(batch.index, responses):
            rows_to_proceed.at[row_label, 'ai_response'] = response

    # Each iteration gets its own result folder holding the parameters and results.
    result_folder = create_result_directory(max_tokens, min_similarity_score, max_similarity_score, deployment_name)

    # Save parameters
    save_parameters(result_folder, deployment_name, max_tokens, min_similarity_score, max_similarity_score)

    # Save results
    save_results(result_folder, rows_to_proceed)

