# CFFitST Template

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from collections import defaultdict
from sentence_transformers.losses import CosineSimilarityLoss
from CFFit import CFFitST, ClassificationHead

In [None]:
# Experiment configuration
BASE_MODEL = "all-mpnet-base-v2"  # pretrained sentence transformer model
RANDOM_SEED = 23  # random seed
OUTPUT_PATH = 'output'  # output directory
FRACTION = 1  # fraction of the training data sampled for fitting
# minimum cosine similarity between the embeddings of a positive example
# for it to count as correctly classified
THRES_POS = 0.95
# maximum cosine similarity between the embeddings of a negative example
# for it to count as correctly classified
THRES_NEG = 0.05
LEARNING_RATE = 0.001  # learning rate passed to the classification head's fit()

import torch

# Select the CUDA device. Availability is checked *before* touching the CUDA
# runtime: calling torch.cuda.set_device() on a machine without CUDA raises,
# which would make the CPU fallback unreachable.
device_num = 3
if torch.cuda.is_available():
    DEVICE = "cuda:" + str(device_num)
    torch.cuda.set_device(device_num)
else:
    DEVICE = "cpu"

In [None]:
# read test and train data
# load the train split first, then the test split
train_set, test_set = (
    pd.read_csv("data/issues_train.csv"),
    pd.read_csv("data/issues_test.csv"),
)

In [None]:
# Repositories present in the training data, in order of first appearance.
# unique() already removes duplicates; routing the values through set() would
# only leave the order arbitrary, and that order decides which repository is
# processed first and how the results are listed.
repos = train_set["repo"].unique().tolist()
print(repos)

In [None]:
# number of training issues per (repository, label) pair, shown as a
# repository x label table; missing combinations are displayed as 0
label_counts = train_set.groupby(["repo", "label"]).size()
label_counts.unstack(fill_value=0)

In [None]:
from keras.utils import to_categorical

# prepare input as the sum of title and body
def process_dataset(df):
    """Build the model input text and keep only the columns used downstream.

    The ``text`` value of every row is ``str(title) + " " + str(body)``, so a
    missing title or body appears the way ``str`` renders it (e.g. ``"nan"``).

    Args:
        df: DataFrame with ``title``, ``body``, ``label`` and ``repo`` columns.

    Returns:
        A new DataFrame with the columns ``text``, ``label`` and ``repo``.
        The input frame is left unmodified.
    """
    # Build the texts positionally instead of writing them row by row through
    # index labels: label-based writes overwrite each other when index labels
    # repeat and never create the column for an empty frame.
    texts = [
        str(title) + " " + str(body)
        for title, body in zip(df["title"], df["body"])
    ]
    return df.assign(text=texts)[["text", "label", "repo"]]
    
# replace both splits by their processed (text, label, repo) versions
train_set = process_dataset(train_set)
test_set = process_dataset(test_set)

# function to return labels
# class name -> class index
dic_labels = {"feature": 0, "bug": 1, "question": 2}


def get_labels(data_set):
    """Return the one-hot encoded labels of ``data_set``.

    Each value of the ``label`` column is looked up in ``dic_labels`` (an
    unknown label raises ``KeyError``) and the resulting class indices are
    passed to ``to_categorical`` with three classes.
    """
    class_ids = [dic_labels[name] for name in data_set["label"]]
    return to_categorical(class_ids, num_classes=3)
    
# get input and labels from df
def get_x_y(df):
    """Split ``df`` into model inputs and targets.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the ``text`` column as a list and
        ``y`` is the result of ``get_labels(df)``.
    """
    texts = df["text"].to_list()
    targets = get_labels(df)
    return texts, targets

In [None]:
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Function to generate a classification report within execution
# class name -> class index
dic_labels = {"feature": 0, "bug": 1, "question": 2}


def class_report(y_true, y_pred, name_repo):
    """Print a classification report, plot the confusion matrix, return metrics.

    Args:
        y_true: 2-D array of true labels with one column per class (e.g.
            one-hot encoded); the class of a row is its argmax along axis 1.
        y_pred: 2-D array of predicted scores with one column per class; the
            predicted class of a row is its argmax along axis 1.
        name_repo: name printed above the report and used as the plot title.

    Returns:
        The dictionary produced by ``classification_report`` with
        ``output_dict=True``.
    """
    class_names = list(dic_labels.keys())
    class_ids = list(dic_labels.values())

    # Collapse the per-class columns to a single class index per sample
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_true, axis=1)

    # Text report for the notebook output
    report = classification_report(y_true_classes, y_pred_classes)
    print(name_repo)
    print(report)

    # Pin the label set explicitly: without it the matrix only contains the
    # classes that occur in y_true or y_pred, and its rows/columns would no
    # longer line up with the fixed tick labels used below.
    matrix_confusion = confusion_matrix(
        y_true_classes, y_pred_classes, labels=class_ids
    )

    # Heatmap for the confusion matrix
    plt.figure(figsize=(4, 3))
    sns.heatmap(matrix_confusion, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(name_repo)
    plt.show()

    # NOTE: scikit-learn ignores `digits` when output_dict=True; the returned
    # values are not rounded.
    return classification_report(
        y_true_classes, y_pred_classes, output_dict=True, digits=4
    )

In [None]:
from CFFit import CFFitST, ClassificationHead

# per-repository results: results[repo] = {'metrics': ..., 'predictions': ...}
results = defaultdict(dict)
for repo in repos:
    # restrict both splits to the issues of the current repository
    train_set_repo = train_set[train_set["repo"] == repo]
    test_set_repo = test_set[test_set["repo"] == repo]

    # start from a fresh pretrained model for every repository and move it
    # to the selected device
    cff_model = CFFitST.from_pretrained(BASE_MODEL)
    cff_model.to(DEVICE)

    # name under which this run is stored, e.g. "<owner>_<repo>_baseline_<fraction>"
    run_name = repo.replace("/", "_") + "_baseline" + "_" + str(FRACTION)
    cff_model.fit(
        train_set_repo.sample(frac=FRACTION, random_state=RANDOM_SEED),
        ["bug", "feature", "question"],
        random_state=RANDOM_SEED,
        epochs=3,
        validation_data=0.1,
        chunk_size=0.2,
        positive_threshold=THRES_POS,
        negative_threshold=THRES_NEG,
        chunks_reviewed=3,
        batch_size=32,
        min_chunk_size=0.25,
        verbose=False,
        save_path=OUTPUT_PATH,
        name=run_name,
    )

    # classification head on top of the fitted model; it is moved to the same
    # device as the CFFitST object to avoid CUDA crashes
    cff_head = ClassificationHead(cff_model)
    cff_head.to(DEVICE)

    # same fraction and seed as for the model fit above, so the same rows are drawn
    x, y = get_x_y(train_set_repo.sample(frac=FRACTION, random_state=RANDOM_SEED))
    cff_head.fit(x, y, epochs=40, learning_rate=LEARNING_RATE)
    y_pred = cff_head.predict(test_set_repo['text'])

    # evaluate against the one-hot test labels and keep the raw predictions
    metrics = class_report(get_labels(test_set_repo), y_pred, repo)
    results[repo]['metrics'] = metrics
    results[repo]['predictions'] = y_pred.tolist()
    
    

In [None]:
import json
# Store the class-name -> class-index mapping so the numeric class keys in the
# metrics can be interpreted. Merely reading results['label_mapping'] without
# setting it makes the defaultdict insert (and print) an empty dict.
results['label_mapping'] = dic_labels
print(results['label_mapping'])
for repo in repos:
    print(repo)
    print(json.dumps(results[repo]['metrics'], indent=4))

In [None]:
# per-class entries of the classification report are keyed by the numeric
# class index; the first repository decides which classes are aggregated
labels = [key for key in results[repos[0]]['metrics'] if key.isnumeric()]

# sum every metric of every class over all repositories
class_metrics_sum = defaultdict(defaultdict)
for repo in repos:
    repo_metrics = results[repo]['metrics']
    for label in labels:
        running = class_metrics_sum[label]
        for metric, value in repo_metrics[label].items():
            running[metric] = running.get(metric, 0) + value

# turn the sums into per-class averages over the repositories
n_repos = len(repos)
class_metrics_avg = {}
for label in labels:
    class_metrics_avg[label] = {
        metric: total / n_repos
        for metric, total in class_metrics_sum[label].items()
    }

# add the average of each metric over all classes
average = {}
for metric in class_metrics_avg[labels[0]]:
    per_class = [class_metrics_avg[label][metric] for label in labels]
    average[metric] = sum(per_class) / len(labels)
class_metrics_avg['average'] = average

# add to the results
results['overall'] = {'metrics': class_metrics_avg}

In [None]:
import os
output_file_name = 'results.json'
# Make sure the output directory exists before writing; open() does not
# create missing parent directories.
os.makedirs(OUTPUT_PATH, exist_ok=True)
# Write all collected results as one JSON document.
with open(os.path.join(OUTPUT_PATH, output_file_name), 'w', encoding='utf-8') as fp:
    json.dump(results, fp)