In [None]:
# !pip install simpletransformers

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel
from sklearn.preprocessing import MultiLabelBinarizer
import datetime
import numpy as np
from functools import partial
import sklearn 
from scipy import stats
pd.set_option('display.max_colwidth', None)

## Prepare data

In [None]:
# Names of the dataset columns that hold the topic labels and the input text.
topics_col, text_col = 'github_topics_top', 'input_text_freq'

# Load the train and test CSV files (no splitting is performed in this cell),
# then preview the first two training rows.
train_df = pd.read_csv(filepath_or_buffer='../data/repos_multihot_train.csv')
test_df = pd.read_csv(filepath_or_buffer='../data/repos_multihot_test.csv')

train_df.head(n=2)

In [None]:
# After a round-trip through CSV the "labels" column holds strings such as
# "[0 1 0]" instead of lists, which cannot be fed to the model below (it fails
# with a "too many dimension str" error). Turn every cell back into a list of
# ints, using one shared helper for both data frames.
def _parse_label_vector(raw):
    """Convert one serialized label vector into a list of ints.

    Args:
        raw: Either the string form of a label vector (e.g. "[0 1 0]" or
            "[0, 1, 0]", possibly wrapped over several lines) or an
            already-parsed iterable of numbers.

    Returns:
        list[int]: The label vector as plain Python ints.

    Raises:
        ValueError: If a token cannot be converted to int.
    """
    if not isinstance(raw, str):
        # Already parsed (e.g. this cell was re-run): only normalise to ints.
        return [int(flag) for flag in raw]
    # Splitting on any whitespace (instead of a single space) avoids empty
    # tokens when values are separated by several spaces or by newlines.
    tokens = raw.strip().strip('][').replace(',', ' ').split()
    return [int(token) for token in tokens]


train_df["labels"] = train_df["labels"].apply(_parse_label_vector)
test_df["labels"] = test_df["labels"].apply(_parse_label_vector)

In [None]:
# Per-label weights used to balance the data: the most frequent label gets
# weight 1 and rarer labels get proportionally larger weights.
# Every column other than 'text' and 'labels' is summed as a label indicator.
# NOTE(review): Index.difference returns the column names sorted by default;
# confirm that this order matches the positions inside the "labels" vectors,
# since `w` is later passed positionally as pos_weight.
cols = train_df.columns.difference(['text','labels'])
freq = np.sum(train_df[cols], axis = 0)
# A label that never occurs in the training data would yield max/0 = inf;
# count it as a single occurrence so that every weight stays finite.
w = freq.max() / freq.clip(lower=1)

## Train the model

In [None]:
# Where checkpoints and reports are written, and how many labels are predicted.
output_name = 'outputs/github220t_epoch6_disitillbert'
num_selected_labels = 220

# Training options handed to the model through its `args` parameter.
train_args = {
    'gradient_accumulation_steps': 8,
    'learning_rate': 3e-5,
    'num_train_epochs': 6,
    'max_seq_length': 512,
    'train_batch_size': 4,
    'overwrite_output_dir': True,
    'output_dir': output_name + '/',
    "n_gpu": 1,
    'reprocess_input_data': True,
}

# Change the model type and name/path here:
# bert, distilbert, albert, roberta, xlm, xlnet
model = MultiLabelClassificationModel(
    'distilbert',
    'distilbert-base-uncased',
    num_labels=num_selected_labels,
    use_cuda=True,
    cuda_device=0,
    pos_weight=list(w),
    args=train_args,
)

In [None]:
# Train the model on the training frame and print the start and end
# timestamps (day/month/year-hour:minute) of the run.
timestamp_format = '%d/%m/%Y-%H:%M'
start_training = datetime.datetime.today().strftime(timestamp_format)
model.train_model(train_df)
end_training = datetime.datetime.today().strftime(timestamp_format)
print(start_training, end_training)

## Evaluation

In [None]:
#evaluation
def calc(p1, p2, func, **kwargs):
    """Binarise the predictions and evaluate `func` on them.

    Args:
        p1: First argument forwarded unchanged to `func`.
        p2: Iterable of rows of scores; every score strictly greater than
            0.5 becomes 1, everything else becomes 0.
        func: Callable invoked as ``func(p1, binarised_p2, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded to `func`.

    Returns:
        Whatever `func` returns.
    """
    binarised = [
        [1 if score > 0.5 else 0 for score in row]
        for row in p2
    ]
    return func(p1, binarised, **kwargs)

def calc_recom(p1, p2, func, **kwargs):
    """Evaluate `func` on the untouched inputs.

    Unlike `calc`, the second argument is passed through as-is (no
    thresholding), so `func` receives the original scores.

    Args:
        p1: First argument forwarded to `func`.
        p2: Second argument forwarded to `func`.
        func: Callable invoked as ``func(p1, p2, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded to `func`.

    Returns:
        Whatever `func` returns.
    """
    result = func(p1, p2, **kwargs)
    return result

def success_rate(y_original, y_pred):
    """Fraction of samples with at least one correctly predicted label.

    A sample counts as a success when the element-wise product of its true
    label row and its predicted row has a positive sum.

    Args:
        y_original: 2-D array-like of true label indicators (lists or
            numpy arrays are both accepted).
        y_pred: 2-D array-like of predicted label indicators with the same
            row layout.

    Returns:
        float: Number of successful samples divided by ``len(y_pred)``;
        0.0 when `y_pred` is empty.
    """
    truth = np.asarray(y_original)
    pred = np.asarray(y_pred)
    if len(pred) == 0:
        # Nothing was predicted: avoid dividing by zero.
        return 0.0
    # Converting to arrays makes the product element-wise even when both
    # inputs are plain Python lists (list * list would raise TypeError).
    overlap = np.sum(truth[:len(pred)] * pred, axis=1)
    return float(np.count_nonzero(overlap > 0) / len(pred))

def coverage(y_original,y_pred):
    """Fraction of labels that are predicted for at least one sample.

    The number of labels is taken from the width of `y_pred` rather than
    from a notebook-level global, so the function does not depend on
    another cell having been executed.

    Args:
        y_original: Unused; kept so the function has the same
            ``(truth, prediction)`` signature as the other metrics.
        y_pred: 2-D array-like of predicted label indicators
            (rows are samples, columns are labels).

    Returns:
        float: Count of columns whose sum is positive divided by the number
        of columns; 0.0 when `y_pred` is empty or not two-dimensional.
    """
    pred = np.asarray(y_pred)
    if pred.ndim != 2 or pred.shape[1] == 0:
        return 0.0
    per_label_total = np.sum(pred, axis=0)
    covered = np.count_nonzero(per_label_total > 0)
    return float(covered / pred.shape[1])

def prf_at_k(y_original, y_pred_probab):
    """Recall, precision, F1 and success rates of the top-k predicted labels.

    For every k in [1, 2, 3, 5, 8, 10] the k highest-scoring labels of each
    sample are compared with its true labels:

    * precision = hits / k
    * recall    = hits / min(k, number of true labels)
    * F1        = harmonic mean of precision and recall, per sample
    * S1        = share of samples with at least 1 hit
    * S5        = samples with at least 5 hits, relative to the number of
                  samples that have at least 5 true labels

    Args:
        y_original: 2-D array-like of true label indicators
            (rows are samples, columns are labels).
        y_pred_probab: 2-D array-like of predicted scores, same layout.

    Returns:
        tuple[dict, dict, dict, dict, dict]: ``(r, p, f, s1, s5)`` keyed by
        'R@k', 'P@k', 'F1@k', 'S1@k' and 'S5@k'. Values are percentages
        formatted as strings with two decimals.
    """
    def as_percent(fraction):
        return "{:.2f}".format(fraction * 100)

    y_true = np.asarray(y_original)
    scores = np.asarray(y_pred_probab)
    k_list = [1, 2, 3, 5, 8, 10]
    s1, s5 = {}, {}
    r, p, f = {}, {}, {}

    if y_true.ndim != 2 or y_true.size == 0:
        # No samples to score: report zeros instead of dividing by zero.
        for k in k_list:
            s1['S1@' + str(k)] = as_percent(0.0)
            s5['S5@' + str(k)] = as_percent(0.0)
            r['R@' + str(k)] = as_percent(0.0)
            p['P@' + str(k)] = as_percent(0.0)
            f['F1@' + str(k)] = as_percent(0.0)
        return r, p, f, s1, s5

    n_samples, n_labels = y_true.shape
    # Loop-invariant quantities, computed once.
    org_label_count = np.sum(y_true, axis=1)
    repo_5_tags = np.count_nonzero(org_label_count >= 5)
    row_index = np.arange(n_samples)[:, None]

    for k in k_list:
        # argpartition needs kth within bounds, so never ask for more
        # labels than exist; precision is still divided by the nominal k.
        top = min(k, n_labels)
        top_ind = np.argpartition(scores, -top, axis=1)[:, -top:]
        common_topk = np.sum(y_true[row_index, top_ind], axis=1)

        precision = common_topk / k
        attainable = np.minimum(k, org_label_count)
        # Samples without any true label get recall 0 instead of a
        # division by zero.
        recall = np.divide(common_topk, attainable,
                           out=np.zeros(n_samples), where=attainable > 0)
        # Harmonic mean of precision and recall, defined as 0 when both
        # are 0.
        pr_sum = precision + recall
        f1 = np.divide(2 * precision * recall, pr_sum,
                       out=np.zeros(n_samples), where=pr_sum > 0)

        success1 = np.count_nonzero(common_topk >= 1)
        success5 = np.count_nonzero(common_topk >= 5)

        s1['S1@' + str(k)] = as_percent(success1 / n_samples)
        # When no sample has 5 or more true labels, S5 is reported as 0.
        s5['S5@' + str(k)] = as_percent(success5 / repo_5_tags if repo_5_tags else 0.0)
        r['R@' + str(k)] = as_percent(np.mean(recall))
        p['P@' + str(k)] = as_percent(np.mean(precision))
        f['F1@' + str(k)] = as_percent(np.mean(f1))
    return r, p, f, s1, s5

# Extra metrics keyed by the name under which they are reported.
# Every entry is a callable taking (true labels, predictions) — presumably
# invoked that way by model.eval_model; confirm against the library.
#   * `calc` thresholds the predictions at 0.5 first, which is what the
#     set-based metrics (F1, precision, recall, Hamming loss, ...) expect.
#   * `calc_recom` forwards the predictions untouched. The ranking/score
#     metrics (label_ranking_loss, roc_auc_score, coverage_error,
#     average_precision_score) take continuous scores as their second
#     argument, so they must not receive 0/1 values: thresholding collapses
#     the ranking into ties and distorts the result.
metrics_recom = {
    "Success_Rate": partial(calc,func=success_rate),
    "Coverage": partial(calc,func=coverage),
    "LRL": partial(calc_recom,func=sklearn.metrics.label_ranking_loss),
    "AUC_micro": partial(calc_recom,func=sklearn.metrics.roc_auc_score, average='micro'),
    "AUC_macro": partial(calc_recom,func=sklearn.metrics.roc_auc_score, average='macro'),
    "AUC_wighted": partial(calc_recom,func=sklearn.metrics.roc_auc_score, average='weighted'),
    "Coverage_err": partial(calc_recom,func=sklearn.metrics.coverage_error),
    "Avg_P_score_micro": partial(calc_recom,func=sklearn.metrics.average_precision_score, average='micro'),
    "Avg_P_score_macro": partial(calc_recom,func=sklearn.metrics.average_precision_score, average='macro'),
    "R@k": partial(calc_recom,func=prf_at_k),
    "f1_micro": partial(calc,func=sklearn.metrics.f1_score,average='micro'),
    "f1_macro": partial(calc,func=sklearn.metrics.f1_score,average='macro'),
    "f1_weighted": partial(calc,func=sklearn.metrics.f1_score,average='weighted'),
    "f1_samples": partial(calc,func=sklearn.metrics.f1_score,average='samples'),
    "prec_micro": partial(calc,func=sklearn.metrics.precision_score,average='micro'),
    "prec_macro": partial(calc,func=sklearn.metrics.precision_score,average='macro'),
    "prec_weighted": partial(calc,func=sklearn.metrics.precision_score,average='weighted'),
    "prec_samples": partial(calc,func=sklearn.metrics.precision_score,average='samples'),
    "recall_micro": partial(calc,func=sklearn.metrics.recall_score,average='micro'),
    "recall_macro": partial(calc,func=sklearn.metrics.recall_score,average='macro'),
    "recall_weighted": partial(calc,func=sklearn.metrics.recall_score,average='weighted'),
    "recall_samples": partial(calc,func=sklearn.metrics.recall_score,average='samples'),
    "hamming_loss": partial(calc,func=sklearn.metrics.hamming_loss),
    "exact_match_ratio": partial(calc,func=sklearn.metrics.accuracy_score)
}

In [None]:
# Evaluate on the test frame; each entry of metrics_recom is forwarded as a
# keyword argument so that it is computed alongside the default metrics.
eval_results, model_outputs, wrong_predictions = model.eval_model(
    test_df,
    verbose=True,
    **metrics_recom
)

In [None]:
# Save the string form of the evaluation results next to the model outputs.
report_path = output_name + '/fullreport.txt'
with open(report_path, 'w') as report_file:
    report_file.write(str(eval_results))

In [None]:
# Prediction for a sample input: print the predicted labels first and the
# raw model outputs on the following line.
sample_inputs = ['python python test java javascript meditation']
predictions, raw_outputs = model.predict(sample_inputs)
print(predictions, raw_outputs, sep='\n')