# **Config Colab**

In [None]:
# Mount Google Drive at /content/drive; the data paths used further down
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Notebook shell escape (not plain Python): install the transformers package
# into the running environment.
! pip install transformers

**GPU**

In [None]:
import tensorflow as tf
# Ask TensorFlow for the GPU device name. As the last expression of the cell
# its value is shown by the notebook (presumably a quick check that a GPU
# runtime is attached -- the result is not stored).
tf.test.gpu_device_name()

import os
import numpy as np
import random as rn
import tensorflow as tf
from transformers import set_seed

# Single seed shared by every random number source used in this notebook.
seed = 1234
os.environ['PYTHONHASHSEED'] = '0'
# Seed NumPy, the stdlib `random` module, TensorFlow and transformers, in
# that order.
for _seed_fn in (np.random.seed, rn.seed, tf.random.set_seed, set_seed):
    _seed_fn(seed)

# **Fine Tuning**

## **config**

In [None]:
import csv
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, precision_score, recall_score, f1_score, accuracy_score
from tensorflow.keras.utils import to_categorical


def convert_digit_multi(input_labels, target_label, labels):
    """Encode labels as class indices with ``target_label`` mapped to 0.

    Every other entry of ``labels`` is numbered 1, 2, ... in the order it
    appears in ``labels`` (any number of other classes is supported).

    Args:
        input_labels: iterable of (hashable) label values to encode.
        target_label: the label that becomes class 0.
        labels: all known labels; the ones different from ``target_label``
            define classes 1..n.

    Returns:
        A list of ints. Items of ``input_labels`` that are neither the target
        nor listed in ``labels`` are skipped, so the result can be shorter
        than the input.
    """
    index_of = {target_label: 0}
    position = 0
    for label in labels:
        if label == target_label:
            continue
        position += 1
        # setdefault keeps the first position if a label is listed twice.
        index_of.setdefault(label, position)
    return [index_of[x] for x in input_labels if x in index_of]

def convert_digit(input_labels, target_label, labels):
    """Binarize labels: ``target_label`` becomes 0, any other known label 1.

    Items that are neither the target nor one of the remaining entries of
    ``labels`` are left out of the returned list.
    """
    negatives = [name for name in labels if name != target_label]
    return [
        0 if item == target_label else 1
        for item in input_labels
        if item == target_label or item in negatives
    ]

def evaluate_bi(test_label, pred_label, confusion_m=False):
    """Score binary predictions, treating label 1 as the positive class.

    When ``confusion_m`` is true the confusion matrix is printed first.
    Returns ``[precision, recall, f1, accuracy]``.
    """
    if confusion_m:
        print(confusion_matrix(test_label, pred_label))

    binary_options = {'average': 'binary', 'pos_label': 1}
    scores = [
        scorer(test_label, pred_label, **binary_options)
        for scorer in (precision_score, recall_score, f1_score)
    ]
    scores.append(accuracy_score(test_label, pred_label))
    return scores

def evaluate_multi(test_label, pred_label, avg, confusion_m=False):
    """Score multi-class predictions using the averaging mode ``avg``.

    ``avg`` is forwarded as the ``average`` argument of the precision, recall
    and F1 scorers. When ``confusion_m`` is true the confusion matrix is
    printed first. Returns ``[precision, recall, f1, accuracy]``.
    """
    if confusion_m:
        print(confusion_matrix(test_label, pred_label))

    scores = [
        scorer(test_label, pred_label, average=avg)
        for scorer in (precision_score, recall_score, f1_score)
    ]
    scores.append(accuracy_score(test_label, pred_label))
    return scores

def to_class_names(predictions, target_label):
    """Turn per-class scores into class indices and readable names.

    The arg-max along axis 1 picks the class index; index 0 is named
    ``target_label`` and index 1 ``'other'``.

    Returns:
        ``(indices, names)`` -- the NumPy array of indices and the matching
        list of class names.
    """
    names = [target_label, 'other']
    indices = tf.argmax(predictions, axis=1).numpy()
    return indices, [names[idx] for idx in indices]

def to_label_index(predictions):
    """Return the arg-max class index for each row of ``predictions.logits``.

    The logits are passed through a softmax before the arg-max along axis 1;
    the result is a NumPy array.
    """
    probabilities = tf.nn.softmax(predictions.logits)
    return tf.argmax(probabilities, axis=1).numpy()

## **train on reviews+issues**

In [None]:
# train on combination
import numpy as np
import tensorflow as tf
from sklearn.utils import compute_class_weight
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold


# --- Configuration for the "reviews + issues" experiment -------------------
labels = ['bug', 'feature']
datasets = [
    'guzman_dataset',
    'maalej_dataset',
    'jha_dataset',
    'surminer_dataset',
    'scalabrino_dataset',
]
data_path = "/content/drive/MyDrive/ColabNotebooks/data/inter_context/"

name = "jan_2"
# Column names are supplied explicitly and the first parsed row is sliced off
# (presumably the file's own header line -- TODO confirm). Missing cells
# become empty strings.
all_issues = pd.read_csv(
    data_path + f"z_all_issues_train_{name}.csv",
    names=['id', 'text', 'label_b', 'label_f'],
)[1:].fillna("")
# Both label columns are converted to Python ints.
all_issues = all_issues.assign(
    label_b=all_issues['label_b'].map(int),
    label_f=all_issues['label_f'].map(int),
)

# Model and training hyper-parameters.
deep_model = 'distilbert-base-uncased'  # checkpoint name for tokenizer and model
b_size = 16                             # batch size
l_r = 5e-5                              # Adam learning rate
epoch_no = 3                            # training epochs
min_delta = 0.0001                      # EarlyStopping min_delta
patience = 5                            # EarlyStopping patience
num_labels = 2                          # classifier output size
max_length = 50                         # tokenizer padding/truncation length
suffix = "f"                            # review CSVs are named "<dataset>_<suffix>.csv"

# Accumulators filled by the evaluation loop.
test_labels, pred_labels, test_sents = [], [], []
# Leave-one-dataset-out evaluation of a classifier fine-tuned on review
# datasets plus a sample of issues. For every target label and every held-out
# review dataset, k models are trained (one per stratified fold of the
# training pool) and each is scored on the held-out dataset.

# The tokenizer does not depend on label, dataset or fold: load it once.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

for target_label in labels:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_'+target_label[0]
    for test_dataset_name in datasets:

        # Fresh accumulators for this held-out dataset.
        test_labels = []
        pred_labels = []
        test_sents = []

        print("***************************************************")
        print('test on: ', test_dataset_name)

        file_path = data_path+f"{test_dataset_name}_{suffix}.csv"
        test_dataset = pd.read_csv(file_path)
        test_dataset = test_dataset.fillna("")

        test_dataset = test_dataset[['app_name', 'text', tlabel]]
        test_dataset[tlabel] = test_dataset[tlabel].map(int)
        print(test_dataset[tlabel].value_counts())

        # Training pool: every review dataset except the held-out one,
        # concatenated in a single call.
        all_dataset = pd.concat([
            pd.read_csv(data_path+f"{dataset}_{suffix}.csv").fillna("")
            for dataset in datasets
            if dataset != test_dataset_name
        ])

        all_dataset = all_dataset[['app_name', 'text', tlabel]]
        all_dataset[tlabel] = all_dataset[tlabel].map(int)
        print("first train dataset: ")
        print(all_dataset[tlabel].value_counts())

        # Add a random sample of issues sized at 10% of the review pool.
        perc = 0.1
        sample_no = int(all_dataset.shape[0]*perc)
        all_issues_ = all_issues.sample(n=sample_no, random_state=seed)
        print("sample no: ", sample_no)
        dataset_train = pd.concat([all_dataset[['text', tlabel]], all_issues_[['text', tlabel]]])
        print("train dataset: ")
        print(dataset_train[tlabel].value_counts())

        set_x = dataset_train['text'].values.tolist()
        set_y = dataset_train[tlabel].values.tolist()

        # The held-out set is identical for every fold: tokenize it once and
        # keep the full tokenizer output (not just the token ids), batched the
        # same way as the training data.
        test_sent = test_dataset['text'].values.tolist()
        test_label = test_dataset[tlabel].values.tolist()
        test_encodings = tokenizer(test_sent, padding="max_length", max_length=max_length, truncation=True)
        test_ds = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(b_size)

        k = 5
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

        result_pre = []
        result_rec = []
        result_f1 = []
        result_mic = []
        for i, (train_index, val_index) in enumerate(kfold.split(set_x, set_y)):
            print("------------------------------------------------------")
            print("repeat: ", i)
            np.random.shuffle(train_index)
            np.random.shuffle(val_index)
            train_sent = [set_x[idx] for idx in train_index]
            train_label = [set_y[idx] for idx in train_index]
            val_sent = [set_x[idx] for idx in val_index]
            val_label = [set_y[idx] for idx in val_index]

            train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
            val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

            train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
            val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

            # A new pretrained model is fine-tuned for every fold.
            model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
            optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = tf.metrics.SparseCategoricalAccuracy()
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            # The datasets are already batched, so no batch_size is passed.
            model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no,
                      callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])

            pred_label = to_label_index(model.predict(test_ds))

            test_sents.extend(test_sent)
            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            # Score once per fold (also prints the confusion matrix).
            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)


        df_pre = pd.DataFrame(result_pre, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        # NOTE: despite the "micro" title, this table holds the full
        # evaluate_bi output (binary precision/recall/F1 plus accuracy).
        df_micro = pd.DataFrame(result_mic, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)

        # Word counts of the positive test sentences that were predicted
        # correctly (true_ids) or missed (false_ids), pooled over all folds.
        true_ids = [len(test_sents[i].split()) for i,x in enumerate(pred_labels) if x == 1 and test_labels[i] == 1]
        false_ids = [len(test_sents[i].split()) for i,x in enumerate(pred_labels) if x == 0 and test_labels[i] == 1]



## **train on reviews**

In [None]:
# train on reviews only
import numpy as np
import tensorflow as tf
from sklearn.utils import compute_class_weight
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold


# --- Configuration for the "reviews only" experiment -----------------------
labels = ['bug', 'feature']
datasets = [
    'guzman_dataset',
    'maalej_dataset',
    'jha_dataset',
    'surminer_dataset',
    'scalabrino_dataset',
]
data_path = "/content/drive/MyDrive/ColabNotebooks/data/inter_context/"

name = "jan_2"

# Model and training hyper-parameters.
deep_model = 'distilbert-base-uncased'  # checkpoint name for tokenizer and model
b_size = 16                             # batch size
l_r = 5e-5                              # Adam learning rate
epoch_no = 3                            # training epochs
min_delta = 0.0001                      # EarlyStopping min_delta
patience = 5                            # EarlyStopping patience
num_labels = 2                          # classifier output size
max_length = 50                         # tokenizer padding/truncation length
suffix = "f"                            # review CSVs are named "<dataset>_<suffix>.csv"

# Accumulators filled by the evaluation loop.
test_labels, pred_labels, test_sents = [], [], []
# Leave-one-dataset-out evaluation of a classifier fine-tuned on review
# datasets only. For every target label and every held-out review dataset,
# k models are trained (one per stratified fold of the remaining datasets)
# and each is scored on the held-out dataset.

# The tokenizer does not depend on label, dataset or fold: load it once.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

for target_label in labels:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_'+target_label[0]
    for test_dataset_name in datasets:

        print("***************************************************")
        print('test on: ', test_dataset_name)

        file_path = data_path+f"{test_dataset_name}_{suffix}.csv"
        test_dataset = pd.read_csv(file_path)
        test_dataset = test_dataset.fillna("")

        test_dataset = test_dataset[['app_name', 'text', tlabel]]
        test_dataset[tlabel] = test_dataset[tlabel].map(int)
        print(test_dataset[tlabel].value_counts())

        # Training pool: every review dataset except the held-out one,
        # concatenated in a single call.
        all_dataset = pd.concat([
            pd.read_csv(data_path+f"{dataset}_{suffix}.csv").fillna("")
            for dataset in datasets
            if dataset != test_dataset_name
        ])

        all_dataset = all_dataset[['app_name', 'text', tlabel]]
        all_dataset[tlabel] = all_dataset[tlabel].map(int)
        print("first train dataset: ")
        print(all_dataset[tlabel].value_counts())

        dataset_train = all_dataset[['text', tlabel]]
        print("train dataset: ")
        print(dataset_train[tlabel].value_counts())

        set_x = dataset_train['text'].values.tolist()
        set_y = dataset_train[tlabel].values.tolist()

        # The held-out set is identical for every fold: tokenize it once and
        # keep the full tokenizer output (not just the token ids), batched the
        # same way as the training data.
        test_sent = test_dataset['text'].values.tolist()
        test_label = test_dataset[tlabel].values.tolist()
        test_encodings = tokenizer(test_sent, padding="max_length", max_length=max_length, truncation=True)
        test_ds = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(b_size)

        k = 5
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

        result_pre = []
        result_rec = []
        result_f1 = []
        result_mic = []
        for i, (train_index, val_index) in enumerate(kfold.split(set_x, set_y)):
            print("------------------------------------------------------")
            print("repeat: ", i)
            np.random.shuffle(train_index)
            np.random.shuffle(val_index)
            train_sent = [set_x[idx] for idx in train_index]
            train_label = [set_y[idx] for idx in train_index]
            val_sent = [set_x[idx] for idx in val_index]
            val_label = [set_y[idx] for idx in val_index]

            train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
            val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

            train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
            val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

            # A new pretrained model is fine-tuned for every fold.
            model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
            optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = tf.metrics.SparseCategoricalAccuracy()
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            # The datasets are already batched, so no batch_size is passed.
            model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no,
                      callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])

            pred_label = to_label_index(model.predict(test_ds))

            test_sents.extend(test_sent)
            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            # Score once per fold (also prints the confusion matrix).
            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)


        df_pre = pd.DataFrame(result_pre, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        # NOTE: despite the "micro" title, this table holds the full
        # evaluate_bi output (binary precision/recall/F1 plus accuracy).
        df_micro = pd.DataFrame(result_mic, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)




## **Same app and Similar**

### **review**

In [None]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_class_weight



# --- Configuration and shared data for the per-app "review" experiment -----
labels = ['bug', 'feature']
datasets = ['guzman_dataset', 'maalej_dataset', 'jha_dataset', 'surminer_dataset','scalabrino_dataset']
data_path = "/content/drive/MyDrive/ColabNotebooks/data/inter_context/"

# App package id -> "<owner>/<repo>" identifier of its git project.
pack_to_git = { 'fr.free.nrw.commons': 'commons-app/apps-android-commons',
                'org.mozilla.focus': 'mozilla-mobile/focus-android',
                'org.odk.collect.android': 'opendatakit/collect',
                'de.danoeh.antennapod': 'AntennaPod/AntennaPod',
                'com.habitrpg.android.habitica': 'HabitRPG/habitica-android',
                'com.ichi2.anki': 'ankidroid/Anki-Android',
                'io.metamask': 'MetaMask/metamask-mobile',
                'io.homeassistant.companion.android': 'home-assistant/home-assistant-android',
                'org.mozilla.fenix': 'mozilla-mobile/fenix',
                'com.owncloud.android': 'nextcloud/android',
                'me.ccrama.redditslide': 'ccrama/Slide',
                'net.cozic.joplin': 'laurent22/joplin',
                'com.tavultesoft.kmapro': 'keymanapp/keyman'}

# Model and training hyper-parameters.
deep_model = 'distilbert-base-uncased'
b_size = 16
l_r = 5e-5
epoch_no = 3
min_delta = 0.0001
patience = 5
num_labels = 2
max_length = 50
suffix = "f"

# Accumulators filled by the evaluation loop.
test_labels = []
pred_labels = []
test_sents = []

# Git project descriptions ('text') and identifiers ('id').
git_basic_details = pd.read_csv(data_path+'git_details_proc.csv')
git_basic_details = git_basic_details.fillna("")
git_basic_descriptions = git_basic_details['text'].values.tolist()
git_basic_names = git_basic_details['id'].values

# TF-IDF vectors (uni- and bigrams) of the project descriptions. The sparse
# matrix returned by fit_transform is densified directly instead of
# transforming the same corpus a second time.
desc_tfIdfVectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), use_idf=True)
tfidf = desc_tfIdfVectorizer.fit_transform(git_basic_descriptions)
git_basic_descriptions_vectors = tfidf.toarray()

name = "jan_2"
# Column names are supplied explicitly and the first parsed row is sliced off
# (presumably the file's own header line -- TODO confirm).
all_issues = pd.read_csv(data_path+f"z_all_issues_train_{name}.csv",
                         names=['id', 'text', 'label_b', 'label_f'])[1:]
all_issues = all_issues.fillna("")
all_issues['label_b'] = all_issues['label_b'].map(int)
all_issues['label_f'] = all_issues['label_f'].map(int)


no_similars = 20
no_sample = 2000
no_app = 1

# Labelled app reviews used as the test data; one test set per distinct app_id.
review_file = data_path + 'z_sample_reviews_2_f_pre_4.csv'
all_reviews = pd.read_csv(review_file)
all_reviews = all_reviews.fillna("")

app_names = np.unique(all_reviews['app_id'].values)


# Train on all review datasets with k-fold validation and score every fold's
# model on each app in `all_reviews`, then summarise the scores per app.

# The tokenizer does not depend on label or fold: load it once.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

for target_label in labels:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_'+target_label[0]

    # Training pool: all review datasets, concatenated in a single call.
    all_review_dataset = pd.concat([
        pd.read_csv(data_path+f"{dataset}_{suffix}.csv").fillna("")
        for dataset in datasets
    ])

    all_review_dataset = all_review_dataset[['app_name', 'text', tlabel]]
    all_review_dataset[tlabel] = all_review_dataset[tlabel].map(int)

    dataset_train = all_review_dataset[['text', tlabel]]

    print("train dataset: ")
    print(dataset_train[tlabel].value_counts())

    set_x = dataset_train['text'].values.tolist()
    set_y = dataset_train[tlabel].values.tolist()

    k = 5
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

    # Flat, fold-major result lists: the entry for fold j and app number a
    # sits at position j * len(app_names) + a.
    result_pre = []
    result_rec = []
    result_f1 = []
    result_mic = []

    for i, (train_index, val_index) in enumerate(kfold.split(set_x, set_y)):
        print("------------------------------------------------------")
        print("repeat: ", i)
        np.random.shuffle(train_index)
        np.random.shuffle(val_index)
        train_sent = [set_x[idx] for idx in train_index]
        train_label = [set_y[idx] for idx in train_index]
        val_sent = [set_x[idx] for idx in val_index]
        val_label = [set_y[idx] for idx in val_index]

        train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
        val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

        train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
        val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

        # A new pretrained model is fine-tuned for every fold.
        model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
        optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metrics = tf.metrics.SparseCategoricalAccuracy()
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        # The datasets are already batched, so no batch_size is passed.
        model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no,
                  callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])


        for test_name in app_names:
            print("test on: ", test_name)
            test_dataset = all_reviews[all_reviews['app_id'] == test_name][['text_pre', 'label_b', 'label_f']]
            print(test_dataset[tlabel].value_counts())

            test_sent_ = test_dataset['text_pre'].values.tolist()
            test_sents.extend(test_sent_)
            test_sent = tokenizer(test_sent_, padding="max_length", max_length=max_length, truncation=True)
            test_label = test_dataset[tlabel].values.tolist()

            # Predict from the full tokenizer output (not just the token ids),
            # batched the same way as the training data.
            test_ds = tf.data.Dataset.from_tensor_slices(dict(test_sent)).batch(b_size)
            pred_label = to_label_index(model.predict(test_ds))

            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)

    # Per-app summary over the k folds. One result is stored per app in every
    # fold, so the stride through the flat lists is the number of apps.
    n_apps = len(app_names)
    result_avg = []
    for i, test_name in enumerate(app_names):
        print("****************************************")
        print(test_name)
        result_pre_ = result_pre[i::n_apps]
        result_rec_ = result_rec[i::n_apps]
        result_f1_ = result_f1[i::n_apps]
        result_mic_ = result_mic[i::n_apps]

        df_pre = pd.DataFrame(result_pre_, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec_, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1_, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        # NOTE: despite the "micro" title, this table holds the full
        # evaluate_bi output (binary precision/recall/F1 plus accuracy).
        df_micro = pd.DataFrame(result_mic_, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)

        result_avg.append(df_micro.loc['mean'].values.tolist())


    # Mean scores per app, plus the mean over all apps.
    df_result_avg = pd.DataFrame(result_avg, columns=['p', 'r', 'f1', 'ac'], index=app_names)
    df_result_avg.loc['mean'] = df_result_avg.mean()
    print(df_result_avg)



### **review+same**

In [None]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_class_weight




# --- Configuration and shared data for the "review + same app" experiment --
labels = ['bug', 'feature', 'other']
datasets = ['guzman_dataset', 'maalej_dataset', 'jha_dataset', 'surminer_dataset','scalabrino_dataset']
data_path = "/content/drive/MyDrive/ColabNotebooks/data/inter_context/"

# App package id -> "<owner>/<repo>" identifier of its git project.
pack_to_git = { 'fr.free.nrw.commons': 'commons-app/apps-android-commons',
                'org.mozilla.focus': 'mozilla-mobile/focus-android',
                'org.odk.collect.android': 'opendatakit/collect',
                'de.danoeh.antennapod': 'AntennaPod/AntennaPod',
                'com.habitrpg.android.habitica': 'HabitRPG/habitica-android',
                'com.ichi2.anki': 'ankidroid/Anki-Android',
                'io.metamask': 'MetaMask/metamask-mobile',
                'io.homeassistant.companion.android': 'home-assistant/home-assistant-android',
                'org.mozilla.fenix': 'mozilla-mobile/fenix',
                'com.owncloud.android': 'nextcloud/android',
                'me.ccrama.redditslide': 'ccrama/Slide',
                'net.cozic.joplin': 'laurent22/joplin',
                'com.tavultesoft.kmapro': 'keymanapp/keyman'}

# Model and training hyper-parameters.
deep_model = 'distilbert-base-uncased'
b_size = 16
l_r = 5e-5
epoch_no = 3
min_delta = 0.0001
patience = 5
num_labels = 2
max_length = 50
suffix = "f"

# Accumulators filled by the evaluation loop.
test_labels = []
pred_labels = []
test_sents = []

# Git project descriptions ('text') and identifiers ('id').
git_basic_details = pd.read_csv(data_path+'git_details_proc.csv')
git_basic_details = git_basic_details.fillna("")
git_basic_descriptions = git_basic_details['text'].values.tolist()
git_basic_names = git_basic_details['id'].values

# TF-IDF vectors (uni- and bigrams) of the project descriptions. The sparse
# matrix returned by fit_transform is densified directly instead of
# transforming the same corpus a second time.
desc_tfIdfVectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), use_idf=True)
tfidf = desc_tfIdfVectorizer.fit_transform(git_basic_descriptions)
git_basic_descriptions_vectors = tfidf.toarray()

name = "jan_2"
# Column names are supplied explicitly and the first parsed row is sliced off
# (presumably the file's own header line -- TODO confirm).
all_issues = pd.read_csv(data_path+f"z_all_issues_train_{name}.csv",
                         names=['id', 'text', 'label_b', 'label_f'])[1:]
all_issues = all_issues.fillna("")
all_issues['label_b'] = all_issues['label_b'].map(int)
all_issues['label_f'] = all_issues['label_f'].map(int)


no_similars = 20
no_sample = 2000
no_app = 1

# Labelled app reviews used as the test data; one test set per distinct app_id.
review_file = data_path + 'z_sample_reviews_2_f_pre_4.csv'
all_reviews = pd.read_csv(review_file)
all_reviews = all_reviews.fillna("")

app_names = np.unique(all_reviews['app_id'].values)


# For each test app: train on all review datasets plus the issues of the
# app's own git project (k-fold validation) and score every fold's model on
# that app's reviews. Only the first label and the first two apps are run.

# The tokenizer does not depend on label, app or fold: load it once.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

for target_label in labels[:1]:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_'+target_label[0]
    for test_name in app_names[:2]:

        # Fresh accumulators for this test app.
        test_labels = []
        pred_labels = []
        test_sents = []

        print("***************************************************")
        print('test on: ', test_name)

        test_dataset = all_reviews[all_reviews['app_id'] == test_name][['text_pre', 'label_b', 'label_f']]
        print(test_dataset[tlabel].value_counts())

        ############################## review datasets #########################
        # All review datasets, concatenated in a single call.
        all_review_dataset = pd.concat([
            pd.read_csv(data_path+f"{dataset}_{suffix}.csv").fillna("")
            for dataset in datasets
        ])

        all_review_dataset = all_review_dataset[['app_name', 'text', tlabel]]
        all_review_dataset[tlabel] = all_review_dataset[tlabel].map(int)

        # NOTE: the similar-project lookup that used to sit here was removed;
        # none of its results were read anywhere in this loop.

        ############################## same git dataset #########################
        same_issues = all_issues[all_issues['id'] == pack_to_git[test_name]]

        ############################## merge datasets #########################
        # Cap the same-project issues at 30% of the review pool size.
        thr = int(all_review_dataset.shape[0]*0.3)
        if same_issues.shape[0] > thr:
            same_issues = same_issues.sample(n=thr, random_state=seed)

        dataset_train = pd.concat([all_review_dataset[['text', tlabel]], same_issues[['text', tlabel]]])

        print("same dataset: ")
        print(same_issues[tlabel].value_counts())

        print("train dataset: ")
        print(dataset_train[tlabel].value_counts())

        set_x = dataset_train['text'].values.tolist()
        set_y = dataset_train[tlabel].values.tolist()

        # The test set is identical for every fold: tokenize it once and keep
        # the full tokenizer output (not just the token ids), batched the same
        # way as the training data.
        test_sent = test_dataset['text_pre'].values.tolist()
        test_label = test_dataset[tlabel].values.tolist()
        test_encodings = tokenizer(test_sent, padding="max_length", max_length=max_length, truncation=True)
        test_ds = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(b_size)

        k = 5
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

        result_pre = []
        result_rec = []
        result_f1 = []
        result_mic = []
        for i, (train_index, val_index) in enumerate(kfold.split(set_x, set_y)):
            print("------------------------------------------------------")
            print("repeat: ", i)
            np.random.shuffle(train_index)
            np.random.shuffle(val_index)
            train_sent = [set_x[idx] for idx in train_index]
            train_label = [set_y[idx] for idx in train_index]
            val_sent = [set_x[idx] for idx in val_index]
            val_label = [set_y[idx] for idx in val_index]

            train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
            val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

            train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
            val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

            # A new pretrained model is fine-tuned for every fold.
            model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
            optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = tf.metrics.SparseCategoricalAccuracy()
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            # The datasets are already batched, so no batch_size is passed.
            model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no,
                      callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])

            pred_label = to_label_index(model.predict(test_ds))

            test_sents.extend(test_sent)
            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            # Score once per fold (also prints the confusion matrix).
            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)

        df_pre = pd.DataFrame(result_pre, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        # NOTE: despite the "micro" title, this table holds the full
        # evaluate_bi output (binary precision/recall/F1 plus accuracy).
        df_micro = pd.DataFrame(result_mic, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)



### **review+similar**

In [None]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_class_weight


# ---------------------------------------------------------------------------
# Setup for the "review+similar" experiment: hyper-parameters, file locations
# and the data frames shared by every (target label, held-out app) run below.
# Relies on `pd` and `seed` defined by earlier notebook cells.
# ---------------------------------------------------------------------------

# Target classes; the loop derives the label column as 'label_' + first letter.
labels = ['bug', 'feature', 'other']
# Labelled review corpora, read from f"{data_path}{dataset}_{suffix}.csv".
datasets = ['guzman_dataset', 'maalej_dataset', 'jha_dataset', 'surminer_dataset','scalabrino_dataset']
data_path = "/content/drive/MyDrive/ColabNotebooks/data/inter_context/"

# Maps an app id (as found in the 'app_id' column of the sampled reviews) to
# the 'id' used for the same project in the git details / issue files.
pack_to_git = { 'fr.free.nrw.commons': 'commons-app/apps-android-commons',
                'org.mozilla.focus': 'mozilla-mobile/focus-android',
                'org.odk.collect.android': 'opendatakit/collect',
                'de.danoeh.antennapod': 'AntennaPod/AntennaPod',
                'com.habitrpg.android.habitica': 'HabitRPG/habitica-android',
                'com.ichi2.anki': 'ankidroid/Anki-Android',
                'io.metamask': 'MetaMask/metamask-mobile',
                'io.homeassistant.companion.android': 'home-assistant/home-assistant-android',
                'org.mozilla.fenix': 'mozilla-mobile/fenix',
                'com.owncloud.android': 'nextcloud/android',
                'me.ccrama.redditslide': 'ccrama/Slide',
                'net.cozic.joplin': 'laurent22/joplin',
                'com.tavultesoft.kmapro': 'keymanapp/keyman'}

deep_model = 'distilbert-base-uncased'  # checkpoint for tokenizer and classifier
b_size = 16          # batch size of the tf.data pipelines
l_r = 5e-5           # Adam learning rate
epoch_no = 3         # training epochs per fold
min_delta = 0.0001   # EarlyStopping: minimum val_loss improvement
# NOTE(review): patience (5) exceeds epoch_no (3), so EarlyStopping cannot
# trigger with these settings -- confirm this is intended.
patience = 5
num_labels = 2       # size of the classification head (binary task)
max_length = 50      # tokenizer padding / truncation length
suffix = "f"         # file-name suffix of the review corpora

# Accumulators; the experiment loop re-initialises them for every app.
test_labels = []
pred_labels = []
test_sents = []

# Project descriptions used to rank repositories by textual similarity.
git_basic_details = pd.read_csv(data_path+'git_details_proc.csv')
git_basic_details = git_basic_details.fillna("")
git_basic_descriptions = git_basic_details['text'].values.tolist()
git_basic_names = git_basic_details['id'].values

desc_tfIdfVectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), use_idf=True)
tfidf = desc_tfIdfVectorizer.fit_transform(git_basic_descriptions)
# fit_transform already returns the document-term matrix of the corpus, so it
# is reused here instead of vectorising the same descriptions a second time.
git_basic_descriptions_vectors = tfidf.toarray()

name = "jan_2"
# `names=` overrides the column names, so the first row of the file is read as
# data; `[1:]` drops it again (presumably the file's own header row -- confirm).
all_issues = pd.read_csv(data_path+f"z_all_issues_train_{name}.csv",
                         names=['id', 'text', 'label_b', 'label_f'])[1:]
all_issues = all_issues.fillna("")
all_issues['label_b'] = all_issues['label_b'].map(int)
all_issues['label_f'] = all_issues['label_f'].map(int)


# Sampling knobs kept for compatibility with other cells of the notebook.
no_similars = 20
no_sample = 2000
no_app = 1

# Reviews of the target apps; each app in turn becomes the test set.
review_file = data_path + 'z_sample_reviews_2_f_pre_4.csv'
all_reviews = pd.read_csv(review_file)
all_reviews = all_reviews.fillna("")

app_names = np.unique(all_reviews['app_id'].values)


# ---------------------------------------------------------------------------
# "review+similar" experiment.
#
# For every target label and every held-out app:
#   1. build a training set from the labelled review corpora plus issues of
#      the git projects whose description is most similar to the app's
#      (capped at 30% of the number of reviews),
#   2. fine-tune a fresh classifier on each of k stratified folds of that set,
#   3. evaluate every fold's model on the reviews of the held-out app and
#      print per-fold and mean precision / recall / f1 / accuracy.
#
# Relies on names defined by earlier cells: pd, seed, evaluate_bi,
# to_label_index and the configuration/data frames of this cell's setup.
# ---------------------------------------------------------------------------

# The review corpora do not depend on the label or on the held-out app, so
# they are read from disk once instead of once per app.
review_frames = [pd.read_csv(data_path + f"{dataset}_{suffix}.csv").fillna("")
                 for dataset in datasets]
all_review_raw = pd.concat(review_frames)

# One tokenizer instance serves every fold; it carries no training state.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

for target_label in labels:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_' + target_label[0]

    # A label whose column is missing from any input (e.g. 'other' ->
    # 'label_o', which the issue file does not define) would otherwise abort
    # the whole run with a KeyError.
    if any(tlabel not in frame.columns for frame in (all_reviews, all_issues, all_review_raw)):
        print(f"skipping '{target_label}': column '{tlabel}' is not available in every dataset")
        continue

    # .copy() so the int conversion below writes to an independent frame.
    all_review_dataset = all_review_raw[['app_name', 'text', tlabel]].copy()
    all_review_dataset[tlabel] = all_review_dataset[tlabel].map(int)

    # Upper bound on the number of issues added to the training data.
    thr = int(all_review_dataset.shape[0] * 0.3)

    for test_name in app_names:

        test_labels = []
        pred_labels = []
        test_sents = []

        print("***************************************************")
        print('test on: ', test_name)

        test_dataset = all_reviews[all_reviews['app_id'] == test_name][['text_pre', tlabel]]
        print(test_dataset[tlabel].value_counts())

        # The test set is identical for every fold: tokenise it once per app.
        test_sent_ = test_dataset['text_pre'].values.tolist()
        test_label = test_dataset[tlabel].values.tolist()
        test_sent = tokenizer(test_sent_, padding="max_length", max_length=max_length, truncation=True)
        # Feed the full encoding (input ids AND attention mask), exactly like
        # the training pipeline, so padding positions are masked at inference.
        test_inputs = tf.data.Dataset.from_tensor_slices(dict(test_sent)).batch(b_size)

        ############################## similar git datasets #########################
        own_repo = pack_to_git[test_name]
        git_app_description = git_basic_details[git_basic_details['id'] == own_repo]['text'].values.tolist()[0]
        google_app_description_vector = desc_tfIdfVectorizer.transform([git_app_description]).toarray()[0]

        # One vectorised call: row i is the similarity between project i and
        # the held-out app's own description.
        similarity = cosine_similarity(git_basic_descriptions_vectors,
                                       google_app_description_vector.reshape(1, -1)).ravel()
        # Most similar first, excluding the held-out app's own repository.
        similar_projects = git_basic_names[(-similarity).argsort()]
        similar_projects = [x for x in similar_projects if x != own_repo]

        print("theresholad", thr)
        # Take whole projects in order of similarity until the budget is
        # exceeded, then down-sample to exactly `thr` rows.
        issue_parts = []
        collected = 0
        for sim in similar_projects:
            project_issues = all_issues[all_issues['id'] == sim][['text', tlabel]]
            issue_parts.append(project_issues)
            collected += project_issues.shape[0]
            if collected > thr:
                break
        if issue_parts:
            similar_issues = pd.concat(issue_parts)
        else:
            similar_issues = pd.DataFrame(columns=['text', tlabel])
        if similar_issues.shape[0] > thr:
            similar_issues = similar_issues.sample(n=thr, random_state=seed)

        ############################## merge datasets #########################
        dataset_train = pd.concat([all_review_dataset[['text', tlabel]], similar_issues[['text', tlabel]]])

        print("similar dataset: ")
        print(similar_issues[tlabel].value_counts())

        print("train dataset: ")
        print(dataset_train[tlabel].value_counts())

        set_x = dataset_train['text'].values.tolist()
        set_y = dataset_train[tlabel].values.tolist()

        k = 5
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

        result_pre = []
        result_rec = []
        result_f1 = []
        result_mic = []
        for fold_no, (train_index, val_index) in enumerate(kfold.split(set_x, set_y)):
            print("------------------------------------------------------")
            print("repeat: ", fold_no)
            # The tf.data pipelines below do not shuffle, so the sample order
            # is randomised here.
            np.random.shuffle(train_index)
            np.random.shuffle(val_index)
            train_sent = [set_x[j] for j in train_index]
            train_label = [set_y[j] for j in train_index]
            val_sent = [set_x[j] for j in val_index]
            val_label = [set_y[j] for j in val_index]

            train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
            val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

            train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
            val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

            # A fresh pretrained model per fold, so folds do not share weights.
            model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
            optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = tf.metrics.SparseCategoricalAccuracy()
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            # No batch_size here: the datasets are already batched.
            model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no,
                      callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])

            test_sents.extend(test_sent_)

            pred_label = model.predict(test_inputs)
            pred_label = to_label_index(pred_label)

            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            # evaluate_bi is expected to return (precision, recall, f1,
            # accuracy), matching the df_micro columns below.
            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)

            # Reviews labelled 1 that the model predicted as 0.
            false_sents = [sent for sent, pred, gold in zip(test_sent_, pred_label, test_label)
                           if pred == 0 and gold == 1]
            print(false_sents)

        df_pre = pd.DataFrame(result_pre, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        df_micro = pd.DataFrame(result_mic, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)


### **review+same+similar**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_class_weight


# ---------------------------------------------------------------------------
# Setup for the "review+same+similar" experiment: hyper-parameters, file
# locations and the data frames shared by every (target label, held-out app)
# run below. Relies on `pd` and `seed` defined by earlier notebook cells.
# ---------------------------------------------------------------------------

# Target classes; the loop derives the label column as 'label_' + first letter.
labels = ['bug', 'feature', 'other']
# Labelled review corpora, read from f"{data_path}{dataset}_{suffix}.csv".
datasets = ['guzman_dataset', 'maalej_dataset', 'jha_dataset', 'surminer_dataset','scalabrino_dataset']
data_path = "/content/drive/MyDrive/ColabNotebooks/data/inter_context/"

# Maps an app id (as found in the 'app_id' column of the sampled reviews) to
# the 'id' used for the same project in the git details / issue files.
pack_to_git = { 'fr.free.nrw.commons': 'commons-app/apps-android-commons',
                'org.mozilla.focus': 'mozilla-mobile/focus-android',
                'org.odk.collect.android': 'opendatakit/collect',
                'de.danoeh.antennapod': 'AntennaPod/AntennaPod',
                'com.habitrpg.android.habitica': 'HabitRPG/habitica-android',
                'com.ichi2.anki': 'ankidroid/Anki-Android',
                'io.metamask': 'MetaMask/metamask-mobile',
                'io.homeassistant.companion.android': 'home-assistant/home-assistant-android',
                'org.mozilla.fenix': 'mozilla-mobile/fenix',
                'com.owncloud.android': 'nextcloud/android',
                'me.ccrama.redditslide': 'ccrama/Slide',
                'net.cozic.joplin': 'laurent22/joplin',
                'com.tavultesoft.kmapro': 'keymanapp/keyman'}

deep_model = 'distilbert-base-uncased'  # checkpoint for tokenizer and classifier
b_size = 16          # batch size of the tf.data pipelines
l_r = 5e-5           # Adam learning rate
epoch_no = 3         # training epochs per fold
min_delta = 0.0001   # EarlyStopping: minimum val_loss improvement
# NOTE(review): patience (5) exceeds epoch_no (3), so EarlyStopping cannot
# trigger with these settings -- confirm this is intended.
patience = 5
num_labels = 2       # size of the classification head (binary task)
max_length = 50      # tokenizer padding / truncation length
suffix = "f"         # file-name suffix of the review corpora

# Accumulators; the experiment loop re-initialises them for every app.
test_labels = []
pred_labels = []
test_sents = []

# Project descriptions used to rank repositories by textual similarity.
git_basic_details = pd.read_csv(data_path+'git_details_proc.csv')
git_basic_details = git_basic_details.fillna("")
git_basic_descriptions = git_basic_details['text'].values.tolist()
git_basic_names = git_basic_details['id'].values

desc_tfIdfVectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), use_idf=True)
tfidf = desc_tfIdfVectorizer.fit_transform(git_basic_descriptions)
# fit_transform already returns the document-term matrix of the corpus, so it
# is reused here instead of vectorising the same descriptions a second time.
git_basic_descriptions_vectors = tfidf.toarray()

name = "jan_2"
# `names=` overrides the column names, so the first row of the file is read as
# data; `[1:]` drops it again (presumably the file's own header row -- confirm).
all_issues = pd.read_csv(data_path+f"z_all_issues_train_{name}.csv",
                         names=['id', 'text', 'label_b', 'label_f'])[1:]
all_issues = all_issues.fillna("")
all_issues['label_b'] = all_issues['label_b'].map(int)
all_issues['label_f'] = all_issues['label_f'].map(int)


# Sampling knobs kept for compatibility with other cells of the notebook.
no_similars = 20
no_sample = 2000
no_app = 1

# Reviews of the target apps; each app in turn becomes the test set.
review_file = data_path + 'z_sample_reviews_2_f_pre_4.csv'
all_reviews = pd.read_csv(review_file)
all_reviews = all_reviews.fillna("")

app_names = np.unique(all_reviews['app_id'].values)



# ---------------------------------------------------------------------------
# "review+same+similar" experiment.
#
# For every target label and every held-out app:
#   1. build a training set from the labelled review corpora, the issues of
#      the app's own git project and issues of the most similar projects
#      (own + similar issues are capped at 30% of the number of reviews;
#      the own-project issues are always kept in full),
#   2. fine-tune a fresh classifier on each of k stratified folds of that set,
#   3. evaluate every fold's model on the reviews of the held-out app and
#      print per-fold and mean precision / recall / f1 / accuracy.
#
# Relies on names defined by earlier cells: pd, seed, evaluate_bi,
# to_label_index and the configuration/data frames of this cell's setup.
# ---------------------------------------------------------------------------

# The review corpora do not depend on the label or on the held-out app, so
# they are read from disk once instead of once per app.
review_frames = [pd.read_csv(data_path + f"{dataset}_{suffix}.csv").fillna("")
                 for dataset in datasets]
all_review_raw = pd.concat(review_frames)

# One tokenizer instance serves every fold; it carries no training state.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

for target_label in labels:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_' + target_label[0]

    # A label whose column is missing from any input (e.g. 'other' ->
    # 'label_o', which the issue file does not define) would otherwise abort
    # the whole run with a KeyError.
    if any(tlabel not in frame.columns for frame in (all_reviews, all_issues, all_review_raw)):
        print(f"skipping '{target_label}': column '{tlabel}' is not available in every dataset")
        continue

    # .copy() so the int conversion below writes to an independent frame.
    all_review_dataset = all_review_raw[['app_name', 'text', tlabel]].copy()
    all_review_dataset[tlabel] = all_review_dataset[tlabel].map(int)

    # Index of the first app to process (presumably raised by hand to resume
    # an interrupted run).
    start = 0
    for test_name in app_names[start:]:

        test_labels = []
        pred_labels = []
        test_sents = []

        print("***************************************************")
        print('test on: ', test_name)

        test_dataset = all_reviews[all_reviews['app_id'] == test_name][['text_pre', tlabel]]
        print(test_dataset[tlabel].value_counts())

        # The test set is identical for every fold: tokenise it once per app.
        test_texts = test_dataset['text_pre'].values.tolist()
        test_label = test_dataset[tlabel].values.tolist()
        test_sent = tokenizer(test_texts, padding="max_length", max_length=max_length, truncation=True)
        # Feed the full encoding (input ids AND attention mask), exactly like
        # the training pipeline, so padding positions are masked at inference.
        test_inputs = tf.data.Dataset.from_tensor_slices(dict(test_sent)).batch(b_size)

        ############################## similar git datasets #########################
        own_repo = pack_to_git[test_name]
        git_app_description = git_basic_details[git_basic_details['id'] == own_repo]['text'].values.tolist()[0]
        google_app_description_vector = desc_tfIdfVectorizer.transform([git_app_description]).toarray()[0]

        # One vectorised call: row i is the similarity between project i and
        # the held-out app's own description.
        similarity = cosine_similarity(git_basic_descriptions_vectors,
                                       google_app_description_vector.reshape(1, -1)).ravel()
        # Most similar first, excluding the held-out app's own repository.
        similar_projects = git_basic_names[(-similarity).argsort()]
        similar_projects = [x for x in similar_projects if x != own_repo]

        similar_issues = all_issues[all_issues['id'].isin(similar_projects)][['text', tlabel]]

        ############################## same git dataset #########################
        same_issues = all_issues[all_issues['id'] == own_repo][['text', tlabel]]

        ############################## merge datasets #########################
        thr = int(all_review_dataset.shape[0] * 0.3)
        if similar_issues.shape[0] + same_issues.shape[0] > thr:
            # Budget left for similar-project issues once the app's own issues
            # are included. Clamped at zero: a negative value would make
            # DataFrame.sample raise when the own project alone exceeds thr.
            thr = max(thr - same_issues.shape[0], 0)
            # Take whole projects in order of similarity until the budget is
            # exceeded, then down-sample to exactly `thr` rows.
            issue_parts = []
            collected = 0
            for sim in similar_projects:
                project_issues = all_issues[all_issues['id'] == sim][['text', tlabel]]
                issue_parts.append(project_issues)
                collected += project_issues.shape[0]
                if collected > thr:
                    break
            if issue_parts:
                similar_issues = pd.concat(issue_parts)
            else:
                similar_issues = similar_issues.iloc[0:0]
            if similar_issues.shape[0] > thr:
                similar_issues = similar_issues.sample(n=thr, random_state=seed)

        aug_dataset = pd.concat([same_issues, similar_issues])
        dataset_train = pd.concat([all_review_dataset[['text', tlabel]], aug_dataset])

        print("same dataset: ")
        print(same_issues[tlabel].value_counts())

        print("similar dataset: ")
        print(similar_issues[tlabel].value_counts())

        print("aug dataset: ")
        print(aug_dataset[tlabel].value_counts())

        print("train dataset: ")
        print(dataset_train[tlabel].value_counts())

        set_x = dataset_train['text'].values.tolist()
        set_y = dataset_train[tlabel].values.tolist()

        k = 5
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

        result_pre = []
        result_rec = []
        result_f1 = []
        result_mic = []
        for fold_no, (train_index, val_index) in enumerate(kfold.split(set_x, set_y)):
            print("------------------------------------------------------")
            print("repeat: ", fold_no)
            # The tf.data pipelines below do not shuffle, so the sample order
            # is randomised here.
            np.random.shuffle(train_index)
            np.random.shuffle(val_index)
            train_sent = [set_x[j] for j in train_index]
            train_label = [set_y[j] for j in train_index]
            val_sent = [set_x[j] for j in val_index]
            val_label = [set_y[j] for j in val_index]

            train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
            val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

            train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
            val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

            # A fresh pretrained model per fold, so folds do not share weights.
            model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
            optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = tf.metrics.SparseCategoricalAccuracy()
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            # No batch_size here: the datasets are already batched.
            model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no,
                      callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])

            test_sents.extend(test_texts)

            pred_label = model.predict(test_inputs)
            pred_label = to_label_index(pred_label)

            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            # Evaluate once per fold (a second identical call only repeated
            # the confusion-matrix printout). evaluate_bi is expected to
            # return (precision, recall, f1, accuracy), matching df_micro.
            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)

        df_pre = pd.DataFrame(result_pre, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        df_micro = pd.DataFrame(result_mic, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)


### **random**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_class_weight


# ---------------------------------------------------------------------------
# Setup for the "random" baseline experiment: hyper-parameters, file
# locations and the data frames shared by every (target label, held-out app)
# run below. Relies on `pd` and `seed` defined by earlier notebook cells.
# ---------------------------------------------------------------------------

# Target classes; the loop derives the label column as 'label_' + first letter.
labels = ['bug', 'feature', 'other']
# Labelled review corpora, read from f"{data_path}{dataset}_{suffix}.csv".
datasets = ['guzman_dataset', 'maalej_dataset', 'jha_dataset', 'surminer_dataset','scalabrino_dataset']
# NOTE(review): this cell reads from the "Colab Notebooks (1)" Drive folder
# and from 'z_sample_reviews_2_f_pre.csv', whereas the "review+similar" and
# "review+same+similar" cells use "ColabNotebooks" and '..._pre_4.csv' --
# confirm the difference is intentional.
data_path = "/content/drive/MyDrive/Colab Notebooks (1)/data/inter_context/"

# Maps an app id (as found in the 'app_id' column of the sampled reviews) to
# the 'id' used for the same project in the git details / issue files.
pack_to_git = { 'fr.free.nrw.commons': 'commons-app/apps-android-commons',
                'org.mozilla.focus': 'mozilla-mobile/focus-android',
                'org.odk.collect.android': 'opendatakit/collect',
                'de.danoeh.antennapod': 'AntennaPod/AntennaPod',
                'com.habitrpg.android.habitica': 'HabitRPG/habitica-android',
                'com.ichi2.anki': 'ankidroid/Anki-Android',
                'io.metamask': 'MetaMask/metamask-mobile',
                'io.homeassistant.companion.android': 'home-assistant/home-assistant-android',
                'org.mozilla.fenix': 'mozilla-mobile/fenix',
                'com.owncloud.android': 'nextcloud/android',
                'me.ccrama.redditslide': 'ccrama/Slide',
                'net.cozic.joplin': 'laurent22/joplin',
                'com.tavultesoft.kmapro': 'keymanapp/keyman'}

deep_model = 'distilbert-base-uncased'  # checkpoint for tokenizer and classifier
b_size = 16          # batch size of the tf.data pipelines
l_r = 5e-5           # Adam learning rate
epoch_no = 3         # training epochs per fold
min_delta = 0.0001   # EarlyStopping: minimum val_loss improvement
# NOTE(review): patience (5) exceeds epoch_no (3), so EarlyStopping cannot
# trigger with these settings -- confirm this is intended.
patience = 5
num_labels = 2       # size of the classification head (binary task)
max_length = 50      # tokenizer padding / truncation length
suffix = "f"         # file-name suffix of the review corpora

# Accumulators; the experiment loop re-initialises them for every app.
test_labels = []
pred_labels = []
test_sents = []

# Project descriptions and their TF-IDF vectors.
git_basic_details = pd.read_csv(data_path+'git_details_proc.csv')
git_basic_details = git_basic_details.fillna("")
git_basic_descriptions = git_basic_details['text'].values.tolist()
git_basic_names = git_basic_details['id'].values

desc_tfIdfVectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), use_idf=True)
tfidf = desc_tfIdfVectorizer.fit_transform(git_basic_descriptions)
# fit_transform already returns the document-term matrix of the corpus, so it
# is reused here instead of vectorising the same descriptions a second time.
git_basic_descriptions_vectors = tfidf.toarray()

name = "jan_2"
# `names=` overrides the column names, so the first row of the file is read as
# data; `[1:]` drops it again (presumably the file's own header row -- confirm).
all_issues = pd.read_csv(data_path+f"z_all_issues_train_{name}.csv",
                         names=['id', 'text', 'label_b', 'label_f'])[1:]
all_issues = all_issues.fillna("")
all_issues['label_b'] = all_issues['label_b'].map(int)
all_issues['label_f'] = all_issues['label_f'].map(int)


# Sampling knobs kept for compatibility with other cells of the notebook.
no_similars = 20
no_sample = 2000
no_app = 1

# Reviews of the target apps; each app in turn becomes the test set.
review_file = data_path + 'z_sample_reviews_2_f_pre.csv'
all_reviews = pd.read_csv(review_file)
all_reviews = all_reviews.fillna("")

app_names = np.unique(all_reviews['app_id'].values)


# ---------------------------------------------------------------------------
# "random" baseline experiment.
#
# For every target label and every held-out app:
#   1. build a training set from the labelled review corpora plus a random
#      sample of issues (10% of the number of reviews) drawn from all issues,
#   2. fine-tune a fresh classifier on each of k stratified folds of that set,
#   3. evaluate every fold's model on the reviews of the held-out app and
#      print per-fold and mean precision / recall / f1 / accuracy.
#
# The training data does not depend on the held-out app, so it is assembled
# once per label. The former per-app similar/same-project computation was
# never used by this baseline and has been removed.
#
# Relies on names defined by earlier cells: pd, seed, evaluate_bi,
# to_label_index and the configuration/data frames of this cell's setup.
# ---------------------------------------------------------------------------

# The review corpora do not depend on the label or on the held-out app, so
# they are read from disk once instead of once per app.
review_frames = [pd.read_csv(data_path + f"{dataset}_{suffix}.csv").fillna("")
                 for dataset in datasets]
all_review_raw = pd.concat(review_frames)

# One tokenizer instance serves every fold; it carries no training state.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

for target_label in labels:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_' + target_label[0]

    # A label whose column is missing from any input (e.g. 'other' ->
    # 'label_o', which the issue file does not define) would otherwise abort
    # the whole run with a KeyError.
    if any(tlabel not in frame.columns for frame in (all_reviews, all_issues, all_review_raw)):
        print(f"skipping '{target_label}': column '{tlabel}' is not available in every dataset")
        continue

    # .copy() so the int conversion below writes to an independent frame.
    all_review_dataset = all_review_raw[['app_name', 'text', tlabel]].copy()
    all_review_dataset[tlabel] = all_review_dataset[tlabel].map(int)

    ############################## merge datasets #########################
    perc = 0.1
    thr = int(all_review_dataset.shape[0] * perc)
    # Never request more rows than exist; DataFrame.sample would raise.
    aug_dataset = all_issues.sample(n=min(thr, all_issues.shape[0]), random_state=seed)
    dataset_train = pd.concat([all_review_dataset[['text', tlabel]], aug_dataset[['text', tlabel]]])

    set_x = dataset_train['text'].values.tolist()
    set_y = dataset_train[tlabel].values.tolist()

    for test_name in app_names:

        test_labels = []
        pred_labels = []
        test_sents = []

        print("***************************************************")
        print('test on: ', test_name)

        test_dataset = all_reviews[all_reviews['app_id'] == test_name][['text_pre', tlabel]]
        print(test_dataset[tlabel].value_counts())

        # The test set is identical for every fold: tokenise it once per app.
        test_texts = test_dataset['text_pre'].values.tolist()
        test_label = test_dataset[tlabel].values.tolist()
        test_sent = tokenizer(test_texts, padding="max_length", max_length=max_length, truncation=True)
        # Feed the full encoding (input ids AND attention mask), exactly like
        # the training pipeline, so padding positions are masked at inference.
        test_inputs = tf.data.Dataset.from_tensor_slices(dict(test_sent)).batch(b_size)

        print("aug dataset: ")
        print(aug_dataset[tlabel].value_counts())

        print("train dataset: ")
        print(dataset_train[tlabel].value_counts())

        k = 5
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

        result_pre = []
        result_rec = []
        result_f1 = []
        result_mic = []
        for fold_no, (train_index, val_index) in enumerate(kfold.split(set_x, set_y)):
            print("------------------------------------------------------")
            print("repeat: ", fold_no)
            # The tf.data pipelines below do not shuffle, so the sample order
            # is randomised here.
            np.random.shuffle(train_index)
            np.random.shuffle(val_index)
            train_sent = [set_x[j] for j in train_index]
            train_label = [set_y[j] for j in train_index]
            val_sent = [set_x[j] for j in val_index]
            val_label = [set_y[j] for j in val_index]

            train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
            val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

            train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
            val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

            # A fresh pretrained model per fold, so folds do not share weights.
            model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
            optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = tf.metrics.SparseCategoricalAccuracy()
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            # No batch_size here: the datasets are already batched.
            model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no,
                      callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])

            test_sents.extend(test_texts)

            pred_label = model.predict(test_inputs)
            pred_label = to_label_index(pred_label)

            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            # Evaluate once per fold (a second identical call only repeated
            # the confusion-matrix printout). evaluate_bi is expected to
            # return (precision, recall, f1, accuracy), matching df_micro.
            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)

        df_pre = pd.DataFrame(result_pre, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        df_micro = pd.DataFrame(result_mic, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)



### **similar**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import TFAutoModelForSequenceClassification
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_class_weight


# ---------------------------------------------------------------------------
# Configuration and shared data for the "similar projects" experiment.
#
# Everything defined here is module-level state read by the training loop
# that follows in this cell.
# ---------------------------------------------------------------------------

# Target labels iterated by the experiment; the label column name is derived
# from the first letter ('bug' -> 'label_b', 'feature' -> 'label_f', ...).
labels = ['bug', 'feature', 'other']
# Labelled review corpora; each is read from f"{dataset}_{suffix}.csv".
datasets = ['guzman_dataset', 'maalej_dataset', 'jha_dataset', 'surminer_dataset','scalabrino_dataset']
data_path = "/content/drive/MyDrive/ColabNotebooks/data/inter_context/"

# Maps the `app_id` values found in the review file to the `id` values used
# in the GitHub details / issues tables.
pack_to_git = { 'fr.free.nrw.commons': 'commons-app/apps-android-commons',
                'org.mozilla.focus': 'mozilla-mobile/focus-android',
                'org.odk.collect.android': 'opendatakit/collect',
                'de.danoeh.antennapod': 'AntennaPod/AntennaPod',
                'com.habitrpg.android.habitica': 'HabitRPG/habitica-android',
                'com.ichi2.anki': 'ankidroid/Anki-Android',
                'io.metamask': 'MetaMask/metamask-mobile',
                'io.homeassistant.companion.android': 'home-assistant/home-assistant-android',
                'org.mozilla.fenix': 'mozilla-mobile/fenix',
                'com.owncloud.android': 'nextcloud/android',
                'me.ccrama.redditslide': 'ccrama/Slide',
                'net.cozic.joplin': 'laurent22/joplin',
                'com.tavultesoft.kmapro': 'keymanapp/keyman'}

# Fine-tuning hyper-parameters.
deep_model = 'distilbert-base-uncased'
b_size = 16
l_r = 5e-5
epoch_no = 3
min_delta = 0.0001
patience = 5
num_labels = 2
max_length = 50
suffix = "f"

# Accumulators for gold labels, predictions and test sentences.
test_labels = []
pred_labels = []
test_sents = []

# GitHub project descriptions, used to rank projects by textual similarity.
git_basic_details = pd.read_csv(data_path+'git_details_proc.csv')
git_basic_details = git_basic_details.fillna("")
git_basic_descriptions = git_basic_details['text'].values.tolist()
git_basic_names = git_basic_details['id'].values

# Fit TF-IDF on the descriptions. `fit_transform` already yields the
# document-term matrix of the fitted corpus, so it is reused for the dense
# vectors instead of running a second `transform` over the same texts.
desc_tfIdfVectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), use_idf=True)
tfidf = desc_tfIdfVectorizer.fit_transform(git_basic_descriptions)
git_basic_descriptions_vectors = tfidf.toarray()


# Labelled GitHub issues. Because explicit `names` are passed, the first row
# of the file comes back as data and is dropped with `[1:]`
# (presumably it is the header row -- verify against the CSV).
name = "jan_2"
all_issues = pd.read_csv(data_path+f"z_all_issues_train_{name}.csv",
                         names=['id', 'text', 'label_b', 'label_f'])[1:]
all_issues = all_issues.fillna("")
all_issues['label_b'] = all_issues['label_b'].map(int)
all_issues['label_f'] = all_issues['label_f'].map(int)


no_similars = 20
no_sample = 2000
no_app = 1

# Reviews of the apps under test; every distinct `app_id` becomes a test set.
review_file = data_path + 'z_sample_reviews_2_f_pre_4.csv'
all_reviews = pd.read_csv(review_file)
all_reviews = all_reviews.fillna("")
app_names = np.unique(all_reviews['app_id'].values)


# ---------------------------------------------------------------------------
# "Similar projects" experiment.
#
# For every target label and every app under test:
#   1. build a training set from the labelled review corpora plus issues of
#      the GitHub projects whose description is most similar to the app's,
#   2. fine-tune a fresh classifier on each of k stratified folds,
#   3. evaluate every fold's model on the reviews of the app under test.
# ---------------------------------------------------------------------------

# The tokenizer only depends on `deep_model`, so it is loaded once.
tokenizer = AutoTokenizer.from_pretrained(deep_model)

# The review corpora do not depend on the label or the test app: read them once.
review_frames = [pd.read_csv(data_path+f"{dataset}_{suffix}.csv").fillna("")
                 for dataset in datasets]
all_review_source = pd.concat(review_frames)

# Columns taken from the review file for the app under test.
test_columns = ['text_pre', 'label_b', 'label_f']

for target_label in labels:
    print(f"--------------------------- target label: {target_label} ---------------------------")
    tlabel = 'label_'+target_label[0]

    # A label without a matching column in the issues, the test columns or the
    # review corpora cannot be trained or evaluated; skip it explicitly
    # instead of failing with a KeyError part-way through the run.
    if (tlabel not in all_issues.columns
            or tlabel not in test_columns
            or tlabel not in all_review_source.columns):
        print(f"skipping '{target_label}': no '{tlabel}' column in the loaded data")
        continue

    for test_name in app_names:

        test_labels = []
        pred_labels = []
        test_sents = []

        print("***************************************************")
        print('test on: ', test_name)

        test_dataset = all_reviews[all_reviews['app_id'] == test_name][test_columns]
        print(test_dataset[tlabel].value_counts())

        # ------------------------- review datasets -------------------------
        # Explicit copy so that the column assignment below never writes
        # into a view of `all_review_source`.
        all_review_dataset = all_review_source[['app_name', 'text', tlabel]].copy()
        all_review_dataset[tlabel] = all_review_dataset[tlabel].map(int)

        # ---------------------- similar git datasets -----------------------
        git_app_description = git_basic_details[git_basic_details['id'] == pack_to_git[test_name]]['text'].values.tolist()[0]
        google_app_description_vector = desc_tfIdfVectorizer.transform([git_app_description]).toarray()[0]

        # One vectorised call: similarity of every project description to the
        # description of the app under test, then rank most-similar first.
        similarity = cosine_similarity(git_basic_descriptions_vectors,
                                       google_app_description_vector.reshape(1, -1)).ravel()
        similar_projects = git_basic_names[np.argsort(-similarity)]
        similar_projects = [x for x in similar_projects if x != pack_to_git[test_name]]

        # Add issues from the most similar projects until they amount to more
        # than 30% of the review data, then down-sample to exactly that size.
        thr = int(all_review_dataset.shape[0]*0.3)
        print("theresholad", thr)
        similar_frames = []
        similar_count = 0
        for sim in similar_projects:
            project_issues = all_issues[all_issues['id'] == sim][['text', tlabel]]
            similar_frames.append(project_issues)
            similar_count += project_issues.shape[0]
            if similar_count > thr:
                break
        if similar_frames:
            similar_issues = pd.concat(similar_frames)
        else:
            similar_issues = pd.DataFrame(columns=['text', tlabel])
        if similar_issues.shape[0] > thr:
            similar_issues = similar_issues.sample(n=thr, random_state=seed)

        # ------------------------- same git dataset ------------------------
        # NOTE(review): issues of the tested app's own repository are selected
        # but not added to the training set here -- confirm this is intended.
        same_issues = all_issues[all_issues['id'] == pack_to_git[test_name]]

        # -------------------------- merge datasets -------------------------
        dataset_train = pd.concat([all_review_dataset[['text', tlabel]], similar_issues[['text', tlabel]]])

        print("similar dataset: ")
        print(similar_issues[tlabel].value_counts())

        print("train dataset: ")
        print(dataset_train[tlabel].value_counts())

        set_x = dataset_train['text'].values.tolist()
        set_y = dataset_train[tlabel].values.tolist()

        # The test set is identical for every fold, so tokenise it once.
        test_sent_ = test_dataset['text_pre'].values.tolist()
        test_label = test_dataset[tlabel].values.tolist()
        test_sent = tokenizer(test_sent_, padding="max_length", max_length=max_length, truncation=True)
        test_ids = [x.ids for x in test_sent[::]]

        k = 5
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        kfold = kfold.split(set_x, set_y)

        result_pre = []
        result_rec = []
        result_f1 = []
        result_mic = []
        for fold_no, (train_index, val_index) in enumerate(kfold):
            print("------------------------------------------------------")
            print("repeat: ", fold_no)
            np.random.shuffle(train_index)
            np.random.shuffle(val_index)
            train_sent = [set_x[idx] for idx in train_index]
            train_label = [set_y[idx] for idx in train_index]
            val_sent = [set_x[idx] for idx in val_index]
            val_label = [set_y[idx] for idx in val_index]

            train_sent_ = tokenizer(train_sent, padding="max_length", max_length=max_length, truncation=True)
            val_sent_ = tokenizer(val_sent, padding="max_length", max_length=max_length, truncation=True)

            train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_sent_), train_label)).batch(b_size)
            val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_sent_), val_label)).batch(b_size)

            # A fresh pre-trained model per fold so folds do not share weights.
            model = TFAutoModelForSequenceClassification.from_pretrained(deep_model, num_labels=num_labels)
            optimizer = tf.keras.optimizers.Adam(learning_rate=l_r)
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics=tf.metrics.SparseCategoricalAccuracy()
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            # NOTE(review): with the configured epoch_no (3) smaller than
            # patience (5), early stopping cannot trigger -- confirm the
            # intended values.
            model.fit(train_dataset, validation_data=val_dataset, epochs=epoch_no, batch_size=b_size,
                       callbacks=[EarlyStopping(monitor='val_loss', verbose=1, patience=patience, min_delta=min_delta)])

            test_sents.extend(test_sent_)

            pred_label = model.predict(test_ids)
            pred_label = to_label_index(pred_label)

            test_labels.extend(test_label)
            pred_labels.extend(pred_label)

            result = evaluate_bi(test_label, pred_label, True)
            result_pre.append(result[0])
            result_rec.append(result[1])
            result_f1.append(result[2])
            result_mic.append(result)
            # Test sentences predicted as 0 whose gold label is 1.
            false_sents = [sent for sent, pred, gold in zip(test_sent_, pred_label, test_label)
                           if pred == 0 and gold == 1]
            print(false_sents)

        # Per-fold scores with a trailing 'mean' row.
        df_pre = pd.DataFrame(result_pre, index = range(k), columns=[target_label])
        df_pre.loc['mean'] = df_pre.mean()
        print("precision: ")
        print(df_pre)

        df_rec = pd.DataFrame(result_rec, index = range(k), columns=[target_label])
        df_rec.loc['mean'] = df_rec.mean()
        print("recall: ")
        print(df_rec)

        df_f1 = pd.DataFrame(result_f1, index = range(k), columns=[target_label])
        df_f1.loc['mean'] = df_f1.mean()
        print("f1-measure: ")
        print(df_f1)

        df_micro = pd.DataFrame(result_mic, index = range(k), columns=['precision', 'recall', 'f1-meature', 'accuracy'])
        df_micro.loc['mean'] = df_micro.mean()
        print("micro: \n")
        print(df_micro)

