# Google Drive Access



In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# project files can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!pip install -q pydrive

In [None]:
# Authentication of google drive account
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand PyDrive the application-default
# credentials obtained through oauth2client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which the mount cell imported as the
# google.colab.drive module; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

# Installing Necessary Libs

In [None]:
!pip install -q keras

In [None]:
!pip install simpletransformers

In [None]:
!pip install -q iterative-stratification

# Defining Some Necessary Functions




In [None]:
# For data cleaning purposes
def clean_text(text):
    """Flatten ``text`` onto a single line.

    Every newline and every carriage-return character is replaced by one
    space, so a CR+LF pair turns into two spaces. All other characters are
    returned unchanged.
    """
    # One pass over the string: the character class matches either line-break
    # character and each match becomes a single space.
    return re.sub(r"[\n\r]", " ", text)

In [None]:
# For measuring some evaluation metrics specific to the multi-label problem.
# They were used in our earlier introductory work at the SANER 2021 conference.

def accuracy_multilabel(y_true,y_pred):
    """Return the fraction of individual label decisions that are correct.

    Every (sample, label) cell of ``y_pred`` is compared with the same cell of
    ``y_true``; the matches are averaged over the labels and then over the
    samples.

    Args:
        y_true: Sequence of per-sample label rows (nested lists or a 2-D array).
        y_pred: Predictions laid out like ``y_true``.

    Returns:
        The label-wise accuracy as a float, or 0 when there are no samples or
        no labels to compare.
    """
    # Handle the empty cases up front: indexing y_pred[0] on an empty input
    # raised IndexError, and an empty y_true ended in a division by zero.
    total = len(y_true)
    if total == 0 or len(y_pred) == 0:
        return 0

    N = len(y_pred[0])
    if N == 0:
        return 0

    cnt = 0
    for i in range(total):
        cnt += sum(1 for j in range(N) if y_true[i][j] == y_pred[i][j])

    # Same order of division as before (per label first, then per sample).
    return (cnt / N) / total



def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Mean per-sample overlap between true and predicted label sets.

    For every sample the indices of the non-zero entries of the true row and
    of the predicted row are collected into two sets; the sample scores
    |intersection| / |union|, or 1 when both sets are empty. The result is the
    mean of those scores over all samples.

    ``normalize`` and ``sample_weight`` are accepted but not used.

    Adapted from http://stackoverflow.com/q/32239577/395857
    '''
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    def _sample_score(row):
        # Label indices that are switched on in each row.
        actual = set(np.where(y_true[row])[0])
        predicted = set(np.where(y_pred[row])[0])
        if not actual and not predicted:
            # Nothing expected and nothing predicted counts as a perfect match.
            return 1
        return len(actual & predicted) / float(len(actual | predicted))

    return np.mean([_sample_score(row) for row in range(y_true.shape[0])])


def exact_match(y_true,y_pred):
    """Return the exact-match ratio of a multi-label prediction.

    A sample counts as correct only when every one of its labels equals the
    corresponding true label; the result is the share of such samples.

    The previous implementation counted individual label cells, which made it
    return the same value as ``accuracy_multilabel`` instead of an exact-match
    figure.

    Args:
        y_true: Sequence of per-sample label rows (nested lists or a 2-D array).
        y_pred: Predictions laid out like ``y_true``.

    Returns:
        The fraction of fully correct samples as a float, or 0 when there are
        no predictions or no labels.
    """
    # An empty prediction list used to fail with IndexError on y_pred[0].
    total = len(y_pred)
    if total == 0:
        return 0

    N = len(y_pred[0])
    if N == 0:
        return 0

    accurate = sum(
        1
        for i in range(total)
        if all(y_true[i][j] == y_pred[i][j] for j in range(N))
    )

    return accurate / total

# Specifying Project Path

In [None]:
# Root folder of the project on the mounted Drive. Every dataset, pickle and
# model path in this notebook is built by appending to this string, so it must
# keep its trailing slash.
project_path = '/content/drive/My Drive/Code Documentation Project/'

# Importing Necessary Libs

In [None]:
# Importing necessary libs
from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel
import glob
import os
import gc
import keras.backend as K
import numpy as np
import pickle
import pandas as pd
import re
import string
import numpy as np
import pandas as pd


import glob

import os
#from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

import gc
import keras.backend as K
import numpy as np

import wandb

# Loading Dataset

In [None]:
## Reading all NEW labelled data

# reading the file names
all_files = glob.glob(project_path+"Documentation Smell (Extension)/Dataset/all labelled sample sets-extension/" + "/*.xlsx")
if not all_files:
    # Fail early with a clear message instead of an unrelated error further down.
    raise FileNotFoundError("No labelled .xlsx files found under " + project_path)

# Load every spreadsheet first and concatenate once: DataFrame.append was removed
# in pandas 2.0, and appending inside the loop re-copied the whole frame each time.
frames = []
for filename in all_files:
    df = pd.read_excel(filename)
    try:
        df['Documentation Text'] = df['Documentation Text'].apply(clean_text)
    except Exception as err:
        # Best effort, as before: report the offending file and keep its rows
        # uncleaned. The error is now shown, and KeyboardInterrupt is no longer
        # swallowed.
        print(filename, repr(err))
    frames.append(df)

main_df = pd.concat(frames, ignore_index=True)

# clearing memory
del all_files, frames, df

# selecting the required columns
selected_columns = ['Id', 'Method Prototype', 'Documentation Text', 'Fragmented', 'Tangled', 'Excessive Structured', 'Bloated', 'Lazy']
main_df = main_df[selected_columns]

# removing any unintentional null data: drop incomplete ROWS of the required
# columns. The earlier dropna(axis=1) removed whole columns, so a single empty
# cell in a required column made the column selection fail with a KeyError.
rows_before = len(main_df)
main_df = main_df.dropna(axis=0, how='any')
if len(main_df) != rows_before:
    print('Dropped', rows_before - len(main_df), 'rows with missing values')


# loading old labelled data (SANER conference)
saner_df = pd.read_excel(project_path + 'Documentation Smell (Extension)/Dataset/labelled_dataset_full_SANER.xlsx')

# merging NEW and OLD dataset
main_df = pd.concat([main_df, saner_df], ignore_index=True)

# clearing memory
del saner_df


# checking data by printing
print(main_df.head())

# Correcting Data Format to Fit with Transformer (simple transformer)

In [None]:
# Documentation text with line breaks flattened by clean_text.
text = main_df['Documentation Text'].map(clean_text)

# Columns at positions 3-7 are taken as the label matrix.
label = main_df.iloc[:, 3:8].values

# Model Specification

In [None]:
# Model family and pretrained checkpoint name; both are passed to
# MultiLabelClassificationModel in the training cell.
model_type = "bert"
model_name = "bert-base-cased"

# Options handed to the model as `args`.
# NOTE(review): the meaning of each key is defined by simpletransformers, not by
# this notebook; confirm against the library documentation before changing values.
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    # Both directories live under the project folder; best_model_dir is a
    # sub-folder of output_dir.
    "output_dir": project_path + "Models/Transformers/output/"+model_type,
    "best_model_dir": project_path + "Models/Transformers/output/"+model_type+"/best_model",
    # Early stopping is keyed on eval_loss (to be minimised) with a patience of 2.
    "use_early_stopping": True,
    "early_stopping_delta": 0.0,
    "early_stopping_metric": "eval_loss",
    "early_stopping_metric_minimize" : True,
    "early_stopping_patience" : 2,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 25,
    # Same wandb project name as the one given to wandb.sweep in the sweep cell.
    "wandb_project": "Simple Sweep",
    "wandb_kwargs": {"name": model_name},
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "evaluate_during_training_verbose" : True,
    "max_seq_length": 300,
    "num_train_epochs": 10
}



# Hyperparameter Tuning Specification

In [None]:
# Hyperparameter search definition for Weights & Biases: two learning rates and
# two training batch sizes under the "grid" method.
# NOTE(review): presumably grid search runs all 2 x 2 combinations; confirm in
# the wandb sweep documentation.
sweep_config = {
    "method": "grid",  # grid, random, bayes
    "metric": {"name": "train_loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"values": [5e-5, 3e-4]},
        "train_batch_size":{"values":[16,32]},
    },
}


# Register the sweep under the "Simple Sweep" project; the returned id is what
# wandb.agent receives in the training cell.
sweep_id = wandb.sweep(sweep_config, project="Simple Sweep")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: hbqqinol
Sweep URL: https://wandb.ai/junaed/Simple%20Sweep/sweeps/hbqqinol


# Iterative Cross Validation

In [None]:
X_train_all = np.array(text)
y_train_all = np.array(label)

print(X_train_all.shape)
print(y_train_all.shape)

num_cross_validation = 5 #1

# Five multilabel-stratified shuffle splits, each holding out 20% of the samples.
mskf = MultilabelStratifiedShuffleSplit(n_splits = num_cross_validation,test_size=0.2, random_state=42)

# Create the target folder if needed so the first open(..., 'wb') below cannot
# fail on a fresh Drive layout.
fold_dir = project_path + 'Documentation Smell (Extension)/Dataset/Pickles/Pickles of BERT on Extended Dataset/Train Test Fold/'
os.makedirs(fold_dir, exist_ok=True)

# Folds are numbered from 1; each one is stored as a
# (X_train, X_val, y_train, y_val) tuple in its own pickle file.
for Fold, (train, val) in enumerate(mskf.split(X_train_all, y_train_all), start=1):
    gc.collect()
    K.clear_session()
    print('Fold: ', Fold)

    X_train = X_train_all[train]
    X_val = X_train_all[val]
    y_train = y_train_all[train]
    y_val = y_train_all[val]

    with open(fold_dir + 'pickle_train_test_fold_' +str(Fold) + '.pickle','wb') as f:
        pickle.dump((X_train, X_val, y_train, y_val),f)

del main_df # To clear memory

In [None]:
pred_list=[]

Fold = 1

gc.collect()
K.clear_session()
print('Fold: ', Fold)

# NOTE: pickle can execute arbitrary code while loading; only open fold files
# that were written by this notebook.
with open(project_path + 'Documentation Smell (Extension)/Dataset/Pickles/Pickles of BERT on Extended Dataset/Train Test Fold/pickle_train_test_fold_' +str(Fold) + '.pickle','rb') as f:
    X_train, X_val, y_train, y_val = pickle.load(f)


def _to_labelled_frame(texts, labels):
    """Pair each text with its label row in a two-column ("text", "labels") frame."""
    return pd.DataFrame(list(zip(texts, labels)), columns=["text", "labels"])


train_val_df = _to_labelled_frame(X_train, y_train)
test_df = _to_labelled_frame(X_val, y_val)

# Hold out 10% of the training part for validation during training. The seed
# makes the split reproducible, matching the random_state used for the folds.
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)



# Training and Hyperparameter Tuning

In [None]:
# A training function with specified model and hyperparameter specifications
def train_function():
  """Train one sweep configuration.

  Builds a MultiLabelClassificationModel from the notebook-level model_type,
  model_name and train_args, using the current wandb.config as the sweep
  configuration, and trains it on train_df while evaluating against val_df.
  wandb.join() is called whether training finishes or raises.

  NOTE(review): the simpletransformers sweep example calls wandb.init() before
  reading wandb.config, and newer wandb releases expose join() as
  wandb.finish(); confirm both against the installed versions.
  """
  # Create a ClassificationModel
  model = MultiLabelClassificationModel(model_type, model_name, num_labels=5, args=train_args,sweep_config=wandb.config,)

  try:
    # Train the model
    model.train_model(train_df, eval_df=val_df)
  finally:
    # Close the run even when training fails, so it is not left open.
    wandb.join()

# Run wandb agent for hyperparameter tuning: the agent receives the id of the
# sweep registered above and the training function to call for its runs.
wandb.agent(sweep_id, train_function)

# Evaluation of Best Model

In [None]:
# NOTE(review): this loads from the training output_dir, not from the
# "best_model" sub-folder configured as best_model_dir; confirm which
# checkpoint is meant to be evaluated.
loaded_model = MultiLabelClassificationModel(
    "bert", project_path + "Models/Transformers/output/bert/"
)

pred, raw_outputs = loaded_model.predict(test_df['text'].tolist())


# Ground-truth labels as a 2-D (samples x labels) array. The 'labels' column holds
# one array per row, which the metric functions cannot interpret as a multilabel
# target when passed as a Series.
y_test_all = np.array(test_df['labels'].tolist())

# Threshold every prediction at 0.5 in one vectorised step.
pred_binary = (np.array(pred) > 0.5).astype(int)

# One entry per test sample. This name was previously appended to without ever
# being created, which raised NameError.
pred_binary_all = list(pred_binary)

In [None]:
from sklearn.metrics import classification_report,precision_recall_fscore_support
from sklearn.metrics import precision_score,recall_score,f1_score
# jaccard_similarity_score was removed from scikit-learn; jaccard_score is its
# replacement.
from sklearn.metrics import accuracy_score,jaccard_score, hamming_loss

# classification_report takes no `average` argument (passing one raises
# TypeError); the report already contains the micro/macro/weighted/samples
# average rows. Both inputs are coerced to 2-D arrays so they are read as
# multilabel indicator matrices. The names follow the label column order used
# when the label matrix was built.
report=classification_report(
    np.array(list(y_test_all)),
    np.array(pred_binary_all),
    target_names=['Fragmented', 'Tangled', 'Excessive Structured', 'Bloated', 'Lazy'],
)

print('Classification Report: '+str(report))