<a href="https://colab.research.google.com/github/MaxVortman/IDEA-code-clones/blob/master/colab/code_clones_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from scipy import sparse

import warnings
warnings.filterwarnings('ignore')

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def printMetrics(y_test, y_pred):
    """Print accuracy, precision, recall and ROC AUC for a set of predictions.

    Each metric is printed as a label line followed by the value returned by
    the corresponding scikit-learn scorer called as ``scorer(y_test, y_pred)``.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels; these same values are also what is handed to
            ``roc_auc_score``.
    """
    # (label, scorer) pairs in the order they are reported.
    labelled_scorers = (
        ('accuracy: ', accuracy_score),
        ('\nprecision: ', precision_score),
        ('\nrecall: ', recall_score),
        ('\nroc auc: ', roc_auc_score),
    )
    for label, scorer in labelled_scorers:
        print(label)
        print(scorer(y_test, y_pred))

In [0]:
def resample(X, y):
  """Undersample ``(X, y)`` with a ``RandomUnderSampler`` seeded with 23.

  Args:
    X: feature matrix forwarded to ``fit_resample``.
    y: target labels forwarded to ``fit_resample``.

  Returns:
    The ``(X_resampled, y_resampled)`` pair produced by ``fit_resample``.
  """
  # Fixed random_state keeps the selected subset the same across runs.
  sampler = RandomUnderSampler(random_state=23)
  X_balanced, y_balanced = sampler.fit_resample(X, y)
  return X_balanced, y_balanced

## Bag of words

In [4]:
# Mount Google Drive at /content/gdrive; the dataset paths used in later cells
# all live under this mount point. Requires the Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Load the bag-of-words feature matrix and its labels from the mounted drive.
bow_X_path = "/content/gdrive/My Drive/code-clones/csv/bagofwords_vectors_X.npz"
bow_y_path = "/content/gdrive/My Drive/code-clones/csv/bagofwords_vectors_y.npz"

sparse_bow_X = sparse.load_npz(bow_X_path)
# The label matrix is transposed after loading so that it has one row per sample.
sparse_bow_y = sparse.load_npz(bow_y_path).transpose()

In [6]:
sparse_bow_y.shape

(19990, 1)

In [7]:
sparse_bow_X.shape

(19990, 52736)

In [0]:
# Densify the sparse label column and flatten it to 1-D before resampling:
# toarray() yields an (n, 1) column vector, and passing that as the target
# relies on an implicit conversion whose warning is silenced by the global
# warnings filter at the top of the notebook.
bow_labels = sparse_bow_y.toarray().ravel()
X_resampled, y_resampled = resample(sparse_bow_X, bow_labels)
# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=23)

In [9]:
# Install hyperopt-sklearn from source (editable install of the cloned checkout).
# NOTE(review): this clones whatever the default branch currently points to, so
# the installed version is not pinned.
!git clone https://github.com/hyperopt/hyperopt-sklearn
# NOTE(review): %cd changes the kernel's working directory for all later cells;
# re-running this cell would therefore clone again inside the checkout.
%cd hyperopt-sklearn
!pip install -e .

Cloning into 'hyperopt-sklearn'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects:  14% (1/7)   [Kremote: Counting objects:  28% (2/7)   [Kremote: Counting objects:  42% (3/7)   [Kremote: Counting objects:  57% (4/7)   [Kremote: Counting objects:  71% (5/7)   [Kremote: Counting objects:  85% (6/7)   [Kremote: Counting objects: 100% (7/7)   [Kremote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects:  14% (1/7)   [Kremote: Compressing objects:  28% (2/7)   [Kremote: Compressing objects:  42% (3/7)   [Kremote: Compressing objects:  57% (4/7)   [Kremote: Compressing objects:  71% (5/7)   [Kremote: Compressing objects:  85% (6/7)   [Kremote: Compressing objects: 100% (7/7)   [Kremote: Compressing objects: 100% (7/7), done.[K
Receiving objects:   0% (1/1171)   Receiving objects:   1% (12/1171)   Receiving objects:   2% (24/1171)   Receiving objects:   3% (36/1171)   Receiving objects:   4% (47/1171)   Receiving objects:  

In [10]:
from hpsklearn import HyperoptEstimator, any_sparse_classifier, any_classifier
from hyperopt import tpe

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


## Custom loss

In [0]:
def calc_fnr_fpr(y_target, y_prediction):
  """Return the share of false negatives and false positives among all samples.

  NOTE: despite the name, both values are normalised by the total number of
  samples ``n`` (``fn / n`` and ``fp / n``), not by the number of actual
  positives / negatives. Callers in this notebook rely on that normalisation,
  so it is kept as is.

  Args:
    y_target: ground-truth binary labels (0/1). Any array-like is accepted:
      list, NumPy array (1-D or an ``(n, 1)`` column) or pandas Series with an
      arbitrary index.
    y_prediction: predicted binary labels (0/1), same number of elements.

  Returns:
    Tuple ``(fn / n, fp / n)`` of Python floats; ``(0.0, 0.0)`` for empty input.

  Raises:
    ValueError: if the two inputs do not contain the same number of elements.
  """
  # Convert to flat NumPy arrays so access is positional: indexing a pandas
  # Series with y[i] is label-based and fails when the index is not 0..n-1.
  target = np.asarray(y_target).ravel()
  prediction = np.asarray(y_prediction).ravel()
  if target.shape != prediction.shape:
    raise ValueError(
        'y_target and y_prediction must have the same number of elements, '
        'got %d and %d' % (target.size, prediction.size))

  n = target.size
  if n == 0:
    # Nothing to count; avoid dividing by zero.
    return (0.0, 0.0)

  fn = int(np.count_nonzero((prediction == 0) & (target == 1)))
  fp = int(np.count_nonzero((prediction == 1) & (target == 0)))
  return (fn / n, fp / n)

In [0]:
def custom_loss(y_target, y_prediction, k_fn=1, k_fp=10):
  """Weighted error used as the hyperopt objective (lower is better).

  Combines the two values returned by ``calc_fnr_fpr`` as
  ``k_fn * fnr + k_fp * fpr``. With the default weights a false positive
  costs ten times as much as a false negative.

  Args:
    y_target: ground-truth binary labels.
    y_prediction: predicted binary labels.
    k_fn: weight applied to the false-negative share (default 1).
    k_fp: weight applied to the false-positive share (default 10).

  Returns:
    The weighted sum as a float.
  """
  fnr, fpr = calc_fnr_fpr(y_target, y_prediction)
  return k_fn * fnr + k_fp * fpr

In [0]:
def hyperopt_custom_loss_estimator(X_train, y_train, sparse=True):
  """Fit a ``HyperoptEstimator`` that minimises ``custom_loss``.

  Args:
    X_train: training features.
    y_train: training labels.
    sparse: when true the search space is ``any_sparse_classifier('clf')``,
      otherwise ``any_classifier('clf')``. NOTE(review): inside this function
      the parameter name shadows the module-level ``scipy.sparse`` import.

  Returns:
    The fitted ``HyperoptEstimator``.
  """
  if sparse:
    search_space = any_sparse_classifier('clf')
  else:
    search_space = any_classifier('clf')

  search = HyperoptEstimator(
      classifier=search_space,
      preprocessing=[],      # no preprocessing steps are searched
      algo=tpe.suggest,
      trial_timeout=300,     # presumably seconds per trial - confirm in hpsklearn docs
      loss_fn=custom_loss,
      seed=23,
  )
  search.fit(X_train, y_train)
  return search

## Custom loss bow

In [14]:
# Run the hyperopt search on the bag-of-words training split. `sparse` is left
# at its default (True), so the sparse-classifier search space is used.
estim = hyperopt_custom_loss_estimator(X_train, y_train)

100%|██████████| 1/1 [01:10<00:00, 70.92s/it, best loss: 1.7004297114794351]
100%|██████████| 1/1 [00:00<00:00,  2.02it/s, best loss: 1.7004297114794351]
100%|██████████| 1/1 [00:00<00:00, 12.25it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00,  2.41it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:04<00:00,  4.30s/it, best loss: 1.4223449969306323]
100%|██████████| 1/1 [01:09<00:00, 69.96s/it, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00, 15.71it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00, 15.44it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00, 16.27it/s, best loss: 1.4223449969306323]


In [0]:
# Predict labels for the held-out bag-of-words test split with the fitted estimator.
y_pred = estim.predict(X_test)

In [16]:
calc_fnr_fpr(y_test, y_pred)

(0.1285451197053407, 0.1406998158379374)

In [17]:
printMetrics(y_test, y_pred)

accuracy: 
0.7307550644567219

precision: 
0.730225988700565

recall: 
0.74765003615329

roc auc: 
0.7304316246832515


In [18]:
estim.best_model()

{'ex_preprocs': (),
 'learner': SGDClassifier(alpha=0.06118428364662971, average=False, class_weight=None,
        early_stopping=False, epsilon=0.1, eta0=0.0003970052556398526,
        fit_intercept=True, l1_ratio=0.5022724095962902,
        learning_rate='constant', loss='log', max_iter=158684350.0,
        n_iter=None, n_iter_no_change=5, n_jobs=1, penalty='l2',
        power_t=0.907051629874976, random_state=0, shuffle=True,
        tol=0.006308140398304822, validation_fraction=0.1, verbose=False,
        warm_start=False),
 'preprocs': ()}

In [0]:
# Raise the decision threshold in small steps until the false-positive share
# reported by calc_fnr_fpr (fp / n over all samples) is within FPR_TOLERANCE
# of TARGET_FPR.
# NOTE(review): the threshold is tuned on X_test / y_test, the same split the
# metrics below are reported on - consider tuning on a separate validation split.
TARGET_FPR = 0.10
FPR_TOLERANCE = 0.005
THR_START = 0.5
THR_STEP = 0.0005

# Probability of the positive class (column 1) for every test sample.
y_prob = [row[1] for row in estim.best_model()['learner'].predict_proba(X_test)]

n_steps = 0
thr = THR_START
y_pred = [int(p >= thr) for p in y_prob]
_, fpr = calc_fnr_fpr(y_test, y_pred)
while fpr - TARGET_FPR >= FPR_TOLERANCE:
  n_steps += 1
  # Derive the threshold from the step count instead of repeatedly adding the
  # step, so floating-point error does not accumulate over many iterations.
  thr = THR_START + n_steps * THR_STEP
  y_pred = [int(p >= thr) for p in y_prob]
  _, fpr = calc_fnr_fpr(y_test, y_pred)

In [30]:
thr

0.5199999999999978

In [31]:
calc_fnr_fpr(y_test, y_pred)

(0.17605893186003682, 0.10460405156537753)

In [32]:
printMetrics(y_test, y_pred)

accuracy: 
0.7193370165745856

precision: 
0.7611438183347351

recall: 
0.6543745480838756

roc auc: 
0.7205806674353312


## code2vec

In [0]:
# code2vec vector exports, one CSV per file listed below, read from the mounted drive.
c2v_csv_paths = (
    "/content/gdrive/My Drive/code-clones/csv/lwjgl3_code2vec_vectors1.csv",
    "/content/gdrive/My Drive/code-clones/csv/lwjgl3_code2vec_vectors2.csv",
    "/content/gdrive/My Drive/code-clones/csv/lwjgl_code2vec_vectors.csv",
    "/content/gdrive/My Drive/code-clones/csv/spring_code2vec_vectors.csv",
    "/content/gdrive/My Drive/code-clones/csv/jenkins_code2vec_vectors.csv",
    "/content/gdrive/My Drive/code-clones/csv/rxjava_code2vec_vectors.csv",
)
# Read the files in the listed order; the individual frames keep their names.
df1, df2, df3, df4, df5, df6 = [pd.read_csv(csv_path) for csv_path in c2v_csv_paths]
# Stack all frames row-wise into a single dataset (row indices are not reset).
df_c2v = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)

In [34]:
df_c2v.head()

Unnamed: 0,exp,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384
0,0.0,0.708866,-0.139283,0.790398,0.880303,0.291317,0.553174,0.823598,-0.693108,-0.379951,-0.536196,-0.057756,-0.763562,-0.78196,-0.448249,-0.533357,-0.451522,-0.065547,0.545325,0.905961,0.846379,0.947172,-0.874068,0.936903,-0.440319,0.009643,-0.558544,0.288042,-0.060373,0.068816,-0.541809,-0.713808,0.880594,-0.628212,-0.87276,0.714413,-0.542334,0.740742,-0.198978,-0.450086,...,0.075166,-0.686472,0.770341,0.88623,0.070973,0.990748,0.556059,-0.979709,0.71664,-0.159755,-0.584313,0.86368,-0.305081,0.005122,0.20805,-0.919549,0.481194,-0.905497,0.054141,0.132897,-0.831053,0.891648,0.524972,-0.797217,0.006505,-0.80547,0.439665,-0.932609,-0.97223,0.444506,0.409421,-0.743124,-0.406415,0.34819,-0.722999,0.883331,0.975385,0.811376,0.529098,-0.580801
1,0.0,0.45757,0.465724,0.04987,0.329546,0.185015,-0.367061,0.646942,0.066054,-0.377213,0.211417,-0.618707,0.051697,-0.09821,0.367054,-0.714161,-0.27363,0.390221,0.230662,0.526845,-0.471009,-0.086963,0.647146,-0.149217,0.850062,0.063569,-0.201275,-0.523791,0.217598,0.393475,-0.2423,0.048727,0.385479,-0.608135,-0.076953,0.053418,0.257176,-0.541758,-0.838411,0.164181,...,0.243302,-0.210837,-0.267095,-0.873269,-0.082603,-0.095414,-0.132618,-0.470093,-0.649802,0.424356,0.059514,-0.261998,-0.033176,0.04918,0.144208,0.05668,0.152013,-0.175859,-0.114169,0.529386,0.405167,0.616029,-0.260005,-0.537304,0.111804,0.100389,0.545154,-0.169261,0.353477,0.224362,0.422311,0.542749,0.726254,0.081205,-0.139942,-0.282454,0.584696,0.31682,0.038821,-0.080841
2,0.0,0.674882,-0.227605,-0.05146,0.322587,0.113033,-0.206298,0.618875,-0.831079,0.467831,0.223961,-0.474329,-0.135923,0.431246,-0.218008,-0.649191,0.012374,0.336949,0.886156,0.454673,-0.639669,0.443773,0.031497,0.345675,0.482762,-0.693035,0.416126,-0.515968,0.172227,0.217237,-0.31185,-0.614717,0.240569,-0.55482,-0.054012,-0.354445,-0.449013,-0.17667,-0.092752,-0.221199,...,-0.175787,-0.18485,-0.531791,-0.455446,-0.020367,0.270404,0.041985,-0.55016,0.767969,0.171719,-0.247429,-0.261586,-0.07214,-0.336018,-0.227835,-0.287362,-0.423872,-0.743692,0.063031,0.646673,-0.507511,0.666606,-0.274423,0.246878,-0.009419,-0.132733,0.432687,0.648385,0.087229,-0.328465,0.380035,-0.358241,0.168523,-0.115633,0.013167,-0.600655,-0.152489,0.309746,0.104927,-0.33848
3,0.0,0.238518,0.239017,-0.329432,0.076081,0.674525,0.240891,-0.138851,0.377708,-0.052835,0.758993,0.212176,0.31909,0.772388,-0.185795,0.141485,-0.263851,-0.114226,0.478966,-0.43692,-0.188591,-0.345383,0.206126,-0.520585,0.897969,0.783976,0.424795,0.498398,-0.557342,0.319309,-0.687786,0.658537,0.692565,0.06352,0.564135,0.445329,-0.448139,-0.636677,0.653752,-0.661554,...,0.37313,-0.401377,-0.644872,0.041106,0.609853,-0.099505,0.522919,-0.703145,0.073218,-0.380949,-0.728012,-0.726868,-0.303032,-0.133776,0.549605,-0.556467,0.017998,-0.53082,-0.10609,-0.76048,-0.038897,0.251707,0.725792,0.145969,0.424751,-0.584596,-0.767587,0.301838,0.746892,-0.455481,-0.629672,-0.818671,-0.24046,0.834071,-0.167629,-0.331171,-0.589584,0.668989,0.260057,0.465806
4,0.0,-0.261068,-0.683279,-0.386828,0.031994,-0.891767,0.202983,-0.022529,0.667178,0.515003,0.826141,-0.176847,-0.039715,-0.733276,0.443135,-0.023443,-0.327092,-0.362199,0.233987,0.319233,0.29928,-0.234696,0.303305,0.170403,-0.224727,0.483681,-0.372073,0.648301,-0.407349,-0.432478,0.606037,-0.137597,0.223075,0.147892,-0.423443,-0.44104,-0.332034,-0.342627,0.125737,-0.40785,...,-0.610979,-0.784399,0.278932,-0.380384,-0.457574,0.915914,0.290403,-0.275704,-0.058648,0.762267,-0.22762,0.422656,0.326875,-0.433874,0.615914,0.239106,-0.050791,-0.4105,-0.014974,0.270726,-0.353269,0.280647,-0.001562,0.455043,0.148662,0.32071,-0.886484,0.124052,0.072454,0.706325,-0.393192,0.125753,0.020193,0.403067,-0.779176,0.066885,0.953279,-0.108147,0.251376,-0.54375


In [0]:
# Split the code2vec frame into the label column ('exp') and the feature columns.
y = df_c2v['exp']
# Use the `columns=` keyword: passing the axis positionally (drop('exp', 1)) is
# deprecated in pandas 1.x and raises TypeError in pandas 2.x.
X = df_c2v.drop(columns='exp')
X_resampled, y_resampled = resample(X, y)
# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=23)

## Custom loss code2vec

In [36]:
# Run the hyperopt search on the code2vec training split; sparse=False selects
# the general classifier search space (the features come from a DataFrame).
estim = hyperopt_custom_loss_estimator(X_train, y_train, sparse=False)

100%|██████████| 1/1 [00:13<00:00, 13.23s/it, best loss: 2.9465317919075145]
100%|██████████| 1/1 [00:11<00:00, 11.01s/it, best loss: 2.79335260115607]
100%|██████████| 1/1 [00:18<00:00, 18.91s/it, best loss: 2.79335260115607]
100%|██████████| 1/1 [01:14<00:00, 74.96s/it, best loss: 2.79335260115607]
100%|██████████| 1/1 [00:12<00:00, 12.51s/it, best loss: 2.0209537572254335]
100%|██████████| 1/1 [00:16<00:00, 16.33s/it, best loss: 2.0209537572254335]
100%|██████████| 1/1 [00:00<00:00,  1.95it/s, best loss: 1.9342485549132948]
100%|██████████| 1/1 [01:34<00:00, 94.50s/it, best loss: 1.9342485549132948]
100%|██████████| 1/1 [00:01<00:00,  1.46s/it, best loss: 1.8229768786127167]
100%|██████████| 1/1 [00:03<00:00,  3.90s/it, best loss: 1.8229768786127167]


In [0]:
# Predict labels for the held-out code2vec test split with the fitted estimator.
y_pred = estim.predict(X_test)

In [38]:
calc_fnr_fpr(y_test, y_pred)

(0.07368877329865627, 0.19592544429995665)

In [39]:
printMetrics(y_test, y_pred)

accuracy: 
0.7303857824013871

precision: 
0.6861111111111111

recall: 
0.853195164075993

roc auc: 
0.7299048057107556


In [40]:
estim.best_model()

{'ex_preprocs': (),
 'learner': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.005891891362147046, loss='deviance',
               max_depth=None, max_features='log2', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=8, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=16,
               n_iter_no_change=None, presort='auto', random_state=0,
               subsample=1.0, tol=0.0001, validation_fraction=0.1,
               verbose=0, warm_start=False),
 'preprocs': ()}

In [0]:
# Raise the decision threshold in small steps until the false-positive share
# reported by calc_fnr_fpr (fp / n over all samples) is within FPR_TOLERANCE
# of TARGET_FPR.
# NOTE(review): the threshold is tuned on X_test / y_test, the same split the
# metrics below are reported on - consider tuning on a separate validation split.
TARGET_FPR = 0.10
FPR_TOLERANCE = 0.005
THR_START = 0.5
THR_STEP = 0.0005

# Probability of the positive class (column 1) for every test sample.
y_prob = [row[1] for row in estim.best_model()['learner'].predict_proba(X_test)]

n_steps = 0
thr = THR_START
y_pred = [int(p >= thr) for p in y_prob]
_, fpr = calc_fnr_fpr(y_test, y_pred)
while fpr - TARGET_FPR >= FPR_TOLERANCE:
  n_steps += 1
  # Derive the threshold from the step count instead of repeatedly adding the
  # step, so floating-point error does not accumulate over many iterations.
  thr = THR_START + n_steps * THR_STEP
  y_pred = [int(p >= thr) for p in y_prob]
  _, fpr = calc_fnr_fpr(y_test, y_pred)

In [42]:
thr

0.5084999999999991

In [43]:
calc_fnr_fpr(y_test, y_pred)

(0.1729518855656697, 0.10489813610749892)

In [44]:
printMetrics(y_test, y_pred)

accuracy: 
0.7221499783268314

precision: 
0.7582417582417582

recall: 
0.655440414507772

roc auc: 
0.7224112429370888
