<a href="https://colab.research.google.com/github/MaxVortman/ASCIIrecipient/blob/master/code_clones_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression

In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from scipy import sparse

import warnings
warnings.filterwarnings('ignore')

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def printMetrics(y_test, y_pred):
    """Print accuracy, precision and recall of ``y_pred`` against ``y_test``.

    Each metric is written as a heading line followed by the score on its own
    line; the precision and recall headings are preceded by a blank line.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels.
    """
    # (heading, scorer) pairs, in the order they are reported.
    reports = (
        ('accuracy: ', accuracy_score),
        ('\nprecision: ', precision_score),
        ('\nrecall: ', recall_score),
    )
    for heading, scorer in reports:
        print(heading)
        print(scorer(y_test, y_pred))

In [0]:
def resample(X, y):
  """Balance the classes of ``(X, y)`` by random under-sampling.

  Args:
    X: feature matrix.
    y: class labels aligned with the rows of ``X``.

  Returns:
    Whatever ``RandomUnderSampler.fit_resample`` returns for ``(X, y)``,
    using the fixed seed 23 so that repeated calls select the same rows.
  """
  return RandomUnderSampler(random_state=23).fit_resample(X, y)

In [0]:
def logit_with_gs(df, C=1):
  """Fit a logistic regression on a class-balanced sample of ``df`` and print its metrics.

  The ``exp`` column is used as the label and every other column as a feature.
  The data is under-sampled with ``resample``, the per-class counts are
  printed, and the balanced data is split with ``train_test_split`` (default
  test size, ``random_state=23``). Accuracy, precision and recall on the
  held-out part are printed via ``printMetrics``.

  Args:
    df: DataFrame containing an ``exp`` label column plus feature columns.
    C: value forwarded to ``LogisticRegression(C=...)``.

  Returns:
    None; all results are printed.
  """
  y = df['exp']
  # Keyword form: passing ``axis`` positionally to ``DataFrame.drop`` was
  # deprecated in pandas 1.x and is rejected by pandas 2.0.
  X = df.drop(columns='exp')
  X_resampled, y_resampled = resample(X, y)
  print(sorted(Counter(y_resampled).items()))
  X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=23)
  clf = LogisticRegression(random_state=23, C=C)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  printMetrics(y_test, y_pred)
  # Disabled grid search over C, kept for reference:
  #parameters = {'C':[0.01, 0.03, 0.1, 0.3, 1, 3]}
  #best_clf = GridSearchCV(LogisticRegression(random_state=23), parameters, cv=10)
  #best_clf.fit(X_resampled, y_resampled)
  #print(best_clf.best_estimator_)
  #print(best_clf.best_score_)

## Bag of words

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Load the bag-of-words feature matrix and its labels, both stored as SciPy sparse .npz files.
sparse_bow_X = sparse.load_npz("/content/gdrive/My Drive/code-clones/csv/bagofwords_vectors_X.npz")
# The stored label matrix is transposed so that it has one row per sample, like the features.
sparse_bow_y = sparse.load_npz("/content/gdrive/My Drive/code-clones/csv/bagofwords_vectors_y.npz").T

In [0]:
sparse_bow_y.shape

(19990, 1)

In [0]:
sparse_bow_X.shape

(19990, 52736)

In [0]:
# Balance the classes (features stay sparse, labels are densified first) ...
X_resampled, y_resampled = resample(sparse_bow_X, sparse_bow_y.toarray())
#print(sorted(Counter(y_resampled.T).items()))
# ... then hold out a test part using train_test_split's default test size and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=23)

In [0]:
!git clone https://github.com/hyperopt/hyperopt-sklearn
%cd hyperopt-sklearn
!pip install -e .

Cloning into 'hyperopt-sklearn'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 1171 (delta 1), reused 4 (delta 0), pack-reused 1164[K
Receiving objects: 100% (1171/1171), 2.00 MiB | 19.71 MiB/s, done.
Resolving deltas: 100% (708/708), done.
/content/hyperopt-sklearn
Obtaining file:///content/hyperopt-sklearn
Installing collected packages: hpsklearn
  Running setup.py develop for hpsklearn
Successfully installed hpsklearn


In [0]:
from hpsklearn import HyperoptEstimator, any_sparse_classifier, any_classifier
from hyperopt import tpe

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


## Custom loss

In [0]:
def calc_fnr_fpr(y_target, y_prediction):
  """Return the shares of false negatives and false positives among all samples.

  Note that both counts are divided by the total number of samples ``n``,
  not by the number of actual positives / negatives, so the values are
  "fraction of all samples that are FN / FP" rather than textbook FNR / FPR.

  Args:
    y_target: ground-truth binary labels (0 / 1); any array-like, including a
      pandas Series with an arbitrary index or an ``(n, 1)`` column.
    y_prediction: predicted binary labels with the same number of elements.

  Returns:
    Tuple ``(fn / n, fp / n)`` of Python floats.

  Raises:
    ValueError: if the inputs are empty or differ in length.
  """
  # Flatten to positional 1-D arrays so that pandas index labels or an
  # (n, 1) column shape cannot affect element alignment.
  target = np.asarray(y_target).ravel()
  prediction = np.asarray(y_prediction).ravel()
  n = target.shape[0]
  if n == 0:
    raise ValueError('calc_fnr_fpr requires at least one sample')
  if prediction.shape[0] != n:
    raise ValueError('y_target and y_prediction must have the same length')
  fn = int(np.count_nonzero((prediction == 0) & (target == 1)))
  fp = int(np.count_nonzero((prediction == 1) & (target == 0)))
  return (fn / n, fp / n)

In [0]:
def custom_loss(y_target, y_prediction, k_fn=1, k_fp=10):
  """Weighted sum of the two error shares returned by ``calc_fnr_fpr``.

  With the default weights a false positive costs ten times as much as a
  false negative.

  Args:
    y_target: ground-truth binary labels.
    y_prediction: predicted binary labels.
    k_fn: weight of the false-negative share.
    k_fp: weight of the false-positive share.

  Returns:
    ``k_fn * fnr + k_fp * fpr`` where ``(fnr, fpr)`` is the result of
    ``calc_fnr_fpr(y_target, y_prediction)``.
  """
  (fnr, fpr) = calc_fnr_fpr(y_target, y_prediction)
  return k_fn * fnr + k_fp * fpr

In [0]:
def hyperopt_custom_loss_estimator(X_train, y_train, sparse=True):
  """Fit a ``HyperoptEstimator`` that is scored with ``custom_loss``.

  Args:
    X_train: training features.
    y_train: training labels.
    sparse: when true the search space is ``any_sparse_classifier``,
      otherwise ``any_classifier``. Inside this function the name shadows
      the module-level ``scipy.sparse`` import.

  Returns:
    The fitted ``HyperoptEstimator``.
  """
  if sparse:
    search_space = any_sparse_classifier('clf')
  else:
    search_space = any_classifier('clf')

  # TPE search, no preprocessing steps, 300 as the per-trial timeout, fixed seed.
  estimator = HyperoptEstimator(
      classifier=search_space,
      preprocessing=[],
      algo=tpe.suggest,
      trial_timeout=300,
      loss_fn=custom_loss,
      seed=23,
  )
  estimator.fit(X_train, y_train)
  return estimator

## Custom loss bow

In [0]:
# Search the sparse-classifier space with the custom loss on the current training split ...
estim = hyperopt_custom_loss_estimator(X_train, y_train)
y_pred = estim.predict(X_test)
# ... and display (fn / n, fp / n) measured on the held-out split.
calc_fnr_fpr(y_test, y_pred)

100%|██████████| 1/1 [01:22<00:00, 82.20s/it, best loss: 1.7004297114794351]
100%|██████████| 1/1 [00:00<00:00,  1.71it/s, best loss: 1.7004297114794351]
100%|██████████| 1/1 [00:00<00:00,  9.83it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00,  1.96it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:04<00:00,  4.43s/it, best loss: 1.4223449969306323]
100%|██████████| 1/1 [01:14<00:00, 74.37s/it, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00, 12.23it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00, 12.97it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00,  3.31it/s, best loss: 1.4223449969306323]
100%|██████████| 1/1 [00:00<00:00, 12.52it/s, best loss: 1.4223449969306323]


(0.1285451197053407, 0.1406998158379374)

In [0]:
estim.best_model()

{'ex_preprocs': (),
 'learner': SGDClassifier(alpha=0.06118428364662971, average=False, class_weight=None,
        early_stopping=False, epsilon=0.1, eta0=0.0003970052556398526,
        fit_intercept=True, l1_ratio=0.5022724095962902,
        learning_rate='constant', loss='log', max_iter=158684350.0,
        n_iter=None, n_iter_no_change=5, n_jobs=1, penalty='l2',
        power_t=0.907051629874976, random_state=0, shuffle=True,
        tol=0.006308140398304822, validation_fraction=0.1, verbose=False,
        warm_start=False),
 'preprocs': ()}

## Hyperopt bow

In [0]:
# Same search as the custom-loss run but without `loss_fn`, so the estimator's own default loss is used.
estim = HyperoptEstimator(classifier=any_sparse_classifier('clf'),
                          preprocessing=[],
                          algo=tpe.suggest, 
                          trial_timeout=300,
                          seed=23)
estim.fit(X_train, y_train)
y_pred = estim.predict(X_test)
# Report accuracy / precision / recall on the held-out split.
printMetrics(y_test, y_pred)

100%|██████████| 1/1 [01:10<00:00, 70.73s/it, best loss: 0.3136893799877225]
100%|██████████| 1/1 [00:00<00:00,  2.22it/s, best loss: 0.3063228974831185]
100%|██████████| 1/1 [00:00<00:00, 12.71it/s, best loss: 0.26212400245549416]
100%|██████████| 1/1 [00:00<00:00,  2.31it/s, best loss: 0.26212400245549416]
100%|██████████| 1/1 [00:04<00:00,  4.20s/it, best loss: 0.2523020257826888]
100%|██████████| 1/1 [01:10<00:00, 70.10s/it, best loss: 0.2523020257826888]
100%|██████████| 1/1 [00:00<00:00, 15.39it/s, best loss: 0.2523020257826888]
100%|██████████| 1/1 [00:00<00:00, 15.55it/s, best loss: 0.2523020257826888]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s, best loss: 0.2523020257826888]
100%|██████████| 1/1 [00:00<00:00, 15.54it/s, best loss: 0.2523020257826888]
accuracy: 
0.7524861878453039

precision: 
0.7416723317471108

recall: 
0.7888647866955893


In [0]:
# Logistic regression with default hyper-parameters on the same split, for comparison with the hyperopt result.
clf = LogisticRegression(random_state=23)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
printMetrics(y_test, y_pred)

accuracy: 
0.7473296500920811

precision: 
0.744218640504555

recall: 
0.7678958785249458


## code2vec

In [0]:
# Load the code2vec vector CSVs, one DataFrame per file.
df1 = pd.read_csv("/content/gdrive/My Drive/code-clones/csv/lwjgl3_code2vec_vectors1.csv")
df2 = pd.read_csv("/content/gdrive/My Drive/code-clones/csv/lwjgl3_code2vec_vectors2.csv")
df3 = pd.read_csv("/content/gdrive/My Drive/code-clones/csv/lwjgl_code2vec_vectors.csv")
df4 = pd.read_csv("/content/gdrive/My Drive/code-clones/csv/spring_code2vec_vectors.csv")
df5 = pd.read_csv("/content/gdrive/My Drive/code-clones/csv/jenkins_code2vec_vectors.csv")
df6 = pd.read_csv("/content/gdrive/My Drive/code-clones/csv/rxjava_code2vec_vectors.csv")
# Stack all files row-wise; each frame keeps its own row index, so index labels may repeat in df_c2v.
df_c2v = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)

In [0]:
df_c2v.head()

Unnamed: 0,exp,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,384
0,0.0,0.708866,-0.139283,0.790398,0.880303,0.291317,0.553174,0.823598,-0.693108,-0.379951,...,0.409421,-0.743124,-0.406415,0.34819,-0.722999,0.883331,0.975385,0.811376,0.529098,-0.580801
1,0.0,0.45757,0.465724,0.04987,0.329546,0.185015,-0.367061,0.646942,0.066054,-0.377213,...,0.422311,0.542749,0.726254,0.081205,-0.139942,-0.282454,0.584696,0.31682,0.038821,-0.080841
2,0.0,0.674882,-0.227605,-0.05146,0.322587,0.113033,-0.206298,0.618875,-0.831079,0.467831,...,0.380035,-0.358241,0.168523,-0.115633,0.013167,-0.600655,-0.152489,0.309746,0.104927,-0.33848
3,0.0,0.238518,0.239017,-0.329432,0.076081,0.674525,0.240891,-0.138851,0.377708,-0.052835,...,-0.629672,-0.818671,-0.24046,0.834071,-0.167629,-0.331171,-0.589584,0.668989,0.260057,0.465806
4,0.0,-0.261068,-0.683279,-0.386828,0.031994,-0.891767,0.202983,-0.022529,0.667178,0.515003,...,-0.393192,0.125753,0.020193,0.403067,-0.779176,0.066885,0.953279,-0.108147,0.251376,-0.54375


In [0]:
# Separate the `exp` label from the code2vec feature columns.
y = df_c2v['exp']
# Keyword form: passing `axis` positionally to DataFrame.drop was deprecated
# in pandas 1.x and is rejected by pandas 2.0.
X = df_c2v.drop(columns='exp')
# Balance the classes, then hold out a test part (default test size, fixed seed).
X_resampled, y_resampled = resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=23)

## Custom loss code2vec

In [0]:
X_train

array([[-0.01438679,  0.3345368 , -0.8996175 , ..., -0.5302292 ,
         0.8585713 , -0.25103238],
       [ 0.01357042,  0.51884558, -0.90628993, ..., -0.2826493 ,
         0.73221474,  0.57430793],
       [ 0.82422745,  0.58516496,  0.69403327, ..., -0.6134136 ,
        -0.5360763 , -0.46073088],
       ...,
       [ 0.32086062,  0.40459627, -0.63095433, ...,  0.45880777,
         0.56039832, -0.29471198],
       [-0.1270637 ,  0.29226178, -0.9327881 , ..., -0.5400725 ,
         0.82426435, -0.0014705 ],
       [ 0.16497834, -0.01034388, -0.16908072, ..., -0.47794458,
         0.52750588, -0.24562948]])

In [0]:
# The current training split is dense, so search the general (non-sparse) classifier space with the custom loss ...
estim = hyperopt_custom_loss_estimator(X_train, y_train, sparse=False)
y_pred = estim.predict(X_test)
# ... and display (fn / n, fp / n) measured on the held-out split.
calc_fnr_fpr(y_test, y_pred)

100%|██████████| 1/1 [00:13<00:00, 13.30s/it, best loss: 2.9465317919075145]
100%|██████████| 1/1 [00:11<00:00, 11.05s/it, best loss: 2.79335260115607]
100%|██████████| 1/1 [00:18<00:00, 18.54s/it, best loss: 2.79335260115607]
100%|██████████| 1/1 [01:16<00:00, 76.55s/it, best loss: 2.79335260115607]
100%|██████████| 1/1 [00:12<00:00, 12.26s/it, best loss: 2.0209537572254335]
100%|██████████| 1/1 [00:15<00:00, 15.58s/it, best loss: 2.0209537572254335]
100%|██████████| 1/1 [00:00<00:00,  2.57it/s, best loss: 1.9342485549132948]
100%|██████████| 1/1 [01:39<00:00, 99.01s/it, best loss: 1.9342485549132948]
100%|██████████| 1/1 [00:01<00:00,  1.68s/it, best loss: 1.8229768786127167]
100%|██████████| 1/1 [00:03<00:00,  3.95s/it, best loss: 1.8229768786127167]


(0.07368877329865627, 0.19592544429995665)

In [0]:
estim.best_model()

{'ex_preprocs': (),
 'learner': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.005891891362147046, loss='deviance',
               max_depth=None, max_features='log2', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=8, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=16,
               n_iter_no_change=None, presort='auto', random_state=0,
               subsample=1.0, tol=0.0001, validation_fraction=0.1,
               verbose=0, warm_start=False),
 'preprocs': ()}

## Hyperopt code2vec

In [0]:
# Same search as the custom-loss run but without `loss_fn`, so the estimator's own default loss is used.
estim = HyperoptEstimator(classifier=any_classifier('clf'),
                          preprocessing=[],
                          algo=tpe.suggest, 
                          trial_timeout=300,
                          seed=23)
estim.fit(X_train, y_train)
y_pred = estim.predict(X_test)
# Report accuracy / precision / recall on the held-out split.
printMetrics(y_test, y_pred)

100%|██████████| 1/1 [00:12<00:00, 12.76s/it, best loss: 0.3128612716763006]
100%|██████████| 1/1 [00:10<00:00, 10.28s/it, best loss: 0.30924855491329484]
100%|██████████| 1/1 [00:15<00:00, 15.36s/it, best loss: 0.3085260115606936]
100%|██████████| 1/1 [01:13<00:00, 73.91s/it, best loss: 0.3085260115606936]
100%|██████████| 1/1 [00:11<00:00, 11.26s/it, best loss: 0.3085260115606936]
100%|██████████| 1/1 [00:14<00:00, 14.06s/it, best loss: 0.3085260115606936]
100%|██████████| 1/1 [00:00<00:00,  2.65it/s, best loss: 0.28251445086705207]
100%|██████████| 1/1 [01:35<00:00, 95.36s/it, best loss: 0.28251445086705207]
100%|██████████| 1/1 [00:01<00:00,  1.48s/it, best loss: 0.2817919075144508]
100%|██████████| 1/1 [00:03<00:00,  3.53s/it, best loss: 0.25144508670520227]
accuracy: 
0.7433896835717382

precision: 
0.6740467404674046

recall: 
0.9464594127806563


In [0]:
logit_with_gs(df_c2v, 0.1)

[(0.0, 4613), (1.0, 4613)]
accuracy: 
0.7212830515821413

precision: 
0.680448493342677

recall: 
0.8385146804835925


# SVM

In [0]:
from sklearn import svm

In [0]:
def svm_cls(df):
  """Fit an SVC on a class-balanced sample of ``df`` and print its metrics.

  The ``exp`` column is used as the label and every other column as a feature.
  The data is under-sampled with ``resample`` and split with
  ``train_test_split`` (default test size, ``random_state=23``). Accuracy,
  precision and recall on the held-out part are printed via ``printMetrics``.

  Args:
    df: DataFrame containing an ``exp`` label column plus feature columns.

  Returns:
    None; all results are printed.
  """
  y = df['exp']
  # Keyword form: passing ``axis`` positionally to ``DataFrame.drop`` was
  # deprecated in pandas 1.x and is rejected by pandas 2.0.
  X = df.drop(columns='exp')
  X_resampled, y_resampled = resample(X, y)
  X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=23)
  clf = svm.SVC(gamma='scale', random_state=23)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  printMetrics(y_test, y_pred)
  # Disabled grid search over C, kept for reference:
  #parameters = {'C':[0.01, 0.03, 0.1, 0.3, 1, 3]}
  #best_clf = GridSearchCV(clf, parameters, cv=10)
  #best_clf.fit(X_resampled, y_resampled)
  #print(best_clf.best_estimator_)
  #print(best_clf.best_score_)

## code2vec

In [0]:
svm_cls(df_c2v)

accuracy: 
0.7078456870394452

precision: 
0.6337016574585635

recall: 
0.9905008635578584


# Fully-connected Neural Network

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.metrics import binary_accuracy
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [0]:
def create_model(init, activation, input_dim=384):
  """Build an uncompiled fully-connected binary classifier.

  The network has four hidden ``Dense`` layers with 384, 384, 100 and 100
  units using ``activation``, followed by a single sigmoid output unit.

  Args:
    init: kernel initializer applied to every layer (e.g. ``"uniform"``,
      ``"glorot_normal"``).
    activation: activation function of the hidden layers.
    input_dim: number of input features; defaults to 384.

  Returns:
    The ``Sequential`` model; the caller is responsible for compiling it.
  """
  # ``kernel_initializer`` is the Keras 2 name of the Keras 1 ``init`` argument.
  model = Sequential()
  model.add(Dense(384, input_shape=(input_dim,), kernel_initializer=init, activation=activation))
  model.add(Dense(384, kernel_initializer=init, activation=activation))
  model.add(Dense(100, kernel_initializer=init, activation=activation))
  model.add(Dense(100, kernel_initializer=init, activation=activation))
  model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
  return model

In [0]:
# Early-stopping callback passed to the `fit` calls: monitors the validation loss
# with a patience of 10 epochs and restores the best weights seen.
es = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

In [0]:
def split_into_test_train(df):
  """Split ``df`` into class-balanced train and test parts.

  The ``exp`` column is used as the label and every other column as a feature.
  The data is under-sampled with ``resample`` before splitting.

  Args:
    df: DataFrame containing an ``exp`` label column plus feature columns.

  Returns:
    The result of ``train_test_split`` on the resampled data (default test
    size, ``random_state=23``): ``X_train, X_test, y_train, y_test``.
  """
  y = df['exp']
  # Keyword form: passing ``axis`` positionally to ``DataFrame.drop`` was
  # deprecated in pandas 1.x and is rejected by pandas 2.0.
  X = df.drop(columns='exp')
  X_resampled, y_resampled = resample(X, y)
  return train_test_split(X_resampled, y_resampled, random_state=23)
X_train, X_test, y_train, y_test = split_into_test_train(df_c2v)

In [0]:
# Uniform initialisation, tanh activations, plain SGD.
uniform_model = create_model("uniform", 'tanh')
uniform_model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
# NOTE(review): early stopping monitors the test split, so metrics later reported
# on X_test are optimistically biased; consider a separate validation split.
uniform_model.fit(X_train, y_train, batch_size=64, epochs=100, verbose=1, validation_data=(X_test, y_test), callbacks=[es])

In [0]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single sigmoid
# output it is equivalent to thresholding the predicted probability at 0.5.
y_pred = (uniform_model.predict(X_test) > 0.5).astype('int32')
printMetrics(y_test, y_pred)

accuracy: 
0.6926744690073688

precision: 
0.6361431170406306

recall: 
0.9058721934369602


In [0]:
# Glorot-normal initialisation, tanh activations, plain SGD.
glorot_model = create_model("glorot_normal", 'tanh')
glorot_model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
# NOTE(review): early stopping monitors the test split, so metrics later reported
# on X_test are optimistically biased; consider a separate validation split.
glorot_model.fit(X_train, y_train, batch_size=64, epochs=100, verbose=1, validation_data=(X_test, y_test), callbacks=[es])

In [0]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single sigmoid
# output it is equivalent to thresholding the predicted probability at 0.5.
y_pred = (glorot_model.predict(X_test) > 0.5).astype('int32')
printMetrics(y_test, y_pred)

accuracy: 
0.7269180754226268

precision: 
0.6841004184100419

recall: 
0.8471502590673575


In [0]:
# Uniform initialisation, tanh activations, Adam optimizer.
uniform_adam_model = create_model("uniform", 'tanh')
uniform_adam_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
# NOTE(review): early stopping monitors the test split, so metrics later reported
# on X_test are optimistically biased; consider a separate validation split.
uniform_adam_model.fit(X_train, y_train, batch_size=64, epochs=100, verbose=1, validation_data=(X_test, y_test), callbacks=[es])

In [0]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single sigmoid
# output it is equivalent to thresholding the predicted probability at 0.5.
y_pred = (uniform_adam_model.predict(X_test) > 0.5).astype('int32')
printMetrics(y_test, y_pred)

accuracy: 
0.694408322496749

precision: 
0.6217087587318646

recall: 
0.9991364421416234


In [0]:
# Glorot-normal initialisation, tanh activations, Adam optimizer.
glorot_adam_model = create_model("glorot_normal", 'tanh')
glorot_adam_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
# NOTE(review): early stopping monitors the test split, so metrics later reported
# on X_test are optimistically biased; consider a separate validation split.
glorot_adam_model.fit(X_train, y_train, batch_size=64, epochs=100, verbose=1, validation_data=(X_test, y_test), callbacks=[es])

In [0]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single sigmoid
# output it is equivalent to thresholding the predicted probability at 0.5.
y_pred = (glorot_adam_model.predict(X_test) > 0.5).astype('int32')
printMetrics(y_test, y_pred)

accuracy: 
0.694408322496749

precision: 
0.6230309614340033

recall: 
0.9905008635578584


In [0]:
# Glorot-normal initialisation, ReLU activations, Adam optimizer.
glorot_adam_relu_model = create_model("glorot_normal", 'relu')
glorot_adam_relu_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
# NOTE(review): early stopping monitors the test split, so metrics later reported
# on X_test are optimistically biased; consider a separate validation split.
glorot_adam_relu_model.fit(X_train, y_train, batch_size=64, epochs=100, verbose=1, validation_data=(X_test, y_test), callbacks=[es])

In [0]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single sigmoid
# output it is equivalent to thresholding the predicted probability at 0.5.
y_pred = (glorot_adam_relu_model.predict(X_test) > 0.5).astype('int32')
printMetrics(y_test, y_pred)

accuracy: 
0.7104464672735153

precision: 
0.6740056818181818

recall: 
0.8195164075993091


In [0]:
def create_large_model(init, input_dim=384):
  """Build an uncompiled ReLU network with dropout before the output layer.

  The network has ``Dense`` layers of 384, 100, 100 and 100 ReLU units,
  a ``Dropout(0.5)`` layer and a single sigmoid output unit.

  Args:
    init: kernel initializer applied to every ``Dense`` layer.
    input_dim: number of input features; defaults to 384.

  Returns:
    The ``Sequential`` model; the caller is responsible for compiling it.
  """
  # ``kernel_initializer`` is the Keras 2 name of the Keras 1 ``init`` argument.
  model = Sequential()
  model.add(Dense(384, input_shape=(input_dim,), kernel_initializer=init, activation='relu'))
  model.add(Dense(100, kernel_initializer=init, activation='relu'))
  model.add(Dense(100, kernel_initializer=init, activation='relu'))
  model.add(Dense(100, kernel_initializer=init, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
  return model

In [0]:
# Dropout variant: Glorot-normal initialisation, Adam optimizer, batch size 128.
glorot_adam_relu_large_model = create_large_model("glorot_normal")
glorot_adam_relu_large_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
# NOTE(review): early stopping monitors the test split, so metrics later reported
# on X_test are optimistically biased; consider a separate validation split.
glorot_adam_relu_large_model.fit(X_train, y_train, batch_size=128, epochs=100, verbose=1, validation_data=(X_test, y_test), callbacks=[es])

In [0]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single sigmoid
# output it is equivalent to thresholding the predicted probability at 0.5.
y_pred = (glorot_adam_relu_large_model.predict(X_test) > 0.5).astype('int32')
printMetrics(y_test, y_pred)

accuracy: 
0.7126137841352406

precision: 
0.6668914362778152

recall: 
0.8540587219343696


## Cos distance

### code2vec

In [0]:
df_cos = pd.read_csv('/content/gdrive/My Drive/code-clones/csv/cos_distance.csv')

In [0]:
logit_with_gs(df_cos)

[(0, 210), (1, 210)]
accuracy: 
0.638095238095238

precision: 
0.5869565217391305

recall: 
1.0
LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=23, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
0.6


In [0]:
svm_cls(df_cos)

accuracy: 
0.580952380952381

precision: 
0.5510204081632653

recall: 
1.0
SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=23, shrinking=True,
  tol=0.001, verbose=False)
0.5976190476190476


### bow

In [0]:
df_cos = pd.read_csv('/content/gdrive/My Drive/code-clones/csv/bagofwords_cos.csv')

In [0]:
logit_with_gs(df_cos, C=3)

[(0, 3704), (1, 3704)]
accuracy: 
0.6090712742980562

precision: 
0.5832012678288431

recall: 
0.7880085653104925
LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=23, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
0.6186555075593952
