In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import nltk
dler = nltk.downloader.Downloader()
dler._update_index()
dler.download('all')

# File loading, Train-test-split

In [0]:
import pickle
import os
import pandas as pd
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
%matplotlib inline


import time
seed = int(time.strftime("%Y%m%d"))

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/sol_classification.pickle'
data = pickle.load(open(path, "rb"))
data.comments = data.comments.apply('\n'.join)

In [6]:
# suppress categories with freq less than 1%
freq = data['category'].value_counts(normalize=True)
data['category'].replace(to_replace=list(freq[freq<0.02].index),value='others',inplace=True)
data['category'].value_counts(normalize=True)

games           0.268832
exchanges       0.216102
finance         0.156309
gambling        0.093691
others          0.056026
high-risk       0.044727
marketplaces    0.039077
social          0.036723
development     0.033427
media           0.031544
property        0.023540
Name: category, dtype: float64

In [7]:
# dummy coding for target variables
dummies = data['category'].str.get_dummies()
X = data.loc[:,('source_code','uncommented','comments')]
dummies.shape, X.shape

((2124, 11), (2124, 3))

In [8]:
# train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, dummies, test_size = 0.25, random_state = seed, stratify=data.category)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((1593, 3), (531, 3), (1593, 11), (531, 11))

# Feature Engineering

In [0]:
def X_non_NLP_features (X):
  code_len = X['uncommented'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(code_len = code_len)

  comment_len = X['comments'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(comment_len = comment_len)

  comment_ratio = comment_len/code_len
  X = X.assign(comment_ratio = comment_ratio)

  X.drop(labels=['source_code','uncommented','comments'],axis=1,inplace=True)
  return np.array(X)

In [10]:
# NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [0]:
my_stopwords = stopwords.words("english").append('')

In [17]:
regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
corpus = ' '.join(X_train[0:20]['comments'].values)
new_words = regex_tokenizer.tokenize(corpus)

new_words = sum([word.split('_') for word in new_words],[])
new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(100)

[('the', 1755),
 ('to', 1006),
 ('of', 657),
 ('@dev', 525),
 ('a', 521),
 ('@param', 515),
 ('is', 440),
 ('amount', 435),
 ('token', 399),
 ('for', 364),
 ('in', 313),
 ('and', 312),
 ('be', 305),
 ('address', 298),
 ('contract', 282),
 ('The', 278),
 ('by', 267),
 ('tokens', 243),
 ('that', 234),
 ('from', 233),
 ('new', 229),
 ('if', 217),
 ('Token', 217),
 ('@return', 209),
 ('this', 203),
 ('an', 196),
 ('connector', 190),
 ('balance', 183),
 ('s', 182),
 ('not', 181),
 ('owner', 179),
 ('account', 176),
 ('asset', 162),
 ('transfer', 159),
 ('on', 158),
 ('it', 147),
 ('borrow', 147),
 ('@notice', 141),
 ('value', 139),
 ('return', 139),
 ('ERC', 136),
 ('Amount', 135),
 ('interest', 133),
 ('we', 128),
 ('with', 125),
 ('conversion', 125),
 ('can', 123),
 ('only', 116),
 ('when', 116),
 ('We', 115),
 ('This', 102),
 ('total', 102),
 ('sol', 101),
 ('supply', 101),
 ('or', 100),
 ('function', 93),
 ('number', 92),
 ('price', 91),
 ('protocol', 91),
 ('collateral', 90),
 ('which'

In [0]:
def my_tokenizer (text):
  
  #tokenize
  regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
  new_words = regex_tokenizer.tokenize(text)

  #remove numbers
  new_words = [re.sub('[0-9]','', word) for word in new_words]

  #split additionally by under_score
  new_words = sum([word.split('_') for word in new_words],[])

  #clear camelCase
  new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
  new_words = sum(new_words, [])

  return new_words

In [0]:
vectorizer = TfidfVectorizer(stop_words = my_stopwords, tokenizer = my_tokenizer, lowercase = True,
                max_features =1000, smooth_idf=True, analyzer = 'word')

# Models, params optimization and results

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/params_search.pickle'
models_search = pickle.load(open(path, "rb"))

In [19]:
old_cv = []
for DApp_type in models_search:
  cv_aucs = [model_cv.best_score_ for model_cv in models_search[DApp_type].values()]
  cv_aucs.append(DApp_type)
  old_cv.append(cv_aucs)

old_cv = pd.DataFrame(old_cv)
old_cv.columns = ['logit_cv','lightbm_cv','mlp_cv','category']
old_cv.set_index('category',inplace=True)
old_cv

Unnamed: 0_level_0,logit_cv,lightbm_cv,mlp_cv
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.904717,0.912997,0.878994
exchanges,0.96617,0.959131,0.935162
finance,0.896869,0.90573,0.882382
gambling,0.902788,0.907676,0.872975
others,0.818144,0.783667,0.748744
high-risk,0.902955,0.863496,0.873256
marketplaces,0.752246,0.782454,0.744156
social,0.773068,0.72636,0.716448
development,0.833748,0.782182,0.77415
media,0.768422,0.705935,0.694033


In [0]:
# logit model
def logit_model (X_train,y_train,params):
  #logreg = LogisticRegression(penalty=params['penalty'],max_iter=1000)
  logreg = LogisticRegression(penalty=params['penalty'],C=params['C'],max_iter=10000)
  logreg.fit(X_train, y_train)
  return logreg

In [0]:
# lightbm model
def lightbm_model (X_train,y_train,params):

  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = seed, stratify=y_train)

  train_data = lgb.Dataset(X_train,label=y_train)
  validation_data = lgb.Dataset(X_val,label=y_val)

  params.update([('objective','binary'),('metric','auc')])
  num_round = 100
  bst = lgb.train(params, train_data, num_round, valid_sets=validation_data,verbose_eval=False,early_stopping_rounds=5)

  return bst

In [0]:
def mlp_model (X_train,y_train,params):
  mlp_classifier = MLPClassifier(hidden_layer_sizes=params['hidden_layer_sizes'],solver=params['solver'],early_stopping=True,max_iter=10000)
  mlp_classifier.fit(X_train, y_train)
  return mlp_classifier

In [0]:
# cross_validation for opt params
models_search = {}
for DApp_type in data['category'].value_counts().index:

  y_train = np.array(Y_train[DApp_type])
  X_train_xNLP = np.array(X_non_NLP_features(X_train))
  X_train_NLP = vectorizer.fit_transform(X_train['comments'])
  X_train_CV = hstack((X_train_xNLP,X_train_NLP)).toarray()

  scaler = MinMaxScaler()
  X_train_CV = scaler.fit_transform(X_train_CV)

  params_dist = {'logit':{'penalty':['l1','l2'],'C':[0.5,1,2]},
           'lightbm':{'num_leaves':[32, 64, 128]},
           'mlp':{'hidden_layer_sizes':[(64,32),(128,32),(256,32)],
               'solver':['lbfgs','adam']}}

  #print('Fitting logit')
  logit_classifier = LogisticRegression(max_iter=10000)
  logit_search = RandomizedSearchCV(logit_classifier, param_distributions=params_dist['logit'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  logit_search.fit(X_train_CV,y_train)

  #print('Fitting lightbm')
  lgb_classifier = lgb.LGBMClassifier()
  #lgb_search = GridSearchCV(lgb_classifier, param_grid=params_dist['lightbm'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  lgb_search = RandomizedSearchCV(lgb_classifier, param_distributions=params_dist['lightbm'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  lgb_search.fit(X_train_CV,y_train)

  #print('Fitting MLP')
  mlp_classifier = MLPClassifier(early_stopping=True,max_iter=10000)
  #mlp_search = GridSearchCV(mlp_classifier, param_grid=params_dist['mlp'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  mlp_search = RandomizedSearchCV(mlp_classifier, param_distributions=params_dist['mlp'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  mlp_search.fit(X_train_CV,y_train)

  searches = {'logit_search':logit_search,'lgb_search':lgb_search,'mlp_search':mlp_search}
  DApps_model_params.update([(DApp_type,searches)])

with open('params_search.pickle', 'wb') as handle:
  pickle.dump(DApps_model_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
DApps_model_params['games']['logit_search'].best_params_

{'C': 1, 'penalty': 'l2'}

In [34]:
DApps_model_params['games']['lgb_search'].best_params_

{'num_leaves': 64}

In [35]:
DApps_model_params['games']['mlp_search'].best_params_

{'hidden_layer_sizes': (64, 32), 'solver': 'adam'}

In [30]:
table_cv = []
for DApp_type in DApps_model_params:
  cv_aucs = [model_cv.best_score_ for model_cv in DApps_model_params[DApp_type].values()]
  cv_aucs.append(DApp_type)
  table_cv.append(cv_aucs)

table_cv
table_cv = pd.DataFrame(table_cv)
table_cv.columns = ['logit_cv','lightbm_cv','mlp_cv','category']
table_cv.set_index('category',inplace=True)
table_cv

Unnamed: 0_level_0,logit_cv,lightbm_cv,mlp_cv
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.903527,0.91248,0.886719
exchanges,0.959379,0.955496,0.946653
finance,0.904681,0.908626,0.875982
gambling,0.887796,0.900053,0.88869
others,0.845451,0.839435,0.818368
high-risk,0.905216,0.896943,0.864426
marketplaces,0.774803,0.754351,0.764272
social,0.758435,0.722853,0.744929
development,0.820489,0.813542,0.754994
media,0.816791,0.768849,0.748617


In [0]:
# main function

test_aucs = []
for DApp_type in data['category'].value_counts().index:
  y_train = np.array(Y_train[DApp_type])
  y_test = np.array(Y_test[DApp_type])

  X_train_xNLP = np.array(X_non_NLP_features(X_train))
  X_train_NLP = vectorizer.fit_transform(X_train['comments'])
  X_train_set = hstack((X_train_xNLP,X_train_NLP)).toarray()

  X_test_xNLP = np.array(X_non_NLP_features(X_test))
  X_test_NLP = vectorizer.transform(X_test['comments'])
  X_test_set = hstack((X_test_xNLP,X_test_NLP)).toarray()

  scaler = MinMaxScaler()
  X_train_set = scaler.fit_transform(X_train_set)
  X_test_set = scaler.transform(X_test_set)

  logit = logit_model(X_train_set,y_train,models_search[DApp_type]['logit_search'].best_params_)
  lightbm = lightbm_model(X_train_set,y_train,models_search[DApp_type]['lgb_search'].best_params_)
  mlp = mlp_model(X_train_set,y_train,models_search[DApp_type]['mlp_search'].best_params_)

  test_aucs.append([DApp_type,roc_auc_score(y_test,logit.predict(X_test_set)),roc_auc_score(y_test,lightbm.predict(X_test_set)),roc_auc_score(y_test,[x[1] for x in mlp.predict_proba(X_test_set)])])

In [37]:
# output
table_test = pd.DataFrame(test_aucs)
table_test.columns = ['category','logit','lightbm','mlp']
table_test.set_index('category',inplace=True)
table_test

Unnamed: 0_level_0,logit,lightbm,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.806944,0.919544,0.90673
exchanges,0.89954,0.937949,0.961716
finance,0.738059,0.892642,0.864202
gambling,0.73896,0.835281,0.88079
others,0.531337,0.784331,0.799468
high-risk,0.694668,0.884201,0.932557
marketplaces,0.595238,0.75915,0.795798
social,0.498047,0.639597,0.752981
development,0.583333,0.779186,0.800032
media,0.528439,0.718986,0.734321


In [38]:
table = table_test.join(table_cv)
table.loc[:,('logit_cv','logit','lightbm_cv','lightbm','mlp_cv','mlp')]

Unnamed: 0_level_0,logit_cv,logit,lightbm_cv,lightbm,mlp_cv,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
games,0.903527,0.806944,0.91248,0.919544,0.886719,0.90673
exchanges,0.959379,0.89954,0.955496,0.937949,0.946653,0.961716
finance,0.904681,0.738059,0.908626,0.892642,0.875982,0.864202
gambling,0.887796,0.73896,0.900053,0.835281,0.88869,0.88079
others,0.845451,0.531337,0.839435,0.784331,0.818368,0.799468
high-risk,0.905216,0.694668,0.896943,0.884201,0.864426,0.932557
marketplaces,0.774803,0.595238,0.754351,0.75915,0.764272,0.795798
social,0.758435,0.498047,0.722853,0.639597,0.744929,0.752981
development,0.820489,0.583333,0.813542,0.779186,0.754994,0.800032
media,0.816791,0.528439,0.768849,0.718986,0.748617,0.734321
