In [101]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import nltk
dler = nltk.downloader.Downloader()
dler._update_index()
dler.download('all')

# File loading, Train-test-split, result table

In [0]:
import nltk
import pickle
import os
import pandas as pd
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline

import time
seed = int(time.strftime("%Y%m%d"))

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/sol_classification.pickle'
data = pickle.load(open(path, "rb"))
data.comments = data.comments.apply('\n'.join)

In [105]:
# suppress categories with freq less than 2%
freq = data['category'].value_counts(normalize=True)
data['category'].replace(to_replace=list(freq[freq<0.02].index),value='others',inplace=True)
data['category'].value_counts(normalize=True)

games           0.268832
exchanges       0.216102
finance         0.156309
gambling        0.093691
others          0.056026
high-risk       0.044727
marketplaces    0.039077
social          0.036723
development     0.033427
media           0.031544
property        0.023540
Name: category, dtype: float64

In [106]:
# dummy coding for target variables
dummies = data['category'].str.get_dummies()
X = data.loc[:,('source_code','uncommented','comments')]
dummies.shape, X.shape

((2124, 11), (2124, 3))

In [107]:
# train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, dummies, test_size = 0.25, random_state = seed, stratify=data.category)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((1593, 3), (531, 3), (1593, 11), (531, 11))

In [0]:
# initiate result matrixs for cv and the test set

iterables = [['comments_only', 'codes_only', 'combined'], #input type
        ['logit','lightbm','mlp','GRU','CNN'], #model types  
        data['category'].value_counts().index, #category
        ] 

index = pd.MultiIndex.from_product(iterables, names=['input_types','models','categories'])
result = pd.DataFrame(index=index)
result['AUC'] = None
result.reset_index(inplace=True)
cv_result = result.copy()

In [0]:
result = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test_auc.csv')
cv_result = pd.read_csv('/content/drive/My Drive/Colab Notebooks/validation_auc.csv')

# Non-NLP
length of comments, length of codes and the comment/code ratio


In [0]:
def X_non_NLP_features (X):
  code_len = X['uncommented'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(code_len = code_len)

  comment_len = X['comments'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(comment_len = comment_len)

  comment_ratio = comment_len/code_len
  X = X.assign(comment_ratio = comment_ratio)

  X.drop(labels=['source_code','uncommented','comments'],axis=1,inplace=True)
  return np.array(X)

# BOW

## BOW tokenizer

In [0]:
# NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
my_stopwords = stopwords.words("english")
my_stopwords.append("")

In [0]:
# check out the first 20 comments as a sample for tokenizing
regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
corpus = ' '.join(X_train[0:20]['comments'].values)
new_words = regex_tokenizer.tokenize(corpus)

new_words = sum([word.split('_') for word in new_words],[])
new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words] #split cramelCase
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(50)

In [0]:
def tokenizer_comments (text):
  
  #tokenize
  regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
  new_words = regex_tokenizer.tokenize(text)

  #remove numbers
  new_words = [re.sub('[0-9]','', word) for word in new_words]

  #split additionally by under_score
  new_words = sum([word.split('_') for word in new_words],[])

  #clear camelCase
  new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
  new_words = sum(new_words, [])

  return new_words

In [0]:
def tokenizer_codes (text):
  
  #tokenize
  new_words = nltk.word_tokenize(text)

  #remove numbers
  new_words = [re.sub('[0-9]','', word) for word in new_words]

  #split additionally by under_score
  new_words = sum([word.split('_') for word in new_words],[])

  #clear camelCase
  new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
  new_words = sum(new_words, [])

  return new_words

In [0]:
vectorizer_comments = TfidfVectorizer(stop_words = my_stopwords, tokenizer = tokenizer_comments, lowercase = True,
                max_features=5000, smooth_idf=True, analyzer = 'word')

In [0]:
vectorizer_codes = TfidfVectorizer(stop_words = my_stopwords, tokenizer = tokenizer_codes, lowercase = True,
                max_features=5000, smooth_idf=True, analyzer = 'word')

## Models based on BOW: logit, lightbm, multilayer perceptron

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/params_search.pickle'
DApps_model_params = pickle.load(open(path, "rb"))

Define models

In [0]:
# logit model
def logit_model (X_train,y_train,params):
  #logreg = LogisticRegression(penalty=params['penalty'],max_iter=1000)
  logreg = LogisticRegression(penalty=params['penalty'],C=params['C'],max_iter=10000)
  logreg.fit(X_train, y_train)
  return logreg

In [0]:
# lightbm model
def lightbm_model (X_train,y_train,params):

  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = seed, stratify=y_train)

  train_data = lgb.Dataset(X_train,label=y_train)
  validation_data = lgb.Dataset(X_val,label=y_val)

  params.update([('objective','binary'),('metric','auc')])
  num_round = 100
  bst = lgb.train(params, train_data, num_round, valid_sets=validation_data,verbose_eval=False,early_stopping_rounds=5)

  return bst

In [0]:
def mlp_model (X_train,y_train,params):
  mlp_classifier = MLPClassifier(hidden_layer_sizes=params['hidden_layer_sizes'],solver=params['solver'],early_stopping=True,max_iter=10000)
  mlp_classifier.fit(X_train, y_train)
  return mlp_classifier

Cross-validation with params search

In [0]:
# cross_validation for opt params
DApps_model_params = {}
DApps_model_score = {}
for DApp_type in data['category'].value_counts().index:

  y_train = np.array(Y_train[DApp_type])
  #X_train_NLP = vectorizer_comments.fit_transform(X_train['comments'])
  X_train_NLP = vectorizer_codes.fit_transform(X_train['uncommented'])
  X_train_CV = X_train_NLP

  scaler = MaxAbsScaler()
  X_train_CV = scaler.fit_transform(X_train_CV)

  params_dist = {'logit':{'penalty':['l1','l2'],'C':[0.5,1,2]},
           'lightbm':{'num_leaves':[32, 64, 128]},
           'mlp':{'hidden_layer_sizes':[(64,32),(128,32),(256,32)],
               'solver':['adam'],
               'n_iter_no_change':[3]}}

  #print('Fitting logit')
  logit_classifier = LogisticRegression(max_iter=10000)
  logit_search = RandomizedSearchCV(logit_classifier, param_distributions=params_dist['logit'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  logit_search.fit(X_train_CV,y_train)

  #print('Fitting lightbm')
  lgb_classifier = lgb.LGBMClassifier()
  #lgb_search = GridSearchCV(lgb_classifier, param_grid=params_dist['lightbm'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  lgb_search = RandomizedSearchCV(lgb_classifier, param_distributions=params_dist['lightbm'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  lgb_search.fit(X_train_CV,y_train)

  #print('Fitting MLP')
  mlp_classifier = MLPClassifier(early_stopping=True,max_iter=10000)
  #mlp_search = GridSearchCV(mlp_classifier, param_grid=params_dist['mlp'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  mlp_search = RandomizedSearchCV(mlp_classifier, param_distributions=params_dist['mlp'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  mlp_search.fit(X_train_CV,y_train)

  searches = {'logit_params':logit_search.best_params_,'lgb_params':lgb_search.best_params_,'mlp_params':mlp_search.best_params_}
  DApps_model_params.update([(DApp_type,searches)])
  scores = {'logit_score':logit_search.best_score_,'lgb_score':lgb_search.best_score_,'mlp_score':mlp_search.best_score_}
  DApps_model_score.update([(DApp_type,scores)])

with open('params_search.pickle', 'wb') as handle:
  pickle.dump(DApps_model_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

  'stop_words.' % sorted(inconsistent))


In [0]:
table_cv = []
for DApp_type in DApps_model_score:
  cv_aucs = [cv_score for cv_score in DApps_model_score[DApp_type].values()]
  cv_aucs.append(DApp_type)
  table_cv.append(cv_aucs)

table_cv
table_cv = pd.DataFrame(table_cv)
table_cv.columns = ['logit_cv','lightbm_cv','mlp_cv','category']
table_cv.set_index('category',inplace=True)
table_cv

Unnamed: 0_level_0,logit_cv,lightbm_cv,mlp_cv
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.930168,0.919171,0.922556
exchanges,0.965052,0.949511,0.961909
finance,0.933269,0.936111,0.914094
gambling,0.919571,0.933595,0.900696
others,0.839958,0.779749,0.689237
high-risk,0.951622,0.943349,0.906723
marketplaces,0.849292,0.745769,0.708756
social,0.810565,0.777832,0.66715
development,0.848061,0.83087,0.7461
media,0.78617,0.735522,0.708507


In [0]:
table_cv.mean()

logit_cv      0.878373
lightbm_cv    0.850948
mlp_cv        0.794308
dtype: float64

Optional: save cv scores?

In [0]:
table_cv.loc[:,'logit_cv'].values

array([0.93016824, 0.96505241, 0.93326879, 0.91957103, 0.83995784,
       0.95162174, 0.84929247, 0.81056464, 0.84806074, 0.78617041,
       0.82837419])

In [0]:
cv_result.loc[(cv_result['models']=='lightbm') & (cv_result['input_types']=='codes_only'),'AUC']

66    0.919171
67    0.949511
68    0.936111
69    0.933595
70    0.779749
71    0.943349
72    0.745769
73    0.777832
74    0.830870
75    0.735522
76    0.808944
Name: AUC, dtype: float64

In [0]:
cv_result.loc[(cv_result['models']=='logit') & (cv_result['input_types']=='codes_only'),'AUC'] = table_cv.loc[:,'logit_cv'].values

run on the test set

In [0]:
# main function

test_aucs = []
for DApp_type in data['category'].value_counts().index:
  y_train = np.array(Y_train[DApp_type])
  y_test = np.array(Y_test[DApp_type])

  #X_train_xNLP = np.array(X_non_NLP_features(X_train))
  #X_train_NLP = vectorizer_comments.fit_transform(X_train['comments'])
  X_train_NLP = vectorizer_codes.fit_transform(X_train['uncommented'])
  X_train_set = X_train_NLP

  #X_test_xNLP = np.array(X_non_NLP_features(X_test))
  #X_test_NLP = vectorizer_comments.transform(X_test['comments'])
  X_test_NLP = vectorizer_codes.transform(X_test['uncommented'])
  X_test_set = X_test_NLP

  scaler = MaxAbsScaler()
  X_train_set = scaler.fit_transform(X_train_set)
  X_test_set = scaler.transform(X_test_set)

  logit = logit_model(X_train_set,y_train,DApps_model_params[DApp_type]['logit_params'])
  lightbm = lightbm_model(X_train_set,y_train,DApps_model_params[DApp_type]['lgb_params'])
  mlp = mlp_model(X_train_set,y_train,DApps_model_params[DApp_type]['mlp_params'])

  test_aucs.append([DApp_type,roc_auc_score(y_test,logit.predict(X_test_set)),roc_auc_score(y_test,lightbm.predict(X_test_set)),roc_auc_score(y_test,[x[1] for x in mlp.predict_proba(X_test_set)])])

In [0]:
# output
table_test = pd.DataFrame(test_aucs)
table_test.columns = ['category','logit','lightbm','mlp']
table_test.set_index('category',inplace=True)
table_test

Unnamed: 0_level_0,logit,lightbm,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.845595,0.884156,0.933386
exchanges,0.91573,0.963232,0.97038
finance,0.811465,0.919858,0.947545
gambling,0.735842,0.850748,0.948919
others,0.615669,0.77322,0.768563
high-risk,0.669976,0.82613,0.902345
marketplaces,0.569468,0.703735,0.766013
social,0.496094,0.780787,0.666632
development,0.610136,0.839669,0.814598
media,0.558824,0.664111,0.832513


optional: save test auc?

In [0]:
table_test.loc[:,'mlp'].values

array([0.9333862 , 0.97038043, 0.94754464, 0.94891892, 0.76856287,
       0.90234509, 0.76601307, 0.6666324 , 0.81459822, 0.83251316,
       0.7061657 ])

In [0]:
result.loc[(result['models']=='mlp') & (result['input_types']=='codes_only'),'AUC']

77   NaN
78   NaN
79   NaN
80   NaN
81   NaN
82   NaN
83   NaN
84   NaN
85   NaN
86   NaN
87   NaN
Name: AUC, dtype: float64

In [0]:
result.loc[(result['models']=='mlp') & (result['input_types']=='codes_only'),'AUC'] = table_test.loc[:,'mlp'].values

A combined table of cv and test aucs

In [0]:
table = table_test.join(table_cv)
table.loc[:,('logit_cv','logit','lightbm_cv','lightbm','mlp_cv','mlp')]

Unnamed: 0_level_0,logit_cv,logit,lightbm_cv,lightbm,mlp_cv,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
games,0.903527,0.806944,0.91248,0.919544,0.886719,0.90673
exchanges,0.959379,0.89954,0.955496,0.937949,0.946653,0.961716
finance,0.904681,0.738059,0.908626,0.892642,0.875982,0.864202
gambling,0.887796,0.73896,0.900053,0.835281,0.88869,0.88079
others,0.845451,0.531337,0.839435,0.784331,0.818368,0.799468
high-risk,0.905216,0.694668,0.896943,0.884201,0.864426,0.932557
marketplaces,0.774803,0.595238,0.754351,0.75915,0.764272,0.795798
social,0.758435,0.498047,0.722853,0.639597,0.744929,0.752981
development,0.820489,0.583333,0.813542,0.779186,0.754994,0.800032
media,0.816791,0.528439,0.768849,0.718986,0.748617,0.734321


# Sequential Models

## Word-to-Vec

In [113]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [0]:
# check out the first 20 comments as a sample for tokenizing
regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
corpus = ' '.join(X_train[0:20]['comments'].values)
new_words = regex_tokenizer.tokenize(corpus)

new_words = sum([word.split('_') for word in new_words],[])
new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words] #split cramelCase
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(50)
#len(np.unique(new_words))

In [0]:
# define tokenizer that's fit for comments
def build_corpus_comments (list_of_text):

  corpus = []

  regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
  my_stopwords = stopwords.words("english")
  my_stopwords.append("")

  for i in range(0,len(list_of_text)):
    text = list_of_text[i]
    text = regex_tokenizer.tokenize(text)
    text = sum([word.split('_') for word in text],[])
    text = [re.sub('[0-9]','', word) for word in text]
    text = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in text]
    text = sum(text, [])

    text = [w for w in text if not w in my_stopwords]
    corpus.append(text)
  return corpus

In [0]:
# define tokenizer that's fit for codes
def build_corpus_codes (codes):
  corpus = []
  my_stopwords = stopwords.words("english")
  my_stopwords.append("")
  
  for i in range(0,len(codes)):
    text = codes[i]
    text = nltk.word_tokenize(text)
    text = sum([word.split('_') for word in text],[])
    text = [re.sub('[0-9]','', word) for word in text]
    text = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in text]
    text = sum(text, [])
    text = [w for w in text if not w in my_stopwords]
    corpus.append(text)
  return corpus

In [0]:
# build corpus base on the comments
comments_train_corpus = build_corpus_comments(X_train['comments'].values)
comments_test_corpus = build_corpus_comments(X_test['comments'].values)

In [0]:
# build corpus base on the codes
codes_train_corpus = build_corpus_codes(X_train['uncommented'].values)
codes_test_corpus = build_corpus_codes(X_test['uncommented'].values)

In [0]:
# initialize but check if the num_words makes sense from the vocal sizes in the subsequent code blocks
num_words=5000

In [119]:
# tokenize_to_seq comments
tokenizer_obj=Tokenizer(num_words=num_words, lower=True)
tokenizer_obj.fit_on_texts(comments_train_corpus)
comments_train_seq=tokenizer_obj.texts_to_sequences(comments_train_corpus)
comments_test_seq=tokenizer_obj.texts_to_sequences(comments_test_corpus)
len(tokenizer_obj.word_index)

11728

In [120]:
# tokenize_to_seq codes
tokenizer_obj=Tokenizer(num_words=num_words, lower=True)
tokenizer_obj.fit_on_texts(codes_train_corpus)
codes_train_seq=tokenizer_obj.texts_to_sequences(codes_train_corpus)
codes_test_seq=tokenizer_obj.texts_to_sequences(codes_test_corpus)
len(tokenizer_obj.word_index)

22687

In [121]:
# set maxlen to be padded based on text length after tokenization
comment_len = [len(comments) for comments in comments_train_seq]
code_len = [len(codes) for codes in codes_train_seq]

maxlen = 5000
(np.mean(comment_len)+1*np.std(comment_len), np.mean(code_len)+1*np.std(code_len)),(max(comment_len),max(code_len))

((2196.707433569422, 6352.267279974838), (8452, 27223))

In [0]:
# pad
comments_train_seq=pad_sequences(comments_train_seq,maxlen=maxlen)
comments_test_seq=pad_sequences(comments_test_seq,maxlen=maxlen)

codes_train_seq=pad_sequences(codes_train_seq,maxlen=maxlen)
codes_test_seq=pad_sequences(codes_test_seq,maxlen=maxlen)

### Alternative Pre-trained Embedding

In [0]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [0]:
# download the pre-trained weights
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [0]:
# store it in the W2V format
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz' # from above
googlenews_w2v = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# initialize the W2V weight matrix
googlenews_w2v_matrix = np.zeros((len(word_index) + 1, 300))

In [0]:
# get the vocabulary
key = list(googlenews_w2v.vocab.keys())

In [0]:
# fill in the W2V weight matrix
for word,i in word_index.items():
  if word in key:
    googlenews_w2v_matrix[i] = googlenews_w2v.get_vector(word)

## Models

In [0]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Dropout, GRU, Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling1D, MaxPooling2D, Flatten, Input, Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers.embeddings import Embedding

In [0]:
# decide X to be comments or codes or combined
X_train_seq = codes_train_seq
X_test_seq = codes_test_seq

define models


In [0]:
def create_GRU (optimizer='adam', GRU_size=128, dropout=0.2):
  initializer=keras.initializers.he_normal()

  input_NLP = Input(shape=(maxlen,))
  embedding_layer = Embedding(input_dim=num_words,output_dim=100,input_length=maxlen,trainable=True)
  RNN = embedding_layer(input_NLP)
  RNN = GRU(GRU_size,activation='tanh')(RNN)
  RNN = Dropout(dropout)(RNN)
  RNN = Dense(32,activation='tanh',kernel_initializer=initializer)(RNN)
  RNN = Dropout(dropout)(RNN)
  predictions = Dense(11,activation='softmax',kernel_initializer=initializer)(RNN)
  RNN = Model(inputs=input_NLP, outputs=predictions)
  RNN.compile(loss = 'categorical_crossentropy', optimizer = optimizer)
  return RNN

In [0]:
def create_CNN (optimizer='adam', filter_size=64, kernel_size=3, dropout=0.2):
  initializer=keras.initializers.he_normal()

  input_NLP = Input(shape=(maxlen,))
  embedding_layer = Embedding(input_dim=num_words,output_dim=100,input_length=maxlen,trainable=True)
  CNN = embedding_layer(input_NLP)
  CNN = Dropout(dropout)(CNN)
  CNN = Conv1D(filters=filter_size,kernel_size=kernel_size,padding='valid',activation='relu')(CNN)
  CNN = GlobalMaxPooling1D()(CNN)
  CNN = Dropout(dropout)(CNN)
  CNN = Dense(32,activation='relu',kernel_initializer=initializer)(CNN)
  CNN = Dropout(dropout)(CNN)
  predictions = Dense(11,activation='softmax',kernel_initializer=initializer)(CNN)
  CNN = Model(inputs=input_NLP, outputs=predictions)
  CNN.compile(loss = 'categorical_crossentropy', optimizer = optimizer)
  return CNN

Cross-validation

In [0]:
# validate again with AUC scoring

skf = StratifiedKFold(n_splits=3)
i = 0
val_auc = {'GRU':[[],[],[]],'CNN':[[],[],[]]} #initiate a matrix to save cv auc scores

for train_index, val_index in skf.split(X_train_seq, np.array(Y_train).argmax(1)):
  CV_X_train = X_train_seq[train_index]
  CV_Y_train = np.array(Y_train)[train_index]
  CV_X_val = X_train_seq[val_index]
  CV_Y_val = np.array(Y_train)[val_index]
  Y_val = pd.DataFrame(CV_Y_val,columns=Y_train.columns)
  
  cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)

  #train GRU
  GRU_model = create_GRU()
  print("\n","Training for GRU, fold#=", i+1,"\n")
  GRU_model.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=20, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  GRU_pred = GRU_model.predict(CV_X_val)
  GRU_pred = pd.DataFrame(GRU_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc['GRU'][i].append([DApp,roc_auc_score(Y_val[DApp],GRU_pred[DApp])])  

  #train CNN
  CNN = create_CNN()
  print("\n","Training for CNN, fold#=", i+1,"\n")
  CNN.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=20, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  CNN_pred = CNN.predict(CV_X_val)
  CNN_pred = pd.DataFrame(CNN_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc['CNN'][i].append([DApp,roc_auc_score(Y_val[DApp],CNN_pred[DApp])])
  
  #count add
  i=i+1

In [0]:
with open('cv.pickle', 'wb') as handle:
  pickle.dump(val_auc, handle, protocol=pickle.HIGHEST_PROTOCOL)

Optional: save cv results?

In [0]:
val_auc

In [0]:
# output the average AUC score
AUC = val_auc['GRU']
cv = pd.concat([pd.DataFrame(AUC[0],columns=['categories','AUC']),pd.DataFrame(AUC[1],columns=['categories','AUC']),pd.DataFrame(AUC[2],columns=['categories','AUC'])]).groupby('categories').mean()
cv = cv.loc[data['category'].value_counts().index,:]
cv['AUC'].values

array([0.83927698, 0.93344563, 0.79822415, 0.83099787, 0.71119872,
       0.72077033, 0.70858381, 0.68583786, 0.77197907, 0.75099582,
       0.76013409])

In [0]:
cv['AUC'].values.mean()

0.7737676654355803

In [0]:
cv_result.loc[(cv_result['models']=='GRU') & (cv_result['input_types']=='comments_only'),'AUC']

33   NaN
34   NaN
35   NaN
36   NaN
37   NaN
38   NaN
39   NaN
40   NaN
41   NaN
42   NaN
43   NaN
Name: AUC, dtype: float64

In [0]:
# update validation results
cv_result.loc[(cv_result['models']=='GRU') & (cv_result['input_types']=='comments_only'),('AUC')] = cv['AUC'].values

## train the model and run on the test set

In [154]:
cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)
GRU_model = create_GRU()
GRU_model.fit(X_train_seq, Y_train, batch_size=100, epochs=20, verbose=1,validation_split=0.25, callbacks=[cb],shuffle=False)
GRU_pred = GRU_model.predict(X_test_seq)
GRU_pred = pd.DataFrame(GRU_pred,columns=Y_train.columns)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1194 samples, validate on 399 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [155]:
GRU_test_auc = []
for DApp in data['category'].value_counts().index.values:
  GRU_test_auc.append([DApp,roc_auc_score(Y_test[DApp],GRU_pred[DApp])])  
GRU_test_auc = pd.DataFrame(GRU_test_auc,columns=['categories','AUC'])
GRU_test_auc

Unnamed: 0,categories,AUC
0,games,0.802204
1,exchanges,0.908811
2,finance,0.755997
3,gambling,0.792121
4,others,0.74827
5,high-risk,0.779399
6,marketplaces,0.563212
7,social,0.676758
8,development,0.715616
9,media,0.64826


In [156]:
cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)
CNN = create_CNN()
CNN.fit(X_train_seq, Y_train, batch_size=100, epochs=20, verbose=1,validation_split=0.25, callbacks=[cb],shuffle=False)
CNN_pred = CNN.predict(X_test_seq)
CNN_pred = pd.DataFrame(CNN_pred,columns=Y_train.columns)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1194 samples, validate on 399 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [157]:
CNN_test_auc = []
for DApp in data['category'].value_counts().index.values:
  CNN_test_auc.append([DApp,roc_auc_score(Y_test[DApp],CNN_pred[DApp])])  
CNN_test_auc = pd.DataFrame(CNN_test_auc,columns=['categories','AUC'])
CNN_test_auc

Unnamed: 0,categories,AUC
0,games,0.926357
1,exchanges,0.967768
2,finance,0.932323
3,gambling,0.942225
4,others,0.916201
5,high-risk,0.908764
6,marketplaces,0.765173
7,social,0.798006
8,development,0.866147
9,media,0.880465


optional save results?

In [161]:
GRU_test_auc['AUC'].values

array([0.80220424, 0.90881062, 0.7559972 , 0.79212058, 0.74827013,
       0.77939918, 0.56321195, 0.67675781, 0.7156162 , 0.64826047,
       0.83493899])

In [162]:
result.loc[(result['models']=='GRU') & (result['input_types']=='codes_only'),'AUC']

88   NaN
89   NaN
90   NaN
91   NaN
92   NaN
93   NaN
94   NaN
95   NaN
96   NaN
97   NaN
98   NaN
Name: AUC, dtype: float64

In [0]:
result.loc[(result['models']=='GRU') & (result['input_types']=='codes_only'),'AUC'] = GRU_test_auc['AUC'].values

# Present the cv and the test table

In [0]:
order = data['category'].value_counts().index.values.tolist()
order = [('AUC',x) for x in order]

In [165]:
cv_result.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,categories,games,exchanges,finance,gambling,others,high-risk,marketplaces,social,development,media,property
input_types,models,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
codes_only,CNN,0.940143,0.966873,0.932744,0.933317,0.851067,0.91568,0.842207,0.843397,0.885706,0.833062,0.847884
codes_only,GRU,0.633049,0.883496,0.65551,0.689534,0.631175,0.55772,0.585778,0.682109,0.674383,0.671397,0.69958
codes_only,lightbm,0.919171,0.949511,0.936111,0.933595,0.779749,0.943349,0.745769,0.777832,0.83087,0.735522,0.808944
codes_only,logit,0.930168,0.965052,0.933269,0.919571,0.839958,0.951622,0.849292,0.810565,0.848061,0.78617,0.828374
codes_only,mlp,0.922556,0.961909,0.914094,0.900696,0.689237,0.906723,0.708756,0.66715,0.7461,0.708507,0.611658
combined,CNN,,,,,,,,,,,
combined,GRU,,,,,,,,,,,
combined,lightbm,,,,,,,,,,,
combined,logit,,,,,,,,,,,
combined,mlp,,,,,,,,,,,


In [166]:
result.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,categories,games,exchanges,finance,gambling,others,high-risk,marketplaces,social,development,media,property
input_types,models,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
codes_only,CNN,0.926357,0.967768,0.932323,0.942225,0.916201,0.908764,0.765173,0.798006,0.866147,0.880465,0.930636
codes_only,GRU,0.802204,0.908811,0.755997,0.792121,0.74827,0.779399,0.563212,0.676758,0.715616,0.64826,0.834939
codes_only,lightbm,0.884156,0.963232,0.919858,0.850748,0.77322,0.82613,0.703735,0.780787,0.839669,0.664111,0.958414
codes_only,logit,0.845595,0.91573,0.811465,0.735842,0.615669,0.669976,0.569468,0.496094,0.610136,0.558824,0.665703
codes_only,mlp,0.933386,0.97038,0.947545,0.948919,0.768563,0.902345,0.766013,0.666632,0.814598,0.832513,0.706166
combined,CNN,,,,,,,,,,,
combined,GRU,,,,,,,,,,,
combined,lightbm,,,,,,,,,,,
combined,logit,,,,,,,,,,,
combined,mlp,,,,,,,,,,,


In [0]:
result.to_csv('test_auc.csv',index=False)
cv_result.to_csv('validation_auc.csv',index=False)