In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import nltk
dler = nltk.downloader.Downloader()
dler._update_index()
dler.download('all')

# File loading, Train-test-split, result table

In [0]:
import nltk
import pickle
import os
import pandas as pd
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline

import time
seed = int(time.strftime("%Y%m%d"))

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/sol_classification.pickle'
data = pickle.load(open(path, "rb"))
data.comments = data.comments.apply('\n'.join)

In [6]:
# suppress categories with freq less than 2%
freq = data['category'].value_counts(normalize=True)
data['category'].replace(to_replace=list(freq[freq<0.02].index),value='others',inplace=True)
data['category'].value_counts(normalize=True)

games           0.268832
exchanges       0.216102
finance         0.156309
gambling        0.093691
others          0.056026
high-risk       0.044727
marketplaces    0.039077
social          0.036723
development     0.033427
media           0.031544
property        0.023540
Name: category, dtype: float64

In [7]:
# dummy coding for target variables
dummies = data['category'].str.get_dummies()
X = data.loc[:,('source_code','uncommented','comments')]
dummies.shape, X.shape

((2124, 11), (2124, 3))

In [8]:
# train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, dummies, test_size = 0.25, random_state = seed, stratify=data.category)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((1593, 3), (531, 3), (1593, 11), (531, 11))

In [0]:
# initiate result matrixs for cv and the test set

iterables = [['comments_only', 'codes_only', 'combined'], #input type
        ['logit','lightbm','mlp','GRU','CNN'], #model types  
        data['category'].value_counts().index, #category
        ] 

index = pd.MultiIndex.from_product(iterables, names=['input_types','models','categories'])
result = pd.DataFrame(index=index)
result['AUC'] = None
result.reset_index(inplace=True)
cv_result = result.copy()

In [0]:
result = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test_auc.csv')
cv_result = pd.read_csv('/content/drive/My Drive/Colab Notebooks/validation_auc.csv')

# Non-NLP
length of comments, length of codes and the comment/code ratio


In [0]:
def X_non_NLP_features (X):
  code_len = X['uncommented'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(code_len = code_len)

  comment_len = X['comments'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(comment_len = comment_len)

  comment_ratio = comment_len/code_len
  X = X.assign(comment_ratio = comment_ratio)

  X.drop(labels=['source_code','uncommented','comments'],axis=1,inplace=True)
  return np.array(X)

# BOW

## BOW tokenizer

In [0]:
# NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
my_stopwords = stopwords.words("english")
my_stopwords.append("")

In [0]:
# check out the first 20 comments as a sample for tokenizing
regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
corpus = ' '.join(X_train[0:20]['comments'].values)
new_words = regex_tokenizer.tokenize(corpus)

new_words = sum([word.split('_') for word in new_words],[])
new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words] #split cramelCase
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(50)

In [0]:
def my_tokenizer (text):
  
  #tokenize
  regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
  new_words = regex_tokenizer.tokenize(text)

  #remove numbers
  new_words = [re.sub('[0-9]','', word) for word in new_words]

  #split additionally by under_score
  new_words = sum([word.split('_') for word in new_words],[])

  #clear camelCase
  new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
  new_words = sum(new_words, [])

  return new_words

In [0]:
vectorizer = TfidfVectorizer(stop_words = my_stopwords, tokenizer = my_tokenizer, lowercase = True,
                max_features =1000, smooth_idf=True, analyzer = 'word')

## Models based on BOW: logit, lightbm, multilayer perceptron

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/params_search.pickle'
DApps_model_params = pickle.load(open(path, "rb"))

In [0]:
# logit model
def logit_model (X_train,y_train,params):
  #logreg = LogisticRegression(penalty=params['penalty'],max_iter=1000)
  logreg = LogisticRegression(penalty=params['penalty'],C=params['C'],max_iter=10000)
  logreg.fit(X_train, y_train)
  return logreg

In [0]:
# lightbm model
def lightbm_model (X_train,y_train,params):

  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = seed, stratify=y_train)

  train_data = lgb.Dataset(X_train,label=y_train)
  validation_data = lgb.Dataset(X_val,label=y_val)

  params.update([('objective','binary'),('metric','auc')])
  num_round = 100
  bst = lgb.train(params, train_data, num_round, valid_sets=validation_data,verbose_eval=False,early_stopping_rounds=5)

  return bst

In [0]:
def mlp_model (X_train,y_train,params):
  mlp_classifier = MLPClassifier(hidden_layer_sizes=params['hidden_layer_sizes'],solver=params['solver'],early_stopping=True,max_iter=10000)
  mlp_classifier.fit(X_train, y_train)
  return mlp_classifier

In [0]:
# cross_validation for opt params
DApps_model_params = {}
DApps_model_score = {}
for DApp_type in data['category'].value_counts().index:

  y_train = np.array(Y_train[DApp_type])
  X_train_xNLP = np.array(X_non_NLP_features(X_train))
  X_train_NLP = vectorizer.fit_transform(X_train['comments'])
  X_train_CV = hstack((X_train_xNLP,X_train_NLP)).toarray()

  scaler = MinMaxScaler()
  X_train_CV = scaler.fit_transform(X_train_CV)

  params_dist = {'logit':{'penalty':['l1','l2'],'C':[0.5,1,2]},
           'lightbm':{'num_leaves':[32, 64, 128]},
           'mlp':{'hidden_layer_sizes':[(64,32),(128,32),(256,32)],
               'solver':['adam'],
               'n_iter_no_change':[5]}}

  #print('Fitting logit')
  logit_classifier = LogisticRegression(max_iter=10000)
  logit_search = RandomizedSearchCV(logit_classifier, param_distributions=params_dist['logit'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  logit_search.fit(X_train_CV,y_train)

  #print('Fitting lightbm')
  lgb_classifier = lgb.LGBMClassifier()
  #lgb_search = GridSearchCV(lgb_classifier, param_grid=params_dist['lightbm'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  lgb_search = RandomizedSearchCV(lgb_classifier, param_distributions=params_dist['lightbm'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  lgb_search.fit(X_train_CV,y_train)

  #print('Fitting MLP')
  mlp_classifier = MLPClassifier(early_stopping=True,max_iter=10000)
  #mlp_search = GridSearchCV(mlp_classifier, param_grid=params_dist['mlp'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  mlp_search = RandomizedSearchCV(mlp_classifier, param_distributions=params_dist['mlp'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  mlp_search.fit(X_train_CV,y_train)

  searches = {'logit_params':logit_search.best_params_,'lgb_params':lgb_search.best_params_,'mlp_params':mlp_search.best_params_}
  DApps_model_params.update([(DApp_type,searches)])
  scores = {'logit_score':logit_search.best_score_,'lgb_score':lgb_search.best_score_,'mlp_score':mlp_search.best_score_}
  DApps_model_score.update([(DApp_type,scores)])

with open('params_search.pickle', 'wb') as handle:
  pickle.dump(DApps_model_params, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [0]:
table_cv = []
for DApp_type in DApps_model_score:
  cv_aucs = [cv_score for cv_score in DApps_model_score[DApp_type].values()]
  cv_aucs.append(DApp_type)
  table_cv.append(cv_aucs)

table_cv
table_cv = pd.DataFrame(table_cv)
table_cv.columns = ['logit_cv','lightbm_cv','mlp_cv','category']
table_cv.set_index('category',inplace=True)
table_cv

Unnamed: 0_level_0,logit_cv,lightbm_cv,mlp_cv
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.900635,0.91248,0.90009
exchanges,0.959379,0.955496,0.954687
finance,0.904681,0.908626,0.892875
gambling,0.8881,0.900053,0.885413
others,0.842408,0.839435,0.614871
high-risk,0.905709,0.896943,0.74748
marketplaces,0.774803,0.754351,0.580236
social,0.762166,0.722853,0.60593
development,0.820489,0.813542,0.605416
media,0.816791,0.768849,0.573995


In [0]:
# main function

test_aucs = []
for DApp_type in data['category'].value_counts().index:
  y_train = np.array(Y_train[DApp_type])
  y_test = np.array(Y_test[DApp_type])

  X_train_xNLP = np.array(X_non_NLP_features(X_train))
  X_train_NLP = vectorizer.fit_transform(X_train['comments'])
  X_train_set = hstack((X_train_xNLP,X_train_NLP)).toarray()

  X_test_xNLP = np.array(X_non_NLP_features(X_test))
  X_test_NLP = vectorizer.transform(X_test['comments'])
  X_test_set = hstack((X_test_xNLP,X_test_NLP)).toarray()

  scaler = MinMaxScaler()
  X_train_set = scaler.fit_transform(X_train_set)
  X_test_set = scaler.transform(X_test_set)

  logit = logit_model(X_train_set,y_train,models_search[DApp_type]['logit_params'])
  lightbm = lightbm_model(X_train_set,y_train,models_search[DApp_type]['lgb_params'])
  mlp = mlp_model(X_train_set,y_train,models_search[DApp_type]['mlp_params'])

  test_aucs.append([DApp_type,roc_auc_score(y_test,logit.predict(X_test_set)),roc_auc_score(y_test,lightbm.predict(X_test_set)),roc_auc_score(y_test,[x[1] for x in mlp.predict_proba(X_test_set)])])

In [0]:
# output
table_test = pd.DataFrame(test_aucs)
table_test.columns = ['category','logit','lightbm','mlp']
table_test.set_index('category',inplace=True)
table_test

Unnamed: 0_level_0,logit,lightbm,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.806944,0.919544,0.90673
exchanges,0.89954,0.937949,0.961716
finance,0.738059,0.892642,0.864202
gambling,0.73896,0.835281,0.88079
others,0.531337,0.784331,0.799468
high-risk,0.694668,0.884201,0.932557
marketplaces,0.595238,0.75915,0.795798
social,0.498047,0.639597,0.752981
development,0.583333,0.779186,0.800032
media,0.528439,0.718986,0.734321


In [0]:
table = table_test.join(table_cv)
table.loc[:,('logit_cv','logit','lightbm_cv','lightbm','mlp_cv','mlp')]

Unnamed: 0_level_0,logit_cv,logit,lightbm_cv,lightbm,mlp_cv,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
games,0.903527,0.806944,0.91248,0.919544,0.886719,0.90673
exchanges,0.959379,0.89954,0.955496,0.937949,0.946653,0.961716
finance,0.904681,0.738059,0.908626,0.892642,0.875982,0.864202
gambling,0.887796,0.73896,0.900053,0.835281,0.88869,0.88079
others,0.845451,0.531337,0.839435,0.784331,0.818368,0.799468
high-risk,0.905216,0.694668,0.896943,0.884201,0.864426,0.932557
marketplaces,0.774803,0.595238,0.754351,0.75915,0.764272,0.795798
social,0.758435,0.498047,0.722853,0.639597,0.744929,0.752981
development,0.820489,0.583333,0.813542,0.779186,0.754994,0.800032
media,0.816791,0.528439,0.768849,0.718986,0.748617,0.734321


# Sequential Models

## Word-to-Vec

In [0]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [0]:
# check out the first 20 comments as a sample for tokenizing
regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
corpus = ' '.join(X_train[0:20]['comments'].values)
new_words = regex_tokenizer.tokenize(corpus)

new_words = sum([word.split('_') for word in new_words],[])
new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words] #split cramelCase
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(50)
#len(np.unique(new_words))

In [0]:
# check out the first 20 codes as a sample for tokenizing

corpus = ' '.join(X_train[0:20]['uncommented'].values)
new_words = nltk.word_tokenize(corpus)

#new_words = sum([word.split('_') for word in new_words],[])
#new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(50)

In [0]:
# define tokenizer that's fit for comments
def build_corpus_comments (list_of_text):

  corpus = []

  regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
  my_stopwords = stopwords.words("english")
  my_stopwords.append("")

  for i in range(0,len(list_of_text)):
    text = list_of_text[i]
    text = regex_tokenizer.tokenize(text)
    text = sum([word.split('_') for word in text],[])
    text = [re.sub('[0-9]','', word) for word in text]
    text = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in text]
    text = sum(text, [])

    text = [w for w in text if not w in my_stopwords]
    corpus.append(text)
  return corpus

In [0]:
# define tokenizer that's fit for codes
def build_corpus_codes (codes):
  corpus = []
  my_stopwords = stopwords.words("english")
  my_stopwords.append("")
  
  for i in range(0,len(codes)):
    text = codes[i]
    text = nltk.word_tokenize(text)
    text = sum([word.split('_') for word in text],[])
    text = [re.sub('[0-9]','', word) for word in text]
    text = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in text]
    text = sum(text, [])
    text = [w for w in text if not w in my_stopwords]
    corpus.append(text)
  return corpus

In [0]:
# build corpus base on the comments
comments_train_corpus = build_corpus_comments(X_train['comments'].values)
comments_test_corpus = build_corpus_comments(X_test['comments'].values)

In [0]:
# build corpus base on the codes
codes_train_corpus = build_corpus_codes(X_train['uncommented'].values)
codes_test_corpus = build_corpus_codes(X_test['uncommented'].values)

In [0]:
# initialize but check if the num_words makes sense from the vocal sizes in the subsequent code blocks
num_words=5000

In [29]:
# tokenize_to_seq comments
tokenizer_obj=Tokenizer(num_words=num_words, lower=True)
tokenizer_obj.fit_on_texts(comments_train_corpus)
comments_train_seq=tokenizer_obj.texts_to_sequences(comments_train_corpus)
comments_test_seq=tokenizer_obj.texts_to_sequences(comments_test_corpus)
len(tokenizer_obj.word_index)

11970

In [30]:
# tokenize_to_seq codes
tokenizer_obj=Tokenizer(num_words=num_words, lower=True)
tokenizer_obj.fit_on_texts(codes_train_corpus)
codes_train_seq=tokenizer_obj.texts_to_sequences(codes_train_corpus)
codes_test_seq=tokenizer_obj.texts_to_sequences(codes_test_corpus)
len(tokenizer_obj.word_index)

23039

In [31]:
# set maxlen to be padded based on text length after tokenization
comment_len = [len(comments) for comments in comments_train_seq]
code_len = [len(codes) for codes in codes_train_seq]

maxlen = 5000
(np.mean(comment_len)+1*np.std(comment_len), np.mean(code_len)+1*np.std(code_len)),(max(comment_len),max(code_len))

((2187.261740096789, 6438.595270490277), (8452, 38090))

In [0]:
# pad
comments_train_seq=pad_sequences(comments_train_seq,maxlen=maxlen)
comments_test_seq=pad_sequences(comments_test_seq,maxlen=maxlen)

codes_train_seq=pad_sequences(codes_train_seq,maxlen=maxlen)
codes_test_seq=pad_sequences(codes_test_seq,maxlen=maxlen)

### Alternative Pre-trained Embedding

In [0]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [0]:
# download the pre-trained weights
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [0]:
# store it in the W2V format
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz' # from above
googlenews_w2v = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# initialize the W2V weight matrix
googlenews_w2v_matrix = np.zeros((len(word_index) + 1, 300))

In [0]:
# get the vocabulary
key = list(googlenews_w2v.vocab.keys())

In [0]:
# fill in the W2V weight matrix
for word,i in word_index.items():
  if word in key:
    googlenews_w2v_matrix[i] = googlenews_w2v.get_vector(word)

## Models

In [0]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Dropout, GRU, Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling1D, MaxPooling2D, Flatten, Input, Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers.embeddings import Embedding

define models


In [0]:
def create_GRU (optimizer='adam', GRU_size=128, dropout=0.2):
  initializer=keras.initializers.he_normal()

  input_NLP = Input(shape=(maxlen,))
  embedding_layer = Embedding(input_dim=num_words,output_dim=100,input_length=maxlen,trainable=True)
  RNN = embedding_layer(input_NLP)
  RNN = GRU(GRU_size,activation='tanh')(RNN)
  RNN = Dropout(dropout)(RNN)
  RNN = Dense(32,activation='tanh',kernel_initializer=initializer)(RNN)
  RNN = Dropout(dropout)(RNN)
  predictions = Dense(11,activation='softmax',kernel_initializer=initializer)(RNN)
  RNN = Model(inputs=input_NLP, outputs=predictions)
  RNN.compile(loss = 'categorical_crossentropy', optimizer = optimizer)
  return RNN

In [0]:
def create_CNN (optimizer='adam', filter_size=64, kernel_size=3, dropout=0.2):
  initializer=keras.initializers.he_normal()

  input_NLP = Input(shape=(maxlen,))
  embedding_layer = Embedding(input_dim=num_words,output_dim=100,input_length=maxlen,trainable=True)
  CNN = embedding_layer(input_NLP)
  CNN = Dropout(dropout)(CNN)
  CNN = Conv1D(filters=filter_size,kernel_size=kernel_size,padding='valid',activation='relu')(CNN)
  CNN = GlobalMaxPooling1D()(CNN)
  CNN = Dropout(dropout)(CNN)
  CNN = Dense(32,activation='relu',kernel_initializer=initializer)(CNN)
  CNN = Dropout(dropout)(CNN)
  predictions = Dense(11,activation='softmax',kernel_initializer=initializer)(CNN)
  CNN = Model(inputs=input_NLP, outputs=predictions)
  CNN.compile(loss = 'categorical_crossentropy', optimizer = optimizer)
  return CNN

Cross-validation

In [0]:
X_train_seq = comments_train_seq
X_test_seq = comments_test_seq

In [112]:
# validate again with AUC scoring

skf = StratifiedKFold(n_splits=3)
i = 0
val_auc = {'GRU':[[],[],[]],'CNN':[[],[],[]]} #initiate a matrix to save cv auc scores

for train_index, val_index in skf.split(X_train_seq, np.array(Y_train).argmax(1)):
  CV_X_train = X_train_seq[train_index]
  CV_Y_train = np.array(Y_train)[train_index]
  CV_X_val = X_train_seq[val_index]
  CV_Y_val = np.array(Y_train)[val_index]
  Y_val = pd.DataFrame(CV_Y_val,columns=Y_train.columns)
  
  cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)

  #train GRU
  GRU_model = create_GRU()
  print("\n","Training for GRU, fold#=", i+1,"\n")
  GRU_model.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=20, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  GRU_pred = GRU_model.predict(CV_X_val)
  GRU_pred = pd.DataFrame(GRU_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc['GRU'][i].append([DApp,roc_auc_score(Y_val[DApp],GRU_pred[DApp])])  

  #train CNN
  CNN = create_CNN()
  print("\n","Training for CNN, fold#=", i+1,"\n")
  CNN.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=20, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  CNN_pred = CNN.predict(CV_X_val)
  CNN_pred = pd.DataFrame(CNN_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc['CNN'][i].append([DApp,roc_auc_score(Y_val[DApp],CNN_pred[DApp])])
  
  #count add
  i=i+1


 Training for GRU, fold#= 1 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200

 Training for CNN, fold#= 1 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200

 Training for GRU, fold#= 2 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200

 Training for CNN, fold#= 2 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200

 Training for GRU, fold#= 3 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200

 Training for CNN, fold#= 3 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200


In [0]:
with open('cv.pickle', 'wb') as handle:
  pickle.dump(val_auc, handle, protocol=pickle.HIGHEST_PROTOCOL)

Optional: save cv results?

In [115]:
val_auc

{'CNN': [[['development', 0.8778427550357375],
   ['exchanges', 0.9747909698996655],
   ['finance', 0.9243088425129088],
   ['gambling', 0.8613557456177492],
   ['games', 0.9237077355634058],
   ['high-risk', 0.9077909270216962],
   ['marketplaces', 0.8775910364145658],
   ['media', 0.7992718446601942],
   ['others', 0.8604890781700782],
   ['property', 0.8206118206118206],
   ['social', 0.8373776908023484]],
  [['development', 0.9459830624856947],
   ['exchanges', 0.9608734065379275],
   ['finance', 0.9380782056798623],
   ['gambling', 0.9148856548856549],
   ['games', 0.9477687261192416],
   ['high-risk', 0.9043392504930967],
   ['marketplaces', 0.8476190476190476],
   ['media', 0.872281986724651],
   ['others', 0.8850632069194944],
   ['property', 0.8003371868978805],
   ['social', 0.8321428571428572]],
  [['development', 0.7810807883907299],
   ['exchanges', 0.9584448160535117],
   ['finance', 0.9038968373493975],
   ['gambling', 0.861995841995842],
   ['games', 0.9184166696839133]

In [124]:
# output the average AUC score
AUC = val_auc['GRU']
cv = pd.concat([pd.DataFrame(AUC[0],columns=['categories','AUC']),pd.DataFrame(AUC[1],columns=['categories','AUC']),pd.DataFrame(AUC[2],columns=['categories','AUC'])]).groupby('categories').mean()
cv = cv.loc[data['category'].value_counts().index,:]
cv['AUC'].values

array([0.83927698, 0.93344563, 0.79822415, 0.83099787, 0.71119872,
       0.72077033, 0.70858381, 0.68583786, 0.77197907, 0.75099582,
       0.76013409])

In [125]:
cv['AUC'].values.mean()

0.7737676654355803

In [127]:
cv_result.loc[(cv_result['models']=='GRU') & (cv_result['input_types']=='comments_only'),'AUC']

33   NaN
34   NaN
35   NaN
36   NaN
37   NaN
38   NaN
39   NaN
40   NaN
41   NaN
42   NaN
43   NaN
Name: AUC, dtype: float64

In [0]:
# update validation results
cv_result.loc[(cv_result['models']=='GRU') & (cv_result['input_types']=='comments_only'),('AUC')] = cv['AUC'].values

## train the model and run on the test set

In [0]:
cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)

model.fit(W2V_train_seq, np.array(Y_train),
      batch_size=50, epochs=200, verbose=1,
      #validation_data=(X_test, y_test.reshape(-1,y_test.shape[1])),
      validation_split=0.25,
      callbacks=[cb],shuffle=False)

In [0]:
Y_pred = model.predict(W2V_test_seq)
Y_pred = pd.DataFrame(Y_pred,columns=Y_train.columns)

In [0]:
table = []
for DApp in Y_train.columns:
  table.append([DApp,roc_auc_score(Y_test[DApp],Y_pred[DApp])])

table = pd.DataFrame(table)
table.columns = ['category','CNN']
table.set_index('category',inplace=True)
table.loc[data['category'].value_counts().index,:]

Unnamed: 0,CNN
games,0.580347
exchanges,0.734187
finance,0.557202
gambling,0.67052
others,0.554724
high-risk,0.416338
marketplaces,0.5324
social,0.635588
development,0.447206
media,0.556311


# Present the cv and the test table

In [0]:
order = data['category'].value_counts().index.values.tolist()
order = [('AUC',x) for x in order]

In [129]:
cv_result.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,categories,games,exchanges,finance,gambling,others,high-risk,marketplaces,social,development,media,property
input_types,models,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
codes_only,CNN,0.940143,0.966873,0.932744,0.933317,0.851067,0.91568,0.842207,0.843397,0.885706,0.833062,0.847884
codes_only,GRU,0.633049,0.883496,0.65551,0.689534,0.631175,0.55772,0.585778,0.682109,0.674383,0.671397,0.69958
codes_only,lightbm,,,,,,,,,,,
codes_only,logit,,,,,,,,,,,
codes_only,mlp,,,,,,,,,,,
combined,CNN,,,,,,,,,,,
combined,GRU,,,,,,,,,,,
combined,lightbm,,,,,,,,,,,
combined,logit,,,,,,,,,,,
combined,mlp,,,,,,,,,,,


In [45]:
result.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,categories,games,exchanges,finance,gambling,others,high-risk,marketplaces,social,development,media,property
input_types,models,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
codes_only,CNN,,,,,,,,,,,
codes_only,GRU,,,,,,,,,,,
codes_only,lightbm,,,,,,,,,,,
codes_only,logit,,,,,,,,,,,
codes_only,mlp,,,,,,,,,,,
combined,CNN,,,,,,,,,,,
combined,GRU,,,,,,,,,,,
combined,lightbm,,,,,,,,,,,
combined,logit,,,,,,,,,,,
combined,mlp,,,,,,,,,,,


In [0]:
result.to_csv('test_auc.csv',index=False)
cv_result.to_csv('validation_auc.csv',index=False)