In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import nltk
dler = nltk.downloader.Downloader()
dler._update_index()
dler.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

# File loading, Train-test-split, result table

In [0]:
import nltk
import pickle
import os
import pandas as pd
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, LabelEncoder
from nltk.corpus import stopwords
from scipy.stats import mannwhitneyu

import matplotlib.pyplot as plt
%matplotlib inline

import time
seed = int(time.strftime("%Y%m%d"))

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/sol_classification.pickle'
data = pickle.load(open(path, "rb"))
data.comments = data.comments.apply('\n'.join)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2124 entries, 0 to 2123
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source_code  2124 non-null   object
 1   uncommented  2124 non-null   object
 2   comments     2124 non-null   object
 3   category     2124 non-null   object
dtypes: object(4)
memory usage: 83.0+ KB


In [6]:
# suppress categories with freq less than 2%
freq = data['category'].value_counts(normalize=True)
data['category'].replace(to_replace=list(freq[freq<0.02].index),value='others',inplace=True)
data['category'].value_counts(normalize=True)

games           0.268832
exchanges       0.216102
finance         0.156309
gambling        0.093691
others          0.056026
high-risk       0.044727
marketplaces    0.039077
social          0.036723
development     0.033427
media           0.031544
property        0.023540
Name: category, dtype: float64

In [7]:
# dummy coding for target variables
dummies = data['category'].str.get_dummies()
X = data.loc[:,('source_code','uncommented','comments')]
dummies.shape, X.shape

((2124, 11), (2124, 3))

In [8]:
# train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, dummies, test_size = 0.25, random_state = seed, stratify=data.category)
Y_train_labels = Y_train.idxmax(axis=1)
Y_test_labels = Y_test.idxmax(axis=1)

label_coder = LabelEncoder()
Y_train_labels = label_coder.fit_transform(Y_train_labels)
Y_test_labels = label_coder.transform(Y_test_labels)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, Y_train_labels.shape, Y_test_labels.shape

((1593, 3), (531, 3), (1593, 11), (531, 11), (1593,), (531,))

In [0]:
# initiate result matrixs for cv and the test set

iterables = [['punctuations_removed', 'punctuations_preserved'], #input type
        ['logit','lightbm','mlp','GRU','CNN'], #model types  
        data['category'].value_counts().index, #category
        ] 

index = pd.MultiIndex.from_product(iterables, names=['input_types','models','categories'])
result = pd.DataFrame(index=index)
result['AUC'] = None
result.reset_index(inplace=True)
cv_result = result.copy()

In [0]:
result = pd.read_csv('H2_test.csv')
cv_result = pd.read_csv('H2_validation.csv')

# Non-NLP
length of comments, length of codes and the comment/code ratio


In [0]:
def X_non_NLP_features (X):
  code_len = X['uncommented'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(code_len = code_len)

  comment_len = X['comments'].apply(lambda x: len([line for line in x.split('\n') if line.strip() != '']))
  X = X.assign(comment_len = comment_len)

  comment_ratio = comment_len/code_len
  X = X.assign(comment_ratio = comment_ratio)

  X.drop(labels=['source_code','uncommented','comments'],axis=1,inplace=True)
  return np.array(X)

# BOW

## BOW tokenizer

In [0]:
# NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
my_stopwords = stopwords.words("english")
my_stopwords.append("")

In [0]:
# check out the first 20 comments as a sample for tokenizing
regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
corpus = ' '.join(X_train[0:20]['comments'].values)
new_words = regex_tokenizer.tokenize(corpus)

new_words = sum([word.split('_') for word in new_words],[])
new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words] #split cramelCase
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(50)

In [0]:
def tokenizer_comments (text):
  
  #tokenize
  regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
  new_words = regex_tokenizer.tokenize(text)

  #remove numbers
  new_words = [re.sub('[0-9]','', word) for word in new_words]

  #split additionally by under_score
  new_words = sum([word.split('_') for word in new_words],[])

  #clear camelCase
  new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
  new_words = sum(new_words, [])

  return new_words

In [0]:
def tokenizer_codes (text):
  
  #tokenize
  new_words = nltk.word_tokenize(text)

  #remove numbers
  new_words = [re.sub('[0-9]','', word) for word in new_words]

  #split additionally by under_score
  new_words = sum([word.split('_') for word in new_words],[])

  #clear camelCase
  new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words]
  new_words = sum(new_words, [])

  return new_words

In [0]:
vectorizer_comments = TfidfVectorizer(stop_words = my_stopwords, tokenizer = tokenizer_comments, lowercase = True,
                max_features=5000, smooth_idf=True, analyzer = 'word')

In [0]:
vectorizer_codes = TfidfVectorizer(stop_words = my_stopwords, tokenizer = tokenizer_codes, lowercase = True,
                max_features=5000, smooth_idf=True, analyzer = 'word')

In [0]:
vectorizer_combined = TfidfVectorizer(stop_words = my_stopwords, tokenizer = tokenizer_codes, lowercase = True,
                max_features=10000, smooth_idf=True, analyzer = 'word')

## Models based on BOW: logit, lightbm, multilayer perceptron

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/params_search.pickle'
DApps_model_params = pickle.load(open(path, "rb"))

Define models

In [0]:
# logit model
def logit_model (X_train,y_train,params):
  #logreg = LogisticRegression(penalty=params['penalty'],max_iter=1000)
  logreg = LogisticRegression(penalty=params['penalty'],C=params['C'],max_iter=10000)
  logreg.fit(X_train, y_train)
  return logreg

In [0]:
# lightbm model
def lightbm_model (X_train,y_train,params):

  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = seed, stratify=y_train)

  train_data = lgb.Dataset(X_train,label=y_train)
  validation_data = lgb.Dataset(X_val,label=y_val)

  params.update([('objective','binary'),('metric','auc')])
  num_round = 100
  bst = lgb.train(params, train_data, num_round, valid_sets=validation_data,verbose_eval=False,early_stopping_rounds=5)

  return bst

In [0]:
def mlp_model (X_train,y_train,params):
  mlp_classifier = MLPClassifier(hidden_layer_sizes=params['hidden_layer_sizes'],solver=params['solver'],early_stopping=True,max_iter=10000)
  mlp_classifier.fit(X_train, y_train)
  return mlp_classifier

Cross-validation with params search

In [0]:
# cross_validation for opt params
DApps_model_params = {}
DApps_model_score = {}
for DApp_type in data['category'].value_counts().index:
  print("Category:", DApp_type)
  y_train = np.array(Y_train[DApp_type])

  #X_train_NLP = vectorizer_comments.fit_transform(X_train['comments'])
  #X_train_NLP = vectorizer_codes.fit_transform(X_train['uncommented'])
  X_train_NLP = vectorizer_combined.fit_transform(X_train['source_code'])

  X_train_CV = X_train_NLP

  scaler = MaxAbsScaler()
  X_train_CV = scaler.fit_transform(X_train_CV)

  params_dist = {'logit':{'penalty':['l2'],'C':[0.25,0.5,1]},
           'lightbm':{'num_leaves':[64, 128, 256]},
           'mlp':{'hidden_layer_sizes':[(64,32),(128,32),(256,32)],
               'solver':['adam'],
               'n_iter_no_change':[3]}}

  print('Fitting logit')
  logit_classifier = LogisticRegression(max_iter=10000)
  logit_search = RandomizedSearchCV(logit_classifier, param_distributions=params_dist['logit'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=1, verbose=0)
  logit_search.fit(X_train_CV,y_train)

  print('Fitting lightbm')
  lgb_classifier = lgb.LGBMClassifier()
  #lgb_search = GridSearchCV(lgb_classifier, param_grid=params_dist['lightbm'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  lgb_search = RandomizedSearchCV(lgb_classifier, param_distributions=params_dist['lightbm'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=1, verbose=0)
  lgb_search.fit(X_train_CV,y_train)

  print('Fitting MLP')
  mlp_classifier = MLPClassifier(early_stopping=True,max_iter=10000)
  #mlp_search = GridSearchCV(mlp_classifier, param_grid=params_dist['mlp'], cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
  mlp_search = RandomizedSearchCV(mlp_classifier, param_distributions=params_dist['mlp'], n_iter=3, cv=3, scoring='roc_auc', n_jobs=1, verbose=0)
  mlp_search.fit(X_train_CV,y_train)

  searches = {'logit_params':logit_search.best_params_,'lgb_params':lgb_search.best_params_,'mlp_params':mlp_search.best_params_}
  DApps_model_params.update([(DApp_type,searches)])
  scores = {'logit_score':logit_search.best_score_,'lgb_score':lgb_search.best_score_,'mlp_score':mlp_search.best_score_}
  DApps_model_score.update([(DApp_type,scores)])

with open('params_search.pickle', 'wb') as handle:
  pickle.dump(DApps_model_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

Category: games
Fitting logit
Fitting lightbm
Fitting MLP
Category: exchanges
Fitting logit
Fitting lightbm
Fitting MLP
Category: finance
Fitting logit
Fitting lightbm
Fitting MLP
Category: gambling
Fitting logit
Fitting lightbm
Fitting MLP
Category: others
Fitting logit
Fitting lightbm
Fitting MLP
Category: high-risk
Fitting logit
Fitting lightbm
Fitting MLP
Category: marketplaces
Fitting logit
Fitting lightbm
Fitting MLP
Category: social
Fitting logit
Fitting lightbm
Fitting MLP
Category: development
Fitting logit
Fitting lightbm
Fitting MLP
Category: media
Fitting logit
Fitting lightbm
Fitting MLP
Category: property
Fitting logit
Fitting lightbm
Fitting MLP


In [0]:
table_cv = []
for DApp_type in DApps_model_score:
  cv_aucs = [cv_score for cv_score in DApps_model_score[DApp_type].values()]
  cv_aucs.append(DApp_type)
  table_cv.append(cv_aucs)

table_cv
table_cv = pd.DataFrame(table_cv)
table_cv.columns = ['logit_cv','lightbm_cv','mlp_cv','category']
table_cv.set_index('category',inplace=True)
table_cv

Unnamed: 0_level_0,logit_cv,lightbm_cv,mlp_cv
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.935441,0.928132,0.932729
exchanges,0.964137,0.952763,0.954837
finance,0.940485,0.92945,0.922458
gambling,0.929714,0.914253,0.893895
others,0.841859,0.824604,0.722192
high-risk,0.962059,0.938144,0.894833
marketplaces,0.792579,0.799204,0.665792
social,0.835865,0.824941,0.634988
development,0.879219,0.801234,0.688963
media,0.816943,0.719593,0.633485


In [0]:
table_cv.mean()

logit_cv      0.892546
lightbm_cv    0.870717
mlp_cv        0.762716
dtype: float64

Optional: save cv scores?

In [0]:
types = 'punctuations_preserved'
cv_result.loc[(cv_result['models']=='logit') & (cv_result['input_types']==types),'AUC'] = table_cv.loc[:,'logit_cv'].values
cv_result.loc[(cv_result['models']=='lightbm') & (cv_result['input_types']==types),'AUC'] = table_cv.loc[:,'lightbm_cv'].values
cv_result.loc[(cv_result['models']=='mlp') & (cv_result['input_types']==types),'AUC'] = table_cv.loc[:,'mlp_cv'].values

run on the test set

In [0]:
# main function

test_aucs = []
for DApp_type in data['category'].value_counts().index:
  y_train = np.array(Y_train[DApp_type])
  y_test = np.array(Y_test[DApp_type])

  #X_train_set = np.array(X_non_NLP_features(X_train))
  #X_train_set = vectorizer_comments.fit_transform(X_train['comments'])
  #X_train_set = vectorizer_codes.fit_transform(X_train['uncommented'])
  X_train_set = vectorizer_combined.fit_transform(X_train['source_code'])

  #X_test_xNLP = np.array(X_non_NLP_features(X_test))
  #X_test_set = vectorizer_comments.transform(X_test['comments'])
  #X_test_set = vectorizer_codes.transform(X_test['uncommented'])
  X_test_set = vectorizer_combined.transform(X_test['source_code'])

  scaler = MaxAbsScaler()
  X_train_set = scaler.fit_transform(X_train_set)
  X_test_set = scaler.transform(X_test_set)

  logit = logit_model(X_train_set,y_train,DApps_model_params[DApp_type]['logit_params'])
  lightbm = lightbm_model(X_train_set,y_train,DApps_model_params[DApp_type]['lgb_params'])
  mlp = mlp_model(X_train_set,y_train,DApps_model_params[DApp_type]['mlp_params'])

  test_aucs.append([DApp_type,roc_auc_score(y_test,logit.predict(X_test_set)),roc_auc_score(y_test,lightbm.predict(X_test_set)),roc_auc_score(y_test,[x[1] for x in mlp.predict_proba(X_test_set)])])

In [0]:
# output
table_test = pd.DataFrame(test_aucs)
table_test.columns = ['category','logit','lightbm','mlp']
table_test.set_index('category',inplace=True)
table_test

Unnamed: 0_level_0,logit,lightbm,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
games,0.875955,0.932629,0.95997
exchanges,0.891304,0.963336,0.972408
finance,0.792276,0.877071,0.928504
gambling,0.754802,0.944657,0.933971
others,0.582335,0.865203,0.801863
high-risk,0.737162,0.926395,0.918521
marketplaces,0.619048,0.796125,0.649486
social,0.5,0.636873,0.66031
development,0.664717,0.738304,0.721464
media,0.703937,0.813115,0.668116


optional: save test auc?

In [0]:
types = 'punctuations_preserved'
result.loc[(result['models']=='logit') & (result['input_types']==types),'AUC'] = table_test.loc[:,'logit'].values
result.loc[(result['models']=='lightbm') & (result['input_types']==types),'AUC'] = table_test.loc[:,'lightbm'].values
result.loc[(result['models']=='mlp') & (result['input_types']==types),'AUC'] = table_test.loc[:,'mlp'].values

A combined table of cv and test aucs

In [0]:
table = table_test.join(table_cv)
table.loc[:,('logit_cv','logit','lightbm_cv','lightbm','mlp_cv','mlp')]

Unnamed: 0_level_0,logit_cv,logit,lightbm_cv,lightbm,mlp_cv,mlp
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
games,0.903527,0.806944,0.91248,0.919544,0.886719,0.90673
exchanges,0.959379,0.89954,0.955496,0.937949,0.946653,0.961716
finance,0.904681,0.738059,0.908626,0.892642,0.875982,0.864202
gambling,0.887796,0.73896,0.900053,0.835281,0.88869,0.88079
others,0.845451,0.531337,0.839435,0.784331,0.818368,0.799468
high-risk,0.905216,0.694668,0.896943,0.884201,0.864426,0.932557
marketplaces,0.774803,0.595238,0.754351,0.75915,0.764272,0.795798
social,0.758435,0.498047,0.722853,0.639597,0.744929,0.752981
development,0.820489,0.583333,0.813542,0.779186,0.754994,0.800032
media,0.816791,0.528439,0.768849,0.718986,0.748617,0.734321


# Sequential Models

## Word-to-Vec

In [0]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [0]:
# check out the first 20 comments as a sample for tokenizing
regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
corpus = ' '.join(X_train[0:20]['comments'].values)
new_words = regex_tokenizer.tokenize(corpus)

new_words = sum([word.split('_') for word in new_words],[])
new_words = [re.sub('[0-9]','', word) for word in new_words]
new_words = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in new_words] #split cramelCase
new_words = sum(new_words, [])

fdist1 = nltk.FreqDist(new_words)
fdist1.most_common(50)
#len(np.unique(new_words))

In [0]:
# define tokenizer that's fit for comments
def build_corpus_comments (list_of_text):

  corpus = []

  regex_tokenizer = nltk.RegexpTokenizer(r"[\w^@]+")
  my_stopwords = stopwords.words("english")
  my_stopwords.append("")

  for i in range(0,len(list_of_text)):
    text = list_of_text[i]
    text = regex_tokenizer.tokenize(text)
    text = sum([word.split('_') for word in text],[])
    text = [re.sub('[0-9]','', word) for word in text]
    text = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in text]
    text = sum(text, [])

    text = [w for w in text if not w in my_stopwords]
    corpus.append(text)
  return corpus

In [0]:
# define tokenizer that's fit for codes
def build_corpus_codes (codes):
  corpus = []
  my_stopwords = stopwords.words("english")
  my_stopwords.append("")
  
  for i in range(0,len(codes)):
    text = codes[i]
    text = nltk.word_tokenize(text)
    text = sum([word.split('_') for word in text],[])
    text = [re.sub('[0-9]','', word) for word in text]
    text = [re.sub('([A-Z][a-z]+)',r' \1',re.sub('([A-Z]+)',r' \1', word)).split() for word in text]
    text = sum(text, [])
    text = [w for w in text if not w in my_stopwords]
    corpus.append(text)
  return corpus

In [0]:
# initialize but check if the num_words makes sense from the vocal sizes in the subsequent code blocks
num_words=5000

In [0]:
# build corpus base on the comments
comments_train_corpus = build_corpus_comments(X_train['comments'].values)
comments_test_corpus = build_corpus_comments(X_test['comments'].values)

In [0]:
# build corpus base on the codes
codes_train_corpus = build_corpus_codes(X_train['uncommented'].values)
codes_test_corpus = build_corpus_codes(X_test['uncommented'].values)

In [0]:
# tokenize_to_seq comments
tokenizer_obj=Tokenizer(num_words=num_words, lower=True)
tokenizer_obj.fit_on_texts(comments_train_corpus)
comments_train_seq=tokenizer_obj.texts_to_sequences(comments_train_corpus)
comments_test_seq=tokenizer_obj.texts_to_sequences(comments_test_corpus)
len(tokenizer_obj.word_index)

11871

In [0]:
# tokenize_to_seq codes
tokenizer_obj=Tokenizer(num_words=num_words, lower=True)
tokenizer_obj.fit_on_texts(codes_train_corpus)
codes_train_seq=tokenizer_obj.texts_to_sequences(codes_train_corpus)
codes_test_seq=tokenizer_obj.texts_to_sequences(codes_test_corpus)
len(tokenizer_obj.word_index)

22304

In [0]:
# set maxlen to be padded based on text length after tokenization
comment_len = [len(comments) for comments in comments_train_seq]
code_len = [len(codes) for codes in codes_train_seq]

maxlen = 5000
(np.mean(comment_len), np.mean(code_len)),(1*np.std(comment_len),1*np.std(code_len)),(max(comment_len),max(code_len))

((931.4538606403013, 3181.263025737602),
 (1215.0385584799635, 3205.5700603279906),
 (8452, 38147))

In [0]:
# pad
comments_train_seq=pad_sequences(comments_train_seq,maxlen=maxlen)
comments_test_seq=pad_sequences(comments_test_seq,maxlen=maxlen)

codes_train_seq=pad_sequences(codes_train_seq,maxlen=maxlen)
codes_test_seq=pad_sequences(codes_test_seq,maxlen=maxlen)

In [0]:
# build corpus base on the combined
combined_train_corpus = build_corpus_codes(X_train['source_code'].values)
combined_test_corpus = build_corpus_codes(X_test['source_code'].values)

In [0]:
num_words=10000

tokenizer_obj=Tokenizer(num_words=num_words, lower=True)
tokenizer_obj.fit_on_texts(combined_train_corpus)
combined_train_seq=tokenizer_obj.texts_to_sequences(combined_train_corpus)
combined_test_seq=tokenizer_obj.texts_to_sequences(combined_test_corpus)
len(tokenizer_obj.word_index)

32956

In [0]:
combined_len = [len(combined) for combined in combined_train_corpus]
np.mean(combined_len),max(combined_len)

(4625.24670433145, 44904)

In [0]:
maxlen = 10000
combined_train_seq=pad_sequences(combined_train_seq,maxlen=maxlen)
combined_test_seq=pad_sequences(combined_test_seq,maxlen=maxlen)

### Alternative Pre-trained Embedding

In [0]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [0]:
# download the pre-trained weights
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [0]:
# store it in the W2V format
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz' # from above
googlenews_w2v = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# initialize the W2V weight matrix
googlenews_w2v_matrix = np.zeros((len(word_index) + 1, 300))

In [0]:
# get the vocabulary
key = list(googlenews_w2v.vocab.keys())

In [0]:
# fill in the W2V weight matrix
for word,i in word_index.items():
  if word in key:
    googlenews_w2v_matrix[i] = googlenews_w2v.get_vector(word)

## Models

In [0]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Dropout, GRU, Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling1D, MaxPooling2D, Flatten, Input, Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers.embeddings import Embedding

In [0]:
# decide X to be comments or codes or combined
X_train_seq = comments_train_seq
X_test_seq = comments_test_seq

define models


In [0]:
def create_GRU (optimizer='adam', GRU_size=128, dropout=0.2):
  initializer=keras.initializers.he_normal()

  input_NLP = Input(shape=(maxlen,))
  embedding_layer = Embedding(input_dim=num_words,output_dim=100,input_length=maxlen,trainable=True)
  RNN = embedding_layer(input_NLP)
  RNN = GRU(GRU_size,activation='tanh',kernel_initializer=initializer,return_sequences=False)(RNN)
  #RNN = Bidirectional(GRU(GRU_size,activation='tanh'),merge_mode="concat")(RNN) #(Bidirectional(GRU(32,return_sequences=True)))
  #RNN = GRU(GRU_size,activation='tanh',kernel_initializer=initializer,return_sequences=False)(RNN)
  RNN = Dropout(dropout)(RNN)
  RNN = Dense(64,activation='tanh',kernel_initializer=initializer)(RNN)
  RNN = Dropout(dropout)(RNN)
  predictions = Dense(11,activation='softmax',kernel_initializer=initializer)(RNN)
  RNN = Model(inputs=input_NLP, outputs=predictions)
  RNN.compile(loss = 'categorical_crossentropy', optimizer = optimizer)
  return RNN

In [0]:
def create_CNN (optimizer='adam', filter_size=64, kernel_size=3, dropout=0.2):
  initializer=keras.initializers.he_normal()

  input_NLP = Input(shape=(maxlen,))
  embedding_layer = Embedding(input_dim=num_words,output_dim=100,input_length=maxlen,trainable=True)
  CNN = embedding_layer(input_NLP)
  CNN = Dropout(dropout)(CNN)
  CNN = Conv1D(filters=filter_size,kernel_size=kernel_size,padding='valid',activation='relu')(CNN)
  CNN = GlobalMaxPooling1D()(CNN)
  CNN = Dropout(dropout)(CNN)
  CNN = Dense(32,activation='relu',kernel_initializer=initializer)(CNN)
  CNN = Dropout(dropout)(CNN)
  predictions = Dense(11,activation='softmax',kernel_initializer=initializer)(CNN)
  CNN = Model(inputs=input_NLP, outputs=predictions)
  CNN.compile(loss = 'categorical_crossentropy', optimizer = optimizer)
  return CNN

Cross-validation

In [0]:
# validate for comments

skf = StratifiedKFold(n_splits=3)
i = 0
val_auc_comments = {'GRU':[[],[],[]],'CNN':[[],[],[]]} #initiate a matrix to save cv auc scores

for train_index, val_index in skf.split(combined_train_seq, np.array(Y_train).argmax(1)):
  CV_X_train = combined_train_seq[train_index]
  CV_Y_train = np.array(Y_train)[train_index]
  CV_X_val = combined_train_seq[val_index]
  CV_Y_val = np.array(Y_train)[val_index]
  Y_val = pd.DataFrame(CV_Y_val,columns=Y_train.columns)
  
  cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)

  #train GRU
  GRU_model = create_GRU()
  print("\n","Training for GRU, fold#=", i+1,"\n")
  GRU_model.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=30, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  GRU_pred = GRU_model.predict(CV_X_val)
  GRU_pred = pd.DataFrame(GRU_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc_comments['GRU'][i].append([DApp,roc_auc_score(Y_val[DApp],GRU_pred[DApp])])  

  #train CNN
  CNN = create_CNN()
  print("\n","Training for CNN, fold#=", i+1,"\n")
  CNN.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=30, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  CNN_pred = CNN.predict(CV_X_val)
  CNN_pred = pd.DataFrame(CNN_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc_comments['CNN'][i].append([DApp,roc_auc_score(Y_val[DApp],CNN_pred[DApp])])
  
  #count add
  i=i+1


 Training for GRU, fold#= 1 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30

 Training for CNN, fold#= 1 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30

 Training for GRU, fold#= 2 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

 Training for CNN, fold#= 2 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30

In [0]:
  # validate again with AUC scoring

skf = StratifiedKFold(n_splits=3)
i = 0
val_auc_codes = {'GRU':[[],[],[]],'CNN':[[],[],[]]} #initiate a matrix to save cv auc scores

for train_index, val_index in skf.split(combined_train_seq, np.array(Y_train).argmax(1)):
  CV_X_train = combined_train_seq[train_index]
  CV_Y_train = np.array(Y_train)[train_index]
  CV_X_val = combined_train_seq[val_index]
  CV_Y_val = np.array(Y_train)[val_index]
  Y_val = pd.DataFrame(CV_Y_val,columns=Y_train.columns)
  
  cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)

  #train GRU
  GRU_model = create_GRU()
  print("\n","Training for GRU, fold#=", i+1,"\n")
  GRU_model.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=30, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  GRU_pred = GRU_model.predict(CV_X_val)
  GRU_pred = pd.DataFrame(GRU_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc_codes['GRU'][i].append([DApp,roc_auc_score(Y_val[DApp],GRU_pred[DApp])])

  #train CNN
  CNN = create_CNN()
  print("\n","Training for CNN, fold#=", i+1,"\n")
  CNN.fit(CV_X_train, CV_Y_train,batch_size=100, epochs=30, verbose=1,validation_data=(CV_X_val,CV_Y_val), callbacks=[cb],shuffle=False)
  CNN_pred = CNN.predict(CV_X_val)
  CNN_pred = pd.DataFrame(CNN_pred,columns=Y_train.columns)
  for DApp in Y_train.columns:
    val_auc_codes['CNN'][i].append([DApp,roc_auc_score(Y_val[DApp],CNN_pred[DApp])])
    
  #count add
  i=i+1  


 Training for GRU, fold#= 1 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30

 Training for CNN, fold#= 1 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30

 Training for GRU, fold#= 2 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30

 Training for CNN, fold#= 2 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30

 Training for GRU, fold#= 3 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30

 Training for CNN, fold#= 3 



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1062 samples, validate on 531 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [0]:
#with open('val_auc_comments.pickle', 'wb') as handle:
#  pickle.dump(val_auc_comments, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('val_auc_codes.pickle', 'wb') as handle:
  pickle.dump(val_auc_codes, handle, protocol=pickle.HIGHEST_PROTOCOL)

Optional: test cv results and save cv results?

Previous best cv aucs across comments and codes

In [0]:
best_comments = cv_result.loc[(cv_result['models']=='GRU') & (cv_result['input_types']=='punctuations_preserved'),'AUC']
best_codes = cv_result.loc[(cv_result['models']=='GRU') & (cv_result['input_types']=='punctuations_preserved'),'AUC']

In [0]:
best = np.array([best_comments.values,best_codes.values]).flatten()

In [0]:
best

array([0.83927698, 0.93344563, 0.79822415, 0.83099787, 0.71119872,
       0.72077033, 0.70858381, 0.68583786, 0.77197907, 0.75099582,
       0.76013409, 0.83927698, 0.93344563, 0.79822415, 0.83099787,
       0.71119872, 0.72077033, 0.70858381, 0.68583786, 0.77197907,
       0.75099582, 0.76013409])

Current aucs

In [0]:
# output the average AUC score
AUC = val_auc_codes['GRU']
cv = pd.concat([pd.DataFrame(AUC[0],columns=['categories','AUC']),pd.DataFrame(AUC[1],columns=['categories','AUC']),pd.DataFrame(AUC[2],columns=['categories','AUC'])]).groupby('categories').mean()
cv = cv.loc[data['category'].value_counts().index,:]
cv['AUC'].values

array([0.77002637, 0.89891768, 0.75014791, 0.74160776, 0.69213138,
       0.71951467, 0.5924536 , 0.68179321, 0.69730159, 0.63183452,
       0.6529898 ])

In [0]:
AUC = val_auc_codes['CNN']
cv1 = pd.concat([pd.DataFrame(AUC[0],columns=['categories','AUC']),pd.DataFrame(AUC[1],columns=['categories','AUC']),pd.DataFrame(AUC[2],columns=['categories','AUC'])]).groupby('categories').mean()
cv1 = cv1.loc[data['category'].value_counts().index,:]
cv1['AUC'].values

array([0.94148532, 0.97264802, 0.93970525, 0.94347193, 0.86655276,
       0.91998702, 0.82070857, 0.81734436, 0.88478902, 0.85260612,
       0.87936148])

In [0]:
cv['AUC'].values.mean()

0.7737676654355803

In [0]:
# update validation results
cv_result.loc[(cv_result['models']=='GRU') & (cv_result['input_types']=='punctuations_removed'),('AUC')] = cv['AUC'].values
cv_result.loc[(cv_result['models']=='CNN') & (cv_result['input_types']=='punctuations_removed'),('AUC')] = cv1['AUC'].values

## train the model and run on the test set

In [0]:
cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)
GRU_model = create_GRU()
GRU_model.fit(combined_train_seq, Y_train, batch_size=100, epochs=30, verbose=1,validation_split=0.25, callbacks=[cb],shuffle=False)
GRU_pred = GRU_model.predict(combined_test_seq)
GRU_pred = pd.DataFrame(GRU_pred,columns=Y_train.columns)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1194 samples, validate on 399 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


In [0]:
GRU_test_auc = []
for DApp in data['category'].value_counts().index.values:
  GRU_test_auc.append([DApp,roc_auc_score(Y_test[DApp],GRU_pred[DApp])])  
GRU_test_auc = pd.DataFrame(GRU_test_auc,columns=['categories','AUC'])
GRU_test_auc

Unnamed: 0,categories,AUC
0,games,0.835664
1,exchanges,0.899812
2,finance,0.811801
3,gambling,0.815226
4,others,0.774717
5,high-risk,0.758095
6,marketplaces,0.751167
7,social,0.732782
8,development,0.701863
9,media,0.68986


In [0]:
cb=EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)
CNN = create_CNN()
CNN.fit(combined_train_seq, Y_train, batch_size=100, epochs=30, verbose=1,validation_split=0.25, callbacks=[cb],shuffle=False)
CNN_pred = CNN.predict(combined_test_seq)
CNN_pred = pd.DataFrame(CNN_pred,columns=Y_train.columns)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1194 samples, validate on 399 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


In [0]:
CNN_test_auc = []
for DApp in data['category'].value_counts().index.values:
  CNN_test_auc.append([DApp,roc_auc_score(Y_test[DApp],CNN_pred[DApp])])  
CNN_test_auc = pd.DataFrame(CNN_test_auc,columns=['categories','AUC'])
CNN_test_auc

Unnamed: 0,categories,AUC
0,games,0.957285
1,exchanges,0.973474
2,finance,0.943793
3,gambling,0.958506
4,others,0.840186
5,high-risk,0.938034
6,marketplaces,0.937068
7,social,0.865903
8,development,0.880171
9,media,0.881552


optional save results?

In [0]:
GRU_test_auc['AUC'].values

array([0.83566434, 0.89981187, 0.81180077, 0.81522568, 0.77471723,
       0.758095  , 0.75116713, 0.73278166, 0.70186268, 0.68986038,
       0.64884393])

In [0]:
result.loc[(result['models']=='GRU') & (result['input_types']=='codes_only'),'AUC']

88   NaN
89   NaN
90   NaN
91   NaN
92   NaN
93   NaN
94   NaN
95   NaN
96   NaN
97   NaN
98   NaN
Name: AUC, dtype: float64

In [0]:
result.loc[(result['models']=='GRU') & (result['input_types']=='punctuations_preserved'),'AUC'] = GRU_test_auc['AUC'].values
result.loc[(result['models']=='CNN') & (result['input_types']=='punctuations_preserved'),'AUC'] = CNN_test_auc['AUC'].values

# Present the cv and the test table

In [0]:
order = data['category'].value_counts().index.values.tolist()
order = [('AUC',x) for x in order]

## H1: Which of codes and comments in separation provides more information

In [0]:
H1_test = pd.read_csv("https://raw.githubusercontent.com/HektorLin/HU-IRTG/master/H1_test.csv",sep=',')
H1_val = pd.read_csv("https://raw.githubusercontent.com/HektorLin/HU-IRTG/master/H1_validation.csv",sep=',')

In [0]:
H1_val.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,categories,games,exchanges,finance,gambling,others,high-risk,marketplaces,social,development,media,property
input_types,models,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
codes_only,CNN,0.940143,0.966873,0.932744,0.933317,0.851067,0.91568,0.842207,0.843397,0.885706,0.833062,0.847884
codes_only,GRU,0.633049,0.883496,0.65551,0.689534,0.631175,0.55772,0.585778,0.682109,0.674383,0.671397,0.69958
codes_only,lightbm,0.919171,0.949511,0.936111,0.933595,0.779749,0.943349,0.745769,0.777832,0.83087,0.735522,0.808944
codes_only,logit,0.930168,0.965052,0.933269,0.919571,0.839958,0.951622,0.849292,0.810565,0.848061,0.78617,0.828374
codes_only,mlp,0.922556,0.961909,0.914094,0.900696,0.689237,0.906723,0.708756,0.66715,0.7461,0.708507,0.611658
comments_only,CNN,0.929964,0.964703,0.922095,0.879412,0.84977,0.894121,0.847526,0.811475,0.868302,0.825457,0.802518
comments_only,GRU,0.839277,0.933446,0.798224,0.830998,0.711199,0.72077,0.708584,0.685838,0.771979,0.750996,0.760134
comments_only,lightbm,0.911934,0.953486,0.906044,0.887956,0.757649,0.898806,0.783574,0.754704,0.817701,0.700589,0.761726
comments_only,logit,0.899342,0.959587,0.920889,0.895338,0.806856,0.933733,0.821315,0.759296,0.856459,0.735973,0.817684
comments_only,mlp,0.909535,0.942705,0.896004,0.872231,0.68032,0.86522,0.631831,0.699329,0.707555,0.666602,0.543917


In [0]:
H1_test.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

In [41]:
print('NULL: comments > codes')
for model_type in result['models'].unique():
  comments_only = H1_test.loc[(H1_test['input_types']=='comments_only') & (result['models']==model_type),'AUC'].values
  codes_only = H1_test.loc[(H1_test['input_types']=='codes_only') & (result['models']==model_type),'AUC'].values

  print(model_type, ' ', mannwhitneyu(comments_only,codes_only,alternative='less'))

NULL: comments > codes
logit   MannwhitneyuResult(statistic=62.0, pvalue=0.5522722327507594)
lightbm   MannwhitneyuResult(statistic=50.0, pvalue=0.2557029609250582)
mlp   MannwhitneyuResult(statistic=55.0, pvalue=0.3713329514598411)
GRU   MannwhitneyuResult(statistic=77.0, pvalue=0.8678542381330094)
CNN   MannwhitneyuResult(statistic=56.0, pvalue=0.3964063083160365)


## H2: With codes and comments combined, is preserving punctuations better than not when tokenizing?

In [47]:
cv_result.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,categories,games,exchanges,finance,gambling,others,high-risk,marketplaces,social,development,media,property
input_types,models,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
punctuations_preserved,CNN,0.941485,0.972648,0.939705,0.943472,0.866553,0.919987,0.820709,0.817344,0.884789,0.852606,0.879361
punctuations_preserved,GRU,0.770026,0.898918,0.750148,0.741608,0.692131,0.719515,0.592454,0.681793,0.697302,0.631835,0.65299
punctuations_preserved,lightbm,0.928132,0.952763,0.92945,0.914253,0.824604,0.938144,0.799204,0.824941,0.801234,0.719593,0.795385
punctuations_preserved,logit,0.935441,0.964137,0.940485,0.929714,0.841859,0.962059,0.792579,0.835865,0.879219,0.816943,0.803798
punctuations_preserved,mlp,0.932729,0.954837,0.922458,0.893895,0.722192,0.894833,0.665792,0.634988,0.688963,0.633485,0.580804
punctuations_removed,CNN,0.952966,0.972703,0.94252,0.936643,0.882508,0.943039,0.81164,0.858374,0.893546,0.867473,0.914824
punctuations_removed,GRU,0.838403,0.926179,0.822374,0.772577,0.740017,0.812039,0.671206,0.716226,0.730886,0.74104,0.78572
punctuations_removed,lightbm,0.934061,0.953345,0.923282,0.922101,0.816849,0.937514,0.789868,0.811714,0.832611,0.701666,0.771046
punctuations_removed,logit,0.938664,0.964511,0.941005,0.933791,0.849385,0.96321,0.79709,0.855618,0.894471,0.827055,0.798405
punctuations_removed,mlp,0.93447,0.950718,0.920333,0.912473,0.700771,0.886725,0.645237,0.617972,0.670542,0.647992,0.622231


In [48]:
result.set_index(keys=['input_types','models','categories']).unstack('categories').loc[:,order]

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,categories,games,exchanges,finance,gambling,others,high-risk,marketplaces,social,development,media,property
input_types,models,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
punctuations_preserved,CNN,0.957285,0.973474,0.943793,0.958506,0.840186,0.938034,0.937068,0.865903,0.880171,0.881552,0.864001
punctuations_preserved,GRU,0.835664,0.899812,0.811801,0.815226,0.774717,0.758095,0.751167,0.732782,0.701863,0.68986,0.648844
punctuations_preserved,lightbm,0.932629,0.963336,0.877071,0.944657,0.865203,0.926395,0.796125,0.636873,0.738304,0.813115,0.799213
punctuations_preserved,logit,0.875955,0.891304,0.792276,0.754802,0.582335,0.737162,0.619048,0.5,0.664717,0.703937,0.623073
punctuations_preserved,mlp,0.95997,0.972408,0.928504,0.933971,0.801863,0.918521,0.649486,0.66031,0.721464,0.668116,0.757707
punctuations_removed,CNN,0.932503,0.962531,0.946711,0.920229,0.858916,0.892669,0.790476,0.779914,0.886358,0.856146,0.837829
punctuations_removed,GRU,0.842207,0.926662,0.827345,0.830478,0.74847,0.835141,0.698133,0.703125,0.783589,0.791943,0.716843
punctuations_removed,lightbm,0.932197,0.95624,0.921821,0.945655,0.82149,0.891689,0.66634,0.699784,0.742798,0.870451,0.791908
punctuations_removed,logit,0.868043,0.890102,0.810349,0.784802,0.582335,0.692699,0.619048,0.5,0.665692,0.675498,0.624037
punctuations_removed,mlp,0.948003,0.960723,0.924712,0.948191,0.835063,0.851934,0.855556,0.629677,0.805068,0.754063,0.757065


In [0]:
cv_result.to_csv('H2_validation.csv',index=False)
result.to_csv('H2_test.csv',index=False)

In [53]:
print('NULL: punctuations removed > punctuations preserved')
for model_type in result['models'].unique():
  preserved = result.loc[(result['input_types']=='punctuations_preserved') & (result['models']==model_type),'AUC'].values
  removed = result.loc[(result['input_types']=='punctuations_removed') & (result['models']==model_type),'AUC'].values

  print(model_type, ' ', mannwhitneyu(removed, preserved, alternative='less'))

NULL: punctuations removed > punctuations preserved
logit   MannwhitneyuResult(statistic=59.5, pvalue=0.48689297622562533)
lightbm   MannwhitneyuResult(statistic=59.0, pvalue=0.47382226481129724)
mlp   MannwhitneyuResult(statistic=67.0, pvalue=0.6771180898475139)
GRU   MannwhitneyuResult(statistic=74.0, pvalue=0.8210333150533)
CNN   MannwhitneyuResult(statistic=40.0, pvalue=0.09454090298225809)


## H3: Will the average of model predictions from codes and comments outperform that from combined?

## H4: The best AUC possible?

# Pytorch


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

In [0]:
train_data = []
for i in range(len(comments_train_seq)):
   train_data.append([comments_train_seq[i], np.array(Y_train_labels)[i]])
test_data = []
for i in range(len(comments_test_seq)):
   test_data.append([comments_test_seq[i], np.array(Y_test)[i]])

In [0]:
trainloader = torch.utils.data.DataLoader(train_data, shuffle=False, batch_size=100)
testloader = torch.utils.data.DataLoader(test_data, shuffle=False, batch_size=100)

In [0]:
VOCAB_SIZE = 5000
EMBED_DIM = 100
NUN_CLASS = 11
BATCH_SIZE = 100

In [0]:
class ConvNet(nn.Module):

  def __init__(self,vocab_size,embed_dim): #hyperparameters for creating a model with layers
    super(ConvNet, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=False)
    #self.cnn = nn.Conv1d(5000, 128, kernel_size=3, padding=1)
    self.cnn = nn.Conv2d(1, 64, (3, embed_dim), padding=1)
    self.pool1 = nn.MaxPool1d(kernel_size=100)
    self.layer2 = nn.Linear(128, 32)
    self.layer3 = nn.Linear(32, 11)

  def conv_block(self, input, conv_layer):
    conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1)
    print(conv_out.size())
    activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1)
    print(activation.size())
    max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels)
    return max_out

  def forward(self, input_texts): #define input to this model and the sequence
    input_emb = self.embedding(input_texts)
    input_emb = input_emb.unsqueeze(1)
    print(input_emb.size()) #=[batch_size,1,pad_len,emb_dim]
    max_out = self.conv_block(input_emb, self.cnn)
    print(max_out.size())

    return out

In [0]:
model = ConvNet(5000,100)
for text,labels in trainloader:
  text = text.long()
  model(text)

torch.Size([100, 1, 5000, 100])
torch.Size([100, 64, 5000, 3])
torch.Size([100, 64, 5000, 3])


RuntimeError: ignored

In [0]:
model = ConvNet(5000,100)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [0]:
epochs = 5
for epoch in range(epochs):
  train_loss = 0
  total = 0
  for i, (text, labels) in enumerate(trainloader):
    text = text.long()
    labels = labels.long()
    optimizer.zero_grad() # zero the gradient buffer ??
    output = model(text)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()*labels.shape[0]
    total += labels.shape[0]
  print('epoch:', epoch+1, '; train_loss:' , train_loss/total)

In [0]:
def train_func(data):
  train_loss = 0
  train_auc = 0
  for text, labels in enumerate(data):
    optimizer.zero_grad() #???
    text = text.long() #???
    output = model(text)
    loss = criterion(output, labels)
    train_loss += loss.item()
    loss.backward()
    optimizer.step()
    
  scheduler.step()
  return train_loss

In [0]:
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
  train_loss = train_func(trainloader)

In [0]:
def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

In [0]:
loss = nn.CrossEntropyLoss()
Xinput = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(Xinput, target)
output.backward()

In [0]:
Xinput

tensor([[ 0.6642,  0.0803, -0.4983, -1.0937, -0.6221],
        [-0.8313, -0.3897, -0.0465, -0.5053, -0.5417],
        [-0.5989, -0.1474,  0.8835,  0.3082,  1.0984]], requires_grad=True)

In [0]:
target

tensor([1, 4, 3])