In [33]:
import spacy as sp
from spacy.matcher import PhraseMatcher, Matcher
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet
import stanza
from nltk.corpus import stopwords
import re
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score

In [2]:
df_train = pd.read_csv('./datasets/train_sessions.csv')
df_test = pd.read_csv('./datasets/test_sessions.csv')

In [3]:
time = [f'time{i}' for i in range(1,11)]
df_train[time] = df_train[time].apply(pd.to_datetime)
df_test[time] = df_test[time].apply(pd.to_datetime)
df_train_ = df_train.sort_values(by='time1')

In [4]:
sites = [f'site{i}' for i in range(1,11)]
df_train[sites] = df_train[sites].fillna(0).astype(int).astype(str)
df_test[sites] = df_test[sites].fillna(0).astype(int).astype(str)
df_train['list'] = df_train['site1']
df_test['list'] = df_test['site1']
for s in sites[1:]:
    df_train['list'] = df_train['list'] + ',' + df_train[s]
    df_test['list'] = df_test['list'] + ',' + df_test[s]
df_train['w_list'] = df_train['list'].apply(lambda x: x.split(','))
df_test['w_list'] = df_test['list'].apply(lambda x: x.split(','))

In [5]:
from gensim.models import word2vec

In [6]:
df_test['targen'] = -1
data = pd.concat([df_train,df_test], axis = 0)

model = word2vec.Word2Vec(data['w_list'], size = 300, window=3,workers = 4)


In [7]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))


In [8]:
class MeanVectorize:
    def __init__(self,w2v):
        self.word2vec = w2v
        self.dim = len(next(iter(w2v.values())))
    
    def fit(self,X):
        return self
    
    def tranform(self,X):
        
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [9]:
data_mean = MeanVectorize(w2v).tranform(data['w_list'])

In [47]:
temp =data['target'].fillna(-1)
train_idx = (temp == 0) | (temp == 1)
test_idx = (temp == -1)
X = data_mean[train_idx]
Y = temp[train_idx]
test = data_mean[test_idx]

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, random_state = 42)

In [34]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input
from keras.preprocessing.text import Tokenizer
from keras import regularizers


Using TensorFlow backend.


In [35]:
model = Sequential()
model.add(Dense(128, input_dim=(Xtr.shape[1])))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [37]:
history = model.fit(X_train, y_train,
                    batch_size=128,
                    epochs=10,
                    validation_data=(X_test, y_test),
                    class_weight='auto',
                    verbose=0)




In [39]:
y_pred = model.predict(X_test, batch_size=128)
roc_auc_score(y_test, y_pred)

0.9289532346332854

In [42]:
import xgboost as xgb

In [45]:
d_train = xgb.DMatrix(X_train, label= y_train,missing = np.nan)
d_test = xgb.DMatrix(X_test, label= y_test,missing = np.nan)
watchlist = [(d_train, 'train'), (d_test, 'eval')]
history = dict(watchlist)

In [48]:
params = {
    'max_depth': 26,
    'eta': 0.025,
    'nthread': 4,
    'gamma' : 1,
    'alpha' : 1,
    'subsample': 0.85,
    'eval_metric': ['auc'],
    'objective': 'binary:logistic',
    'colsample_bytree': 0.9,
    'min_child_weight': 100,
    'scale_pos_weight':(1)/Y.mean(),
    'seed':7
}

model = xgb.train(params, d_train, num_boost_round=200, evals=watchlist, evals_result=history, verbose_eval=20)


[0]	train-auc:0.95688	eval-auc:0.87422
[20]	train-auc:0.99017	eval-auc:0.92753
[40]	train-auc:0.99281	eval-auc:0.93328
[60]	train-auc:0.99444	eval-auc:0.93554
[80]	train-auc:0.99562	eval-auc:0.93765
[100]	train-auc:0.99652	eval-auc:0.93904
[120]	train-auc:0.99718	eval-auc:0.94035
[140]	train-auc:0.99765	eval-auc:0.94074
[160]	train-auc:0.99801	eval-auc:0.94183
[180]	train-auc:0.99830	eval-auc:0.94276
[199]	train-auc:0.99853	eval-auc:0.94345


In [52]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, random_state=7, n_jobs=-1)
lr_fit = lr.fit(X_train,y_train)
y_pred = lr_fit.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)

0.5077658486187722


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

class tfidf_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [55]:
data_mean = tfidf_vectorizer(w2v).fit(data['w_list']).transform(data['w_list'])

temp =data['target'].fillna(-1)
train_idx = (temp == 0) | (temp == 1)
test_idx = (temp == -1)
X = data_mean[train_idx]
Y = temp[train_idx]
test = data_mean[test_idx]

lf_fit = lr.fit(X_train,y_train)
y_pred = lr_fit.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)

0.5077658486187722
