In [1]:

import numpy as np
from sklearn.metrics import roc_auc_score

import pandas as pd
n_rows = 300000
df = pd.read_csv("desktop/train.csv", nrows=n_rows)

X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values
Y = df['click'].values

n_train = 10000
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [2]:

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X_train_enc = enc.fit_transform(X_train)

X_test_enc = enc.transform(X_test)

In [3]:
# # Use scikit-learn package
from sklearn.linear_model import SGDClassifier
sgd_lr = SGDClassifier(loss='log_loss', penalty=None, fit_intercept=True, max_iter=10, learning_rate='constant', eta0=0.01)
sgd_lr.fit(X_train_enc.toarray(), Y_train)

pred = sgd_lr.predict_proba(X_test_enc.toarray())[:, 1]
print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}')



Training samples: 10000, AUC on testing set: 0.712


In [4]:
# Feature selection with L1 regularization
sgd_lr_l1 = SGDClassifier(loss='log_loss', penalty='l1', alpha=0.0001, fit_intercept=True, max_iter=10, learning_rate='constant', eta0=0.01)
sgd_lr_l1.fit(X_train_enc.toarray(), Y_train)



In [5]:
coef_abs = np.abs(sgd_lr_l1.coef_)
print(coef_abs)

[[0.         0.         0.17655982 ... 0.20062663 0.1491544  0.1663061 ]]


In [6]:
# bottom 10 weights and the corresponding 10 least important features
print(np.sort(coef_abs)[0][:10])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [7]:
feature_names = enc.get_feature_names_out()
bottom_10 = np.argsort(coef_abs)[0][:10]
print('10 least important features are:\n', feature_names[bottom_10])

10 least important features are:
 ['x0_1001' 'x8_90f22c00' 'x8_911a3c09' 'x8_915696e0' 'x8_91abff7d'
 'x8_91c9c83e' 'x8_93347b0a' 'x8_93dd64c2' 'x8_94405e6a' 'x8_945dd5c4']


In [8]:
# top 10 weights and the corresponding 10 most important features
print(np.sort(coef_abs)[0][-10:])
top_10 = np.argsort(coef_abs)[0][-10:]
print('10 most important features are:\n', feature_names[top_10])

[0.48578417 0.49019302 0.50574464 0.50843239 0.51963493 0.51963493
 0.57383091 0.58352054 0.62847981 0.70355033]
10 most important features are:
 ['x8_84ebbcd4' 'x3_17d996e6' 'x18_15' 'x2_5ee41ff2' 'x2_543a539e'
 'x3_c7ca3108' 'x2_d9750ee7' 'x8_81b42528' 'x3_27e3c518' 'x18_61']


In [9]:
# ---------------------------------------------------------------------------------------------
# Online learning
n_rows = 100000 * 11
df = pd.read_csv("desktop/train.csv", nrows=n_rows)

X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values
Y = df['click'].values

n_train = 100000 * 10
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train)


In [11]:
# The number of iterations is set to 1 if using partial_fit.
sgd_lr_online = SGDClassifier(loss='log_loss', penalty=None, fit_intercept=True, max_iter=1, learning_rate='constant',
                              eta0=0.01)

import timeit
start_time = timeit.default_timer()
# Use the first 1,000,000 samples for training, and the next 100,000 for testing
for i in range(10):
    x_train = X_train[i*100000:(i+1)*100000]
    y_train = Y_train[i*100000:(i+1)*100000]
    x_train_enc = enc.transform(x_train)
    sgd_lr_online.partial_fit(x_train_enc.toarray(), y_train, classes=[0, 1])

print(f"--- {(timeit.default_timer() - start_time)}.3fs seconds ---")


--- 146.986652708.3fs seconds ---


In [12]:
x_test_enc = enc.transform(X_test)

pred = sgd_lr_online.predict_proba(x_test_enc.toarray())[:, 1]
print(f'Training samples: {n_train * 10}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}')

Training samples: 10000000, AUC on testing set: 0.762


In [13]:
# Feature selection with random forest
import numpy as np
import pandas as pd
n_rows = 100000
df = pd.read_csv("desktop/train.csv", nrows=n_rows)

X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values
Y = df['click'].values


X_train = X
Y_train = Y

In [14]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X_train_enc = enc.fit_transform(X_train)

In [15]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=8, criterion='gini', min_samples_split=30, n_jobs=-1)
random_forest.fit(X_train_enc.toarray(), Y_train)
feature_imp = random_forest.feature_importances_
print(feature_imp)

[1.30542024e-06 2.63790378e-03 1.16328475e-03 ... 2.86030805e-04
 9.78512346e-05 1.04269350e-02]


In [17]:
# bottom 10 weights and the corresponding 10 least important features
feature_names = enc.get_feature_names_out()
print(np.sort(feature_imp)[:10])
bottom_10 = np.argsort(feature_imp)[:10]
print('10 least important features are:\n', feature_names[bottom_10])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
10 least important features are:
 ['x3_24b5d447' 'x8_81cc6492' 'x8_81fc96cf' 'x8_8378547a' 'x3_10796335'
 'x3_fd869226' 'x8_83ae759f' 'x3_0ce1d23a' 'x8_81a9e2c3' 'x8_84915a27']


In [18]:
# top 10 weights and the corresponding 10 most important features
print(np.sort(feature_imp)[-10:])
top_10 = np.argsort(feature_imp)[-10:]
print('10 most important features are:\n', feature_names[top_10])

[0.00770449 0.00778128 0.00948719 0.00974425 0.01042693 0.0122425
 0.01327589 0.01407838 0.03150346 0.03433456]
10 most important features are:
 ['x4_28905ebd' 'x2_5b08c53b' 'x17_-1' 'x8_8a4875bd' 'x18_157' 'x13_250'
 'x2_d9750ee7' 'x13_50' 'x15_2' 'x18_33']


In [19]:
# ---------------------------------------------------------------------------------------------
# Multiclass classification with logistic regression using load_digits dataset

from sklearn import datasets
digits = datasets.load_digits()
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
Y = digits.target
X.shape


(1797, 64)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [21]:
from sklearn.model_selection import GridSearchCV
parameters = {'penalty': ['l2', None],
              'alpha': [1e-07, 1e-06, 1e-05, 1e-04],
              'eta0': [0.01, 0.1, 1, 10]}

sgd_lr = SGDClassifier(loss='log_loss', learning_rate='constant', eta0=0.01, fit_intercept=True, max_iter=10)

In [22]:
grid_search = GridSearchCV(sgd_lr, parameters, n_jobs=-1, cv=5)

grid_search.fit(X_train, Y_train)
print(grid_search.best_params_)

sgd_lr_best = grid_search.best_estimator_
accuracy = sgd_lr_best.score(X_test, Y_test)
print(f'The accuracy on testing set is: {accuracy*100:.1f}%')

{'alpha': 1e-05, 'eta0': 10, 'penalty': None}
The accuracy on testing set is: 92.5%


