<a href="https://colab.research.google.com/github/GaoangLiu/AA_ipynb/blob/master/Santander_Customer_Transaction_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!curl -o data.zip ali.140714.xyz:8000/santander.zip 
!unzip -o data.zip 
!ls 

In [0]:
'''
Load data and import basic module
'''
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')


In [0]:
# small, _ = train_test_split(train, test_size=0.9, stratify=train['target'])
labels = train.target
df0 = train.drop(columns=['target', 'ID_code'])
df0

In [0]:
df0

In [0]:
'''write a baseline model
'''
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

ss = StandardScaler()
df1 = pd.DataFrame(ss.fit_transform(df0), columns=df0.columns)

test0 = test.copy()
test0.drop(columns=['ID_code'], inplace=True)
test1 = pd.DataFrame(ss.transform(test0), columns=test0.columns)

X_train, X_val, y_train, y_val = train_test_split(df1, labels, test_size=0.8, stratify=labels, random_state=0)



# XGBoost

In [83]:
# Load packages
import timeit
import numpy as np
import pandas as pd
import logging
import time

# sklearn modules
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score

import xgboost 

logging.basicConfig(
	format='[%(asctime)s %(levelname)8s] %(message)s',
	level=logging.INFO,
	datefmt='%m-%d %H:%M:%S')


class XGB():
	'''Logistic Regression
	'''
	def __init__(self, train, test, labels):
		self.train = train 
		self.test = test
		self.labels = labels
	
	def simple_train(self):
		'''In case the dataset is too large to do grid search, and we
		need to test xgboost performance directly
		'''
		params = {'colsample_bytree': 0.4,                 
			'learning_rate': 0.1,
			'max_depth': 4,
			'subsample': 1,
			'min_child_weight': 4,
			'gamma': 0.24,
			'alpha': 0,
			'lambda': 1,
			'objective': 'reg:squarederror',
			'seed': 1220,
			'n_estimators': 200
		}
		self.model = xgboost.XGBRegressor(**params)
		X_train, X_val, y_train, y_val = train_test_split(self.train, self.labels, stratify=self.labels, test_size=0.2, random_state=0)
		# eval_set = [(X_train, y_train), (X_val, y_val)]
		# eval_metric = [self.metric]

		self.model.fit(X_train, y_train)
		y_preds = self.model.predict(X_val)
		print("Validation roc auc score", roc_auc_score(y_val, y_preds))

		return self.model
	
	def grid_search_cv(self):
		'''Search optimized parameters
		'''
		gbm_param_grid = {
			'colsample_bytree': [0.2, 0.4, 0.6, 0.8],
			'n_estimators': [50, 100, 200],
			'max_depth': [4, 5, 6, 7]
		}

		params = {'learning_rate': 0.3,
			'subsample': 1,
			'min_child_weight': 4,
			'gamma': 0.24,
			'alpha': 0,
			'lambda': 1,
			'objective': 'reg:squarederror',
			'seed': 1220,
		}
		cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
		grid = GridSearchCV(estimator=xgboost.XGBRegressor(**params),
							param_grid=gbm_param_grid,
							scoring='roc_auc',
							n_jobs=-1,
							cv=cv, verbose=51)

		grid.fit(self.train, self.labels)
		print(grid.best_score_)
		print(grid.best_params_)
		print(grid.best_estimator_)
		
		self.model = grid.best_estimator_
		return self.model

	def predict(self, target, sample_file, sub_file):
		"""
		@parameters:
		target: column name of target
		sample_file: sample submission file
		sub_file: submission file to be exported to
		"""
		logging.info("Making predictions on test dataset")
		preds = self.model.predict(self.test)
		sub = pd.read_csv(sample_file)
		sub[target] = preds
		sub.to_csv(sub_file, index=False)
		return preds 

# df2, _, y2, _ = train_test_split(df1, labels, stratify=labels, random_state=2, train_size=0.1)
# o = XGB(df2, test1, y2)
# %time o.grid_search_cv()

o = XGB(df1, test1, labels)
%time o.simple_train()
o.predict('target', 'sample_submission.csv', 'submission_xgb.csv')

[06-04 12:17:56     INFO] Making predictions on test dataset


Validation roc auc score 0.8580663939889215
CPU times: user 4min 44s, sys: 136 ms, total: 4min 44s
Wall time: 4min 44s


array([ 0.13981298,  0.2654688 ,  0.2759922 , ..., -0.00616401,
        0.1353879 ,  0.08319959], dtype=float32)

In [62]:
# o.predict('target', 'sample_submission.csv', 'submission_xgb.csv')
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

# LR

In [47]:
'''Try logistic regression
The current best result is 0.86095, seems this score is unsurpassable
'''
import timeit
import numpy as np
import pandas as pd
import logging
import time

# sklearn modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score

logging.basicConfig(
	format='[%(asctime)s %(levelname)8s] %(message)s',
	level=logging.INFO,
	datefmt='%m-%d %H:%M:%S')


class LR():
	'''Logistic Regression
	'''
	def __init__(self, train, test, labels):
		self.train = train 
		self.test = test
		self.labels = labels
	
	def grid_search_cv(self):
		'''Search optimized parameters
		'''
		more_cvals = [i / 10 for i in range(1, 10)] + list(range(1, 20))
		simple_cvals = [0.2, 0.8, 1, 3, 5]
		penalties = ['l2', 'l1']
		params = {'penalty': penalties, 'C': more_cvals}
		cv = StratifiedShuffleSplit(n_splits=10, test_size=.25)
		grid = GridSearchCV(estimator=LogisticRegression(verbose=1),
							param_grid=params,
							scoring='accuracy',
							n_jobs=-1,
							cv=cv, verbose=1)

		grid.fit(self.train, self.labels)
		print(grid.best_score_)
		print(grid.best_params_)
		print(grid.best_estimator_)
		
		self.model = grid.best_estimator_
		return self.model

	def predict(self, target, sample_file, sub_file):
		logging.info("Making predictions on test dataset")
		preds = [p[1] for p in self.model.predict_proba(self.test)]
		sub = pd.read_csv(sample_file)
		sub[target] = preds
		sub.to_csv(sub_file, index=False)
		return preds 


m = LR(df1, test1, labels)	
%time m.grid_search_cv()
preds = m.predict('target', 'sample_submission.csv', 'submission_lr.csv')


Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:  5.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished
[06-04 07:38:30     INFO] Making predictions on test dataset


0.9146099999999999
{'C': 0.3, 'penalty': 'l2'}
LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False)
CPU times: user 6.07 s, sys: 750 ms, total: 6.82 s
Wall time: 5min 38s


[0.1907629451834064,
 0.24704067987227385,
 0.04723549197177474,
 0.2034318331738896,
 0.06367065295490693,
 0.0026678444838705534,
 0.007752910600382162,
 0.15401108439133168,
 0.0021436006227062094,
 0.007624999876553927,
 0.1401210039172066,
 0.09488135553032122,
 0.16693915818334992,
 0.011053377234360022,
 0.004858983711823622,
 0.030290939259779638,
 0.18613557470916917,
 0.027189749228913047,
 0.12383403636469066,
 0.025813565649018162,
 0.20447311929225867,
 0.049828629605520716,
 0.02035908963864961,
 0.06344323719970349,
 0.13391775542018744,
 0.05441872135486142,
 0.04679652186939345,
 0.0017809432809082022,
 0.12915142493398585,
 0.204281350968462,
 0.08856308298482743,
 0.03232456378849513,
 0.5578151748025758,
 0.18346061169722994,
 0.004486490572775629,
 0.021070587890364447,
 0.002772187477881773,
 0.034749939635970395,
 0.008276259167668937,
 0.007091549929339702,
 0.01774063865038707,
 0.014876680207721689,
 0.02083796526359474,
 0.003558547313574577,
 0.0312853532967

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

# Save model / upload file

In [0]:
%%time
import pickle
pickle.dump(model, open('santander_xgboost_0.87425.pkl', 'wb'))


In [0]:
# %%time
'''Make predictions
'''
model = m.model
preds = [p[1] for p in model.predict_proba(test1)]
sub['target'] = preds
sub.to_csv('santander_lr.csv', index=False)

In [84]:
!ls -lt submission_xgb.csv
!curl -X PUT --upload-file submission_xgb.csv ali.140714.xyz:8000/

-rw-r--r-- 1 root root 4572961 Jun  4 12:17 submission_xgb.csv
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
 95 4465k    0     0   95 4272k      0  1623k  0:00:02  0:00:02 --:--:-- 1622k

## Received: "submission_xgb.csv"

100 4465k    0    37  100 4465k     13  1585k  0:00:02  0:00:02 --:--:-- 1585k
