In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# one path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/sample.csv


In [2]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, StackingClassifier, IsolationForest
from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier

In [3]:
# Read the training and test CSVs with pandas' default parsing options.
train_data, test_data = map(
    pd.read_csv,
    (
        '/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv',
        '/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv',
    ),
)

In [4]:
# Split the training frame positionally: every column except the last is a
# feature, the last column is the target. The test frame is used whole.
# NOTE(review): presumably the last training column is 'Made_Purchase' (the
# column name written to the submission) - confirm against the CSV header.
x_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
x_test = test_data.iloc[:, :]

# scikit-learn renamed OneHotEncoder's `sparse` keyword to `sparse_output` in
# 1.2 and removed the old spelling in 1.4. Try the current keyword first and
# fall back to the legacy one so the notebook runs on either side of the rename.
try:
    onehc = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    onehc = OneHotEncoder(drop='first', sparse=False)

# Remaining preprocessing objects; each is fitted in its own cell further down.
scaler = StandardScaler()
imputer = IterativeImputer(random_state=42)
transformer = QuantileTransformer(output_distribution='normal', random_state=42)
resampler = BorderlineSMOTE(random_state=42)

In [5]:
# Encoding
#
# One-hot encode the categorical columns. The encoder is fitted on the training
# rows only and then applied unchanged to the test rows; the raw categorical
# columns are replaced by the encoded ones.

# Single source of truth for the columns handed to the encoder.
categorical_cols = ['Month_SeasonalPurchase', 'CustomerType', 'Gender', 'Cookies Setting', 'Education', 'Marital Status']

# Reuse the source frame's index for the encoded frame so the column-wise
# concat below lines rows up explicitly instead of relying on both frames
# happening to carry a default 0..n-1 index.
x_train_df = pd.DataFrame(
    onehc.fit_transform(x_train[categorical_cols]),
    columns=onehc.get_feature_names_out(),
    index=x_train.index,
)
x_test_df = pd.DataFrame(
    onehc.transform(x_test[categorical_cols]),
    columns=onehc.get_feature_names_out(),
    index=x_test.index,
)

# Swap the raw categorical columns for their encoded counterparts.
x_train = pd.concat([x_train.drop(categorical_cols, axis=1), x_train_df], axis=1)
x_test = pd.concat([x_test.drop(categorical_cols, axis=1), x_test_df], axis=1)

In [6]:
# Imputation
#
# Fit the iterative imputer on the training features, then reuse the fitted
# imputer on the test features. The transformers return plain arrays, so both
# results are wrapped back into DataFrames under the fitted column names.

x_train_impute = imputer.fit_transform(x_train)
x_test_impute = imputer.transform(x_test)

imputed_columns = imputer.feature_names_in_
x_train, x_test = (
    pd.DataFrame(imputed_values, columns=imputed_columns)
    for imputed_values in (x_train_impute, x_test_impute)
)

In [7]:
# Scaling
#
# Standardise the features using statistics learned from the training set
# only; the test set is transformed with those same statistics.

x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)

scaled_columns = scaler.feature_names_in_
x_train, x_test = (
    pd.DataFrame(scaled_values, columns=scaled_columns)
    for scaled_values in (x_train_scale, x_test_scale)
)

In [8]:
# Transformation
#
# Apply the quantile transformer (configured for a normal output distribution
# where it is constructed). It is fitted on the training features and reused
# as-is for the test features.

x_train_trans = transformer.fit_transform(x_train)
x_test_trans = transformer.transform(x_test)

transformed_columns = transformer.feature_names_in_
x_train, x_test = (
    pd.DataFrame(transformed_values, columns=transformed_columns)
    for transformed_values in (x_train_trans, x_test_trans)
)

In [9]:
# Outlier Detection & Removal
#
# Run an IsolationForest over the training features together with the target
# and keep only the rows it labels 1; rows labelled anything else are dropped.
# NOTE(review): the target column is part of the frame given to the forest -
# confirm that letting the label influence outlier detection is intended.

data = pd.concat([x_train, y_train], axis=1)
clf = IsolationForest(random_state=42).fit_predict(data)
clf_df = pd.DataFrame(clf, columns=['score'])

# Attach the per-row label, keep the rows labelled 1, then discard the helper column.
data = (
    pd.concat([data, clf_df], axis=1)
    .loc[lambda scored: scored['score'] == 1]
    .drop(['score'], axis=1)
)

# Split back into features (all but the last column) and an integer target (last column).
x_train = data.iloc[:, :-1]
y_train = data.iloc[:, -1].astype('int')

In [10]:
# Class Rebalancing
#
# Oversample the training set with BorderlineSMOTE. The target is replaced by
# its resampled version and the resampled features are rebuilt as a DataFrame
# under the column names the resampler saw when it was fitted.

resampled_pair = resampler.fit_resample(x_train, y_train)
x_train_resample, y_train = resampled_pair
x_train = pd.DataFrame(
    x_train_resample,
    columns=resampler.feature_names_in_,
)

In [11]:
# Estimation
#
# Stacked ensemble: AdaBoost and ExtraTrees are the base learners and an
# XGBoost classifier is the final estimator. With passthrough=True the final
# estimator is given the original features in addition to the base-learner
# outputs.

# Base learners.
ada = AdaBoostClassifier(n_estimators=49, random_state=0)
et = ExtraTreesClassifier(n_estimators=1000, random_state=0)
estimators = [('ada', ada), ('et', et)]

# Final (meta) estimator.
xgb = XGBClassifier(
    max_depth=8,
    learning_rate=0.01,
    colsample_bytree=0.3,
    random_state=42,
)

classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb,
    passthrough=True,
)

# fit() returns the fitted estimator, so training and test-set prediction chain.
y_pred = classifier.fit(x_train, y_train).predict(x_test)

In [12]:
# Build the submission file: one row per prediction, with a sequential id
# starting at 0 and the prediction cast to a boolean 'Made_Purchase' flag.
# The id range is sized from the predictions themselves rather than a
# hard-coded row count, so the cell keeps working if the test set changes size.
df = pd.DataFrame({
    'id': range(len(y_pred)),
    'Made_Purchase': pd.Series(y_pred).astype(bool),
})
df.to_csv('submission.csv', index=False)