In [1]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/sample.csv


In [2]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier, IsolationForest
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier

In [3]:
# Read the competition's training and test CSV files into DataFrames
# (unpacked in the same order as the paths are listed).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv',
        '/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv',
    )
)

In [4]:
# Split the training frame: every column but the last becomes the feature
# matrix, the last column becomes the target. The test frame is used whole.
x_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
x_test = test_data.iloc[:, :]

# Components configured here and fitted in later cells.
outlier = IsolationForest(random_state=42)
resampler = BorderlineSMOTE(random_state=42)
selector = VarianceThreshold(threshold=0.3)

# One-hot encode the listed columns (dropping the first level of each) and
# pass every other column through unchanged.
# Dense output is requested with `sparse_threshold=0` on the column transformer
# instead of `OneHotEncoder(sparse=False)`: that keyword was deprecated in
# scikit-learn 1.2 (renamed `sparse_output`) and removed in 1.4, where it raises
# a TypeError. `sparse_threshold=0` always returns a dense array and is accepted
# by both older and newer releases, so the resulting matrix is unchanged.
encoder = make_column_transformer(
    (
        OneHotEncoder(drop='first'),
        ['Month_SeasonalPurchase', 'CustomerType', 'Gender', 'Cookies Setting', 'Education', 'Marital Status'],
    ),
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

# Preprocessing chain: encode -> impute missing values -> scale to [0, 1]
# -> map each feature to a normal distribution.
pipe = make_pipeline(
    encoder,
    IterativeImputer(random_state=42),
    MinMaxScaler(),
    QuantileTransformer(output_distribution='normal', random_state=42),
)

In [5]:
# Encoding, Imputation, Scaling, Transformation

# Fit the preprocessing pipeline on the training features only, then reuse the
# fitted pipeline on the test features. Column names come from the fitted
# encoder, so they must be looked up after `fit_transform` has run.
train_matrix = pipe.fit_transform(x_train)
encoded_columns = encoder.get_feature_names_out()

x_train = pd.DataFrame(train_matrix, columns=encoded_columns)
x_test = pd.DataFrame(pipe.transform(x_test), columns=encoded_columns)

In [6]:
# Outlier Detection & Removal

# The isolation forest is fitted on the features together with the target
# column; `fit_predict` labels each row 1 (kept below) or -1 (discarded below).
data = pd.concat([x_train, y_train], axis=1)
is_inlier = outlier.fit_predict(data) == 1
data = data.loc[is_inlier]

# Split the surviving rows back into features (all but the last column) and
# target (last column), casting the target to integers.
x_train = data.iloc[:, :-1]
y_train = data.iloc[:, -1].astype('int')

In [7]:
# Class Rebalancing

# Oversample the training set with the configured resampler; both the feature
# matrix and the target are replaced by their resampled versions.
resampled = resampler.fit_resample(x_train, y_train)
x_train_resample, y_train = resampled
x_train = pd.DataFrame(x_train_resample)

In [8]:
# Feature Selection

# Fit the variance-threshold selector on the training features, then keep the
# same columns in the test features. The kept column names are read from the
# fitted selector, so the lookup happens after `fit_transform`.
selected_train = selector.fit_transform(x_train)
kept_columns = selector.get_feature_names_out()

x_train = pd.DataFrame(selected_train, columns=kept_columns)
x_test = pd.DataFrame(selector.transform(x_test), columns=kept_columns)

In [9]:
# Estimation

# Base learners whose outputs feed the stacking ensemble.
ada = AdaBoostClassifier(n_estimators=49, random_state=0)
et = ExtraTreesClassifier(n_estimators=1000, random_state=0)
estimators = [('ada', ada), ('et', et)]

# Final (meta) estimator of the stack.
xgb = XGBClassifier(
    objective="binary:logistic",
    max_depth=8,
    learning_rate=0.01,
    colsample_bytree=0.3,
    random_state=42,
)

classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb,
    passthrough=True,
)

# Fit on the preprocessed training data and predict labels for the test data.
y_pred = classifier.fit(x_train, y_train).predict(x_test)

In [10]:
# Submission

# Build the submission frame: one sequential id (starting at 0) per prediction
# and the predicted label as a boolean. The id range is derived from the number
# of predictions rather than a hard-coded row count, so the cell keeps working
# (instead of raising a length-mismatch error) if the test set size changes.
df = pd.DataFrame({
    'id': range(len(y_pred)),
    'Made_Purchase': y_pred,
})
df['Made_Purchase'] = df['Made_Purchase'].astype(bool)

# Written to the working directory without the DataFrame index column.
df.to_csv('submission.csv', index=False)