In [1]:
import pandas as pd
from pandasgui import show
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pathlib import Path

In [2]:
# Resolve the CSV location inside the user's local Kaggle dataset cache.
file_name = 'Churn_Modelling.csv'
data_dir = Path('~/.cache/kaggle/datasets/shrutime').expanduser()
data_path = data_dir.joinpath(file_name)

# Load the raw table; the path is handed to pandas as a POSIX-style string.
data: pd.DataFrame = pd.read_csv(data_path.as_posix())

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [None]:
# Inspect the row index: its value, its concrete type and its length.
row_index = data.index
print(row_index, type(row_index), len(row_index))

In [None]:
# Split the row labels (not the rows themselves) 80/20, keeping the share of
# each 'Exited' class the same in both parts.
train_index, valid_index = train_test_split(
    data.index,
    train_size=0.8,
    random_state=0,
    stratify=data['Exited'],
)

# Show each resulting index together with its type and size.
for split_index in (train_index, valid_index):
    print(split_index, type(split_index), len(split_index))

In [None]:
# `train_index` holds index *labels* taken from `data.index`, so select by
# label with `.loc`. Positional `.iloc` only gives the same rows while the
# frame keeps its default 0..n-1 index.
data.loc[train_index, ['Exited']]

In [None]:
# Explore the loaded DataFrame with the pandasgui library by passing it to
# pandasgui's `show` function.
show(data)

In [8]:
# Build the cleaned frame in a single chained expression instead of mutating
# `data` in place:
#   * 'RowNumber' and 'CustomerId' are unique per row, and 'Surname' is a
#     free-text field (second names), so all three are dropped.
#   * 'CreditScore', 'Age' and 'Tenure' are converted to floating point.
# `errors='ignore'` keeps the cell safe to re-execute: once the columns are
# gone, a second run is a no-op rather than a KeyError.
data = (
    data
    .drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
    .astype({'CreditScore': float, 'Age': float, 'Tenure': float})
)

In [None]:
# Replace each categorical text column with integer codes and print the
# learned classes for reference.
for feature in ('Geography', 'Gender'):
    _label_encoder = LabelEncoder()
    data[feature] = _label_encoder.fit_transform(data[feature])
    print(feature, _label_encoder.classes_)

In [10]:
# Stratified 80/20 split of the prepared frame on the 'Exited' target,
# with a fixed seed so the split is reproducible.
train, valid = train_test_split(
    data,
    train_size=0.8,
    random_state=0,
    stratify=data['Exited'],
)

In [None]:
from xtime.estimators import Estimator
from xtime.ml import TaskType, ClassificationTask
from xtime.datasets import (Dataset, DatasetSplit, DatasetMetadata)
from sklearn.dummy import DummyClassifier

# Wrap the train/validation frames in an xtime Dataset. Features are every
# column except the 'Exited' target; the target is passed separately as `y`.
dataset = Dataset(
    metadata=DatasetMetadata(
        name='Churn_Modelling',
        version='NA',
        task=ClassificationTask(type_=TaskType.BINARY_CLASSIFICATION, num_classes=2)
    ),
    splits={
        'train': DatasetSplit(x=train.drop(columns='Exited'), y=train['Exited']),
        # The validation split must come from `valid`. Building it from
        # `train` would make every "validation" metric a training metric.
        'valid': DatasetSplit(x=valid.drop(columns='Exited'), y=valid['Exited'])
    }
)

# Baseline: a dummy classifier using the "prior" strategy, fitted on the
# training split only.
estimator = Estimator()
estimator.model = DummyClassifier(strategy="prior").fit(dataset.splits['train'].x, dataset.splits['train'].y)

# NOTE(review): presumably `evaluate` scores the model on the dataset's splits;
# confirm against the xtime Estimator API.
metrics = estimator.evaluate(dataset)
print(metrics)