In [1]:
from pathlib import Path

import pandas as pd
from pandasgui import show
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Locate the Telco customer churn CSV inside the per-user Kaggle cache directory.
file_name = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
data_dir = Path("~/.cache/kaggle/datasets/blastchar").expanduser()
data_path = data_dir.joinpath(file_name)

# Read the whole CSV into a DataFrame; the location is handed to pandas as a POSIX-style string.
data: pd.DataFrame = pd.read_csv(data_path.as_posix())

In [None]:
# Explore the freshly loaded DataFrame interactively with the PandasGUI library.
# NOTE(review): presumably this opens a separate GUI window; confirm it does not stall a "Run All".
show(data)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit a one-hot encoder on the `MultipleLines` column, reshaped to the (n_samples, 1) layout that
# scikit-learn transformers expect for a single feature.
_multiple_lines = data["MultipleLines"].to_numpy().reshape(-1, 1)

_ohe: OneHotEncoder
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`; the old `sparse` keyword
    # was deprecated in 1.2 and removed in 1.4, where passing it raises TypeError.
    _ohe = OneHotEncoder(sparse_output=False, dtype=int)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    _ohe = OneHotEncoder(sparse=False, dtype=int)
_ohe.fit(_multiple_lines)

# Categories discovered during fitting (one array per input feature).
print(_ohe.categories_)

In [5]:
# Apply the fitted encoder to `MultipleLines` (as a single-feature 2-D array) ...
_encoded = _ohe.transform(data["MultipleLines"].values.reshape(-1, 1))

# ... and wrap the result in a DataFrame with one `MultipleLines_<category>` column per category.
_df = pd.DataFrame(
    _encoded,
    columns=[f"MultipleLines_{category}" for category in _ohe.categories_[0]],
)

In [None]:
# Inspect the source `MultipleLines` column side by side with its one-hot encoding. Concatenating
# `_df` with itself would only duplicate the encoded columns and show nothing to compare against.
# Both operands carry the default integer index at this point, so rows line up one-to-one.
show(pd.concat([data["MultipleLines"], _df], axis=1))

In [None]:
# Preview the column names generated for the one-hot encoded `MultipleLines` feature.
[f"MultipleLines_{category}" for category in _ohe.categories_[0]]

In [None]:
# Pretty much all fields are categorical, except `customerID`, which needs to be removed.
# `errors="ignore"` keeps the cell re-runnable: on a second execution the column is already gone,
# and a plain drop would raise KeyError.
data = data.drop(columns="customerID", errors="ignore")

# `tenure` is read as int: convert it to float.
data["tenure"] = data["tenure"].astype(float)

# `TotalCharges` is read as object: convert it to floating point numbers. Values that cannot be
# parsed become NaN (`errors="coerce"`), and every row containing a NaN is then removed.
_orig_size = len(data)
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")
data = data.dropna(axis=0, how="any")
print(f"While casting TotalCharges to floats, {_orig_size - len(data)} instances have been removed.")

In [9]:
# Binary features (and the `Churn` label) are encoded as 0/1.
data["gender"] = LabelEncoder().fit_transform(data["gender"])

_yes_no_codes = {"No": 0, "Yes": 1}
for feature in ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]:
    # Assign the result back to the frame. Calling `replace(..., inplace=True)` on `data[feature]`
    # is chained assignment: it warns on recent pandas and silently leaves `data` untouched under
    # Copy-on-Write. Values other than "No"/"Yes" pass through unchanged (so re-running the cell
    # on already encoded 0/1 values is a no-op), and pandas infers an integer dtype when every
    # value has been mapped.
    data[feature] = data[feature].map(lambda value: _yes_no_codes.get(value, value))

# Multi-class categorical features are encoded as integer codes 0 .. n_classes - 1.
for feature in [
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaymentMethod",
]:
    data[feature] = LabelEncoder().fit_transform(data[feature])

In [10]:
# Name of the target column.
label: str = "Churn"

# 80/20 train/validation split, stratified on the target so both parts keep the same class balance;
# the fixed random state makes the split reproducible.
train, valid = train_test_split(
    data,
    train_size=0.8,
    random_state=0,
    stratify=data[label],
)

In [None]:
from sklearn.dummy import DummyClassifier

from xtime.datasets import Dataset, DatasetMetadata, DatasetSplit
from xtime.estimators import Estimator
from xtime.ml import ClassificationTask, TaskType

def _as_split(frame: pd.DataFrame) -> DatasetSplit:
    """Build a dataset split: every column except `label` becomes `x`, the `label` column becomes `y`."""
    return DatasetSplit(x=frame.drop(columns=label), y=frame[label])


# Wrap the train/validation frames into a dataset described as a binary classification task.
dataset = Dataset(
    metadata=DatasetMetadata(
        name="telco_customer_churn",
        version="NA",
        task=ClassificationTask(type_=TaskType.BINARY_CLASSIFICATION),
    ),
    splits={"train": _as_split(train), "valid": _as_split(valid)},
)

# Baseline: a dummy classifier using the "prior" strategy, fitted on the training split only.
estimator = Estimator()
_train_split = dataset.splits["train"]
estimator.model = DummyClassifier(strategy="prior").fit(_train_split.x, _train_split.y)

# Evaluate the baseline on the dataset and report the resulting metrics.
metrics = estimator.evaluate(dataset)
print(metrics)