In [4]:
# Mount Google Drive at /content/drive so the CSV inputs and the saved artifacts
# referenced later in this notebook are reachable (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import desired libraries

In [30]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.cluster import DBSCAN
from imblearn.under_sampling import TomekLinks
from sklearn.utils import resample
from sklearn.decomposition import PCA
import dill
import pickle

Data source link: https://www.kaggle.com/datasets/fridrichmrtn/e-commerce-churn-dataset-rees46

In [6]:
# Load the churn dataset and the accompanying data dictionary from Drive.
# The dictionary file is read as latin-1 (it is not decoded with the default UTF-8 codec).
data = pd.read_csv('/content/drive/MyDrive/Churn Prediction/churn_data.csv')
dictionary = pd.read_csv('/content/drive/MyDrive/Churn Prediction/dictionary.csv', encoding='latin-1')

In [7]:
# Preview the first rows of the data dictionary (attribute set, description, variable name).
dictionary.head()

Unnamed: 0,Set,Attribute,Description,Variable name
0,Dependent,churn event,No transaction in future period.,target_event
1,Dependent,retention campaign profit,Incremental profit in future period.,target_actual_profit
2,Recency,session recency,time duration from the last session [days],session_recency
3,Recency,purchase recency,time duration from the last transaction [days],purchase_recency
4,Recency,time to session,time between sessions [days],inter_session_time


Term **'Target period'** refers to the future time period for which you are making predictions. It is the period you aim to predict outcomes for, such as customer churn, revenue, or profit.

**Dependent variables:**
* target_event ==> churn event ==> No transaction in future period.
* target_actual_profit ==> retention campaign profit ==> Incremental profit in future period. Profit retained or lost based on campaigns.

**Unused variables (dropped before modelling):**
* target_revenue ==> This is the revenue generated by the customer during the target period (the period you are predicting). It represents how much money the customer spent or generated for the company during this period.
* target_customer_value ==> This is the total value or contribution of the customer in the current period. It's an aggregate measure of how valuable a customer is right now (based on revenue, engagement, etc.).

**Independent Variable:**
* target_customer_value_lag1 ==> This is the customer’s value in the previous period (one period before the current one). It shows their past contribution or engagement.

# Check the data type of each column

In [8]:
# verbose=True forces the full per-column listing, which pandas would otherwise
# abbreviate for a frame this wide.
data.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112610 entries, 0 to 112609
Data columns (total 276 columns):
 #    Column                           Dtype  
---   ------                           -----  
 0    row_id                           int64  
 1    user_id                          int64  
 2    length_mean                      float64
 3    start_year_mean                  float64
 4    start_yearday_mean               float64
 5    start_month_mean                 float64
 6    start_monthday_mean              float64
 7    start_week_mean                  float64
 8    start_weekday_mean               float64
 9    start_isweekend_mean             float64
 10   start_hour_mean                  float64
 11   haspurchase_mean                 float64
 12   click_count_mean                 float64
 13   view_count_mean                  float64
 14   cart_count_mean                  float64
 15   purchase_count_mean              float64
 16   time_to_click_mean               flo

# Deduplication

In [9]:
# Row/column count before deduplication, for comparison with the shape afterwards.
data.shape

(112610, 276)

In [10]:
# Drop rows that are identical across the columns listed in `subset`.
# NOTE(review): `columns[2:]` and `columns[:-7]` together already span every column
# of the 276-column frame, including `row_id` and `user_id` (and `columns[-3]` is
# listed a second time). If `row_id` is unique per row -- presumably it is; confirm --
# no row can ever be flagged as a duplicate, which matches the unchanged shape
# reported after this cell. The intended subset may have been `columns[2:-7]` plus
# `columns[-3]`; verify before relying on this step.
subset = data.columns[2:].to_list() + data.columns[:-7].to_list() + [data.columns[-3]]
data = data.drop_duplicates(subset=subset)

In [11]:
# Row/column count after deduplication.
data.shape

(112610, 276)

# Split data into train and test

In [12]:
# Hold out 20% of the rows as a test set. Both target columns travel together in
# the y frames; stratifying on `target_event` keeps the churn rate the same in
# both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['target_event', 'target_actual_profit'], axis=1),
    data[['target_event', 'target_actual_profit']],
    test_size=0.2,
    stratify=data['target_event'],
    shuffle=True,
    random_state=42,
)

In [13]:
# Persist the untouched (raw, un-preprocessed) test split to Drive as parquet.
# The target paths carry no file extension.
X_test.to_parquet('/content/drive/MyDrive/Churn Prediction/X_test')
y_test.to_parquet('/content/drive/MyDrive/Churn Prediction/y_test')

In [14]:
# Sanity check: feature and target frames of each partition must have matching row counts.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((90088, 274), (90088, 2), (22522, 274), (22522, 2))

# Drop unnecessary columns

In [15]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Drop a fixed set of columns plus any near-constant columns found in ``fit``.

    Parameters
    ----------
    columns : list of str, optional
        Columns that are always dropped. When ``None`` (the default) a copy of
        the class-level ``colls_to_drop`` list is used.
    threshold : float, default=0.95
        A column is dropped when its most frequent value accounts for more than
        this share of the rows seen in ``fit``.

    Attributes
    ----------
    columns : list of str
        After ``fit``: the always-drop columns followed by every near-constant
        column that was detected. Calling ``fit`` again extends this list.
    """
    colls_to_drop = ['row_id', 'user_id', 'target_revenue', 'target_customer_value']

    def __init__(self, columns=None, threshold=0.95):
        # Copy the class-level default: aliasing it would let fit() on one
        # instance leak extra columns into every other instance.
        self.columns = list(self.colls_to_drop) if columns is None else columns
        self.threshold = threshold

    def fit(self, data, y=None):
        """Detect near-constant columns in ``data`` and add them to ``self.columns``.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame whose columns are inspected.
        y : ignored

        Returns
        -------
        self
        """
        n_rows = data.shape[0]
        near_constant = []
        for name in data.columns:
            counts = data[name].value_counts()
            # value_counts() skips NaN, so an all-missing column (or an empty
            # frame) yields no counts at all; skip instead of indexing into nothing.
            if len(counts) == 0:
                continue
            # Counts are sorted in descending order, so the first one is the mode's frequency.
            if counts.values[0] / n_rows > self.threshold:
                near_constant.append(name)
        # Rebind rather than append in place so neither the class default nor a
        # caller-supplied list is mutated; dict.fromkeys de-duplicates in order.
        self.columns = list(dict.fromkeys([*self.columns, *near_constant]))
        return self

    def transform(self, data):
        """Return ``data`` without the columns collected in ``self.columns``.

        Raises
        ------
        KeyError
            If any column to drop is missing from ``data``.
        """
        return data.drop(self.columns, axis=1)

# Missing value imputation

In [16]:
class MissingValueImputationTransformer(BaseEstimator, TransformerMixin):
    """Fill missing values column-wise with a ``SimpleImputer`` statistic.

    Parameters
    ----------
    strategy : str, default='median'
        Forwarded to ``SimpleImputer(strategy=...)``.
    """

    def __init__(self, strategy='median'):
        self.imputer = None
        self.strategy = strategy

    def fit(self, X, y=None):
        """Create a fresh ``SimpleImputer`` and learn its fill values from ``X``."""
        self.imputer = SimpleImputer(strategy=self.strategy).fit(X)
        return self

    def transform(self, X):
        """Return ``X`` with missing values filled.

        A frame without any missing value is handed back as-is (same object);
        otherwise a new DataFrame carrying ``X``'s index and columns is built.
        """
        nothing_missing = not X.isnull().any().any()
        if nothing_missing:
            return X
        filled = self.imputer.transform(X)
        return pd.DataFrame(filled, index=X.index, columns=X.columns)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the imputed version of the same frame."""
        self.fit(X, y)
        return self.transform(X)

# Outlier imputation

In [17]:
class OutlierImputationTransformer(BaseEstimator, TransformerMixin):
    """Clip every column to ``[Q1 - factor * IQR, Q3 + factor * IQR]``.

    Parameters
    ----------
    factor : float, default=2.5
        Multiplier applied to the inter-quartile range when building the bounds.

    Attributes
    ----------
    bounds_ : dict
        Maps each fitted column name to ``{'lower': ..., 'upper': ...}``.
    """

    def __init__(self, factor=2.5):
        self.factor = factor
        self.bounds_ = {}

    def fit(self, X, y=None):
        """Record the clipping bounds of every column in ``X``."""
        for name in X.columns:
            series = X[name]
            first_quartile = series.quantile(0.25)
            third_quartile = series.quantile(0.75)
            spread = third_quartile - first_quartile
            self.bounds_[name] = {
                'lower': first_quartile - self.factor * spread,
                'upper': third_quartile + self.factor * spread,
            }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each column clipped to its fitted bounds.

        Raises
        ------
        KeyError
            If ``X`` has a column for which no bounds were recorded.
        """
        clipped = X.copy()
        for name in X.columns:
            limits = self.bounds_[name]
            clipped[name] = clipped[name].clip(lower=limits['lower'], upper=limits['upper'])
        return clipped

    def fit_transform(self, X, y=None):
        """Fit the bounds on ``X`` and return the clipped copy of the same frame."""
        self.fit(X, y)
        return self.transform(X)

# Scaling

In [18]:
class MinMaxScalerTransformer(BaseEstimator, TransformerMixin):
    """DataFrame-preserving wrapper around ``MinMaxScaler``.

    The scaler itself returns a bare array; this wrapper re-attaches the input
    frame's column names and index to the scaled values.
    """

    def __init__(self):
        self.scaler = MinMaxScaler()

    def fit(self, X, y=None):
        """Learn the per-column minimum and maximum from ``X``."""
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """Return the scaled values as a DataFrame labelled like ``X``."""
        scaled_values = self.scaler.transform(X)
        labelled = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)
        return labelled

    def fit_transform(self, X, y=None):
        """Fit the scaler on ``X`` and return the scaled version of the same frame."""
        self.fit(X, y)
        return self.transform(X)

# PCA - Dimension reduction

In [19]:
class PCATransformer(BaseEstimator, TransformerMixin):
    """Project the features onto principal components and return a DataFrame.

    Parameters
    ----------
    n_components : int, float or str, default=0.95
        Forwarded to ``PCA(n_components=...)``.
    svd_solver : str, default='full'
        Forwarded to ``PCA(svd_solver=...)``.

    Attributes
    ----------
    pca : sklearn.decomposition.PCA
        The underlying estimator; rebuilt from the current parameters on every ``fit``.
    """

    def __init__(self, n_components=0.95, svd_solver='full'):
        self.n_components = n_components
        self.svd_solver = svd_solver
        self.pca = PCA(n_components=self.n_components, svd_solver=self.svd_solver)

    def fit(self, X, y=None):
        """Fit a fresh PCA on ``X``.

        The estimator is rebuilt here so that hyper-parameters changed after
        construction (``set_params`` or plain attribute assignment) are honoured;
        the instance created in ``__init__`` would otherwise keep the old values.
        """
        self.pca = PCA(n_components=self.n_components, svd_solver=self.svd_solver)
        self.pca.fit(X)
        return self

    def transform(self, X):
        """Return the component scores of ``X`` as columns ``PC1`` ... ``PCk``.

        The result keeps ``X``'s index when it has one, so the rows stay
        label-aligned with any target frame that shares that index. Inputs
        without an index (e.g. arrays) get a default RangeIndex.
        """
        scores = self.pca.transform(X)
        component_names = [f'PC{i+1}' for i in range(self.pca.n_components_)]
        return pd.DataFrame(scores, columns=component_names, index=getattr(X, 'index', None))

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its component scores."""
        return self.fit(X, y).transform(X)

# Anomaly detection for train data

In [20]:
class AnomalyDetectionTransformer(BaseEstimator, TransformerMixin):
    """Remove the rows that DBSCAN labels as noise, from both ``X`` and ``y``.

    DBSCAN only labels the samples it was fitted on, so ``transform`` is only
    meaningful for the very same rows that were passed to ``fit``.

    Parameters
    ----------
    eps : float, default=3
        Forwarded to ``DBSCAN(eps=...)``.
    min_samples : int, default=3
        Forwarded to ``DBSCAN(min_samples=...)``.
    """

    def __init__(self, eps = 3, min_samples=3):
        self.eps = eps
        self.min_samples = min_samples
        self.dbscan = None

    def fit(self, X, y=None):
        """Cluster ``X`` with DBSCAN and keep the fitted estimator (and its labels)."""
        self.dbscan = DBSCAN(eps=self.eps, min_samples=self.min_samples)
        self.dbscan.fit(X)
        return self

    def transform(self, X, y=None):
        """Drop the noise rows (label ``-1``) found during ``fit``.

        Parameters
        ----------
        X : pandas.DataFrame
            The frame that was passed to ``fit``.
        y : pandas.DataFrame or pandas.Series
            Targets, row-aligned with ``X`` by position.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            ``X`` and ``y`` without the noise rows. Both are re-indexed by
            position before dropping, so they share the same index labels.

        Raises
        ------
        ValueError
            If ``y`` is missing, or if ``X``/``y`` do not have as many rows as
            the data seen in ``fit``.
        """
        if y is None:
            raise ValueError("AnomalyDetectionTransformer.transform requires y alongside X.")

        labels = self.dbscan.labels_
        if len(X) != len(labels) or len(y) != len(labels):
            # The labels are positional; applying them to a different set of
            # rows would silently remove the wrong samples.
            raise ValueError(
                "X and y must contain exactly the rows passed to fit "
                f"(expected {len(labels)}, got {len(X)} and {len(y)})."
            )

        outlier_indices = np.where(labels == -1)[0]

        # A Series target is widened to a one-column frame so the return type
        # does not depend on how y was passed in.
        y_frame = y.to_frame() if isinstance(y, pd.Series) else y

        # reset_index(drop=True) gives positional labels without materialising
        # the old index as a column, so a named index or an existing 'index'
        # column cannot break the step.
        X_train = X.reset_index(drop=True).drop(index=outlier_indices)
        y_train = y_frame.reset_index(drop=True).drop(index=outlier_indices)
        return X_train, y_train

    def fit_transform(self, X, y=None):
        """Fit DBSCAN on ``X`` and return ``(X, y)`` without the noise rows."""
        self.fit(X, y)
        return self.transform(X, y)

# Tomek Link sampling method for train data

In [21]:
class TomekLinksSampler(BaseEstimator, TransformerMixin):
    """Under-sample the training data by removing Tomek links.

    ``transform`` takes an ``(X, y)`` tuple and returns one, so the step can be
    chained with the other resampling transformers in this notebook.

    Parameters
    ----------
    sampling_strategy : str, default='majority'
        Forwarded to ``TomekLinks(sampling_strategy=...)``.
    n_jobs : int, default=-1
        Forwarded to ``TomekLinks(n_jobs=...)``.
    target_column : str, default='target_event'
        Class label column inside ``y`` used for the resampling.
    profit_column : str, default='target_actual_profit'
        Second target column inside ``y``; carried along with the surviving
        rows but never used as a feature.
    """

    def __init__(self, sampling_strategy='majority', n_jobs=-1, target_column = 'target_event',
                 profit_column='target_actual_profit'):
        self.sampling_strategy = sampling_strategy
        self.n_jobs = n_jobs
        self.target_column = target_column
        self.profit_column = profit_column
        self.tomek = TomekLinks(sampling_strategy=self.sampling_strategy, n_jobs=self.n_jobs)

    def fit(self, X, y=None):
        """No-op; the resampling happens entirely in ``transform``."""
        return self

    def transform(self, data):
        """Remove Tomek links and return the surviving ``(X, y)`` rows.

        Parameters
        ----------
        data : tuple of (pandas.DataFrame, pandas.DataFrame)
            Features and targets sharing the same index.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            Features without the two target columns, and a frame holding
            ``target_column`` and ``profit_column``; both on a fresh RangeIndex.
        """
        X, y = data
        dt = pd.concat([X, y], axis=1)

        # Keep the profit target out of the neighbour search: it is an outcome,
        # not a feature, and would otherwise influence which samples count as
        # Tomek links.
        features = dt.drop([self.target_column, self.profit_column], axis=1)
        self.tomek.fit_resample(features, dt[self.target_column])

        # sample_indices_ holds the positions of the retained rows; selecting
        # them from the combined frame keeps both targets attached to their rows.
        kept_positions = self.tomek.sample_indices_
        new_data = dt.iloc[kept_positions].reset_index(drop=True)

        target_columns = [self.target_column, self.profit_column]
        return new_data.drop(target_columns, axis=1), new_data[target_columns]

# Oversampling method for train data

In [22]:
class OversamplingTransformer(BaseEstimator, TransformerMixin):
    """Balance the classes by resampling class ``1`` with replacement.

    Rows whose target equals ``1`` are drawn with replacement until there are
    as many of them as rows whose target equals ``0``.

    Parameters
    ----------
    target_column : str, default='target_event'
        Binary class label column inside ``y``.
    profit_column : str, default='target_actual_profit'
        Second target column inside ``y``; returned next to ``target_column``.
    random_state : int, default=42
        Seed forwarded to ``sklearn.utils.resample``.
    """

    def __init__(self, target_column='target_event', profit_column='target_actual_profit',
                 random_state=42):
        self.target_column = target_column
        self.profit_column = profit_column
        self.random_state = random_state

    def fit(self, X, y=None):
        """No-op; the resampling happens entirely in ``transform``."""
        return self

    def transform(self, data):
        """Return the class-balanced ``(X, y)`` pair.

        Parameters
        ----------
        data : tuple of (pandas.DataFrame, pandas.DataFrame)
            Features and targets, row-aligned by position.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            Features without the two target columns, and a frame holding
            ``target_column`` and ``profit_column``. Class-0 rows come first,
            followed by the resampled class-1 rows.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different numbers of rows.
        """
        X, y = data
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows (got {len(X)} and {len(y)})."
            )

        # Reset BOTH indexes: concat(axis=1) aligns on labels, so resetting only
        # y would pair rows wrongly (and create NaN rows) whenever X does not
        # already carry a default RangeIndex.
        combined = pd.concat(
            [X.reset_index(drop=True), y.reset_index(drop=True)], axis=1
        )

        majority_class = combined[combined[self.target_column] == 0]
        minority_class = combined[combined[self.target_column] == 1]

        oversampled_minority = resample(
            minority_class,
            replace=True,
            n_samples=len(majority_class),
            random_state=self.random_state
        )

        oversampled_data = pd.concat([majority_class, oversampled_minority])

        target_columns = [self.target_column, self.profit_column]
        return oversampled_data.drop(target_columns, axis=1), oversampled_data[target_columns]

# Pipeline for test feature data

In [23]:
# Feature-preprocessing steps, applied in this order:
# median imputation -> IQR clipping -> column dropping -> min-max scaling -> PCA.
# NOTE(review): despite the name, this pipeline is fitted on the training
# features further down in the notebook; presumably the fitted object is then
# reused to transform the test features -- confirm against the consumer notebook.
test_pipline = Pipeline([
    ('missing_imput', MissingValueImputationTransformer(strategy='median')),
    ('outlier imputation', OutlierImputationTransformer()),
    ('drop_columns', DropColumnsTransformer()),
    ('scaling', MinMaxScalerTransformer()),
    ('pca', PCATransformer())
])

# Pipeline for train data

In [24]:
# Row-level resampling steps for the training set only:
# DBSCAN noise removal -> Tomek-link under-sampling -> class-1 oversampling.
# Each step returns an (X, y) tuple, and the two samplers unpack such a tuple
# in their transform(), which is how the targets travel through the pipeline.
train_pipeline = Pipeline([
    ('anomaly_detection', AnomalyDetectionTransformer()),
    ('tomek_link', TomekLinksSampler()),
    ('oversampling', OversamplingTransformer())
])

# Fit pipelines

In [25]:
# Fit the feature-preprocessing pipeline on the training features and keep the
# transformed frame under the same name.
# NOTE: X_train is overwritten, so re-running this cell would fit on already
# transformed data; re-run the train/test split cell first.
X_train = test_pipline.fit_transform(X_train)

In [26]:
# Run the resampling steps on the preprocessed training features and their
# targets; the last step returns the final (features, targets) pair.
X_transformed, y_transformed = train_pipeline.fit_transform(X_train, y_train)

# Save X_transformed and y_transformed

In [27]:
# Persist the preprocessed and resampled training data to Drive as parquet.
# The target paths carry no file extension.
X_transformed.to_parquet('/content/drive/MyDrive/Churn Prediction/X_train_preprocessed')
y_transformed.to_parquet('/content/drive/MyDrive/Churn Prediction/y_train_preprocessed')

# Save pipelines with dill

In [28]:
!pip install dill



In [33]:
# Serialize the fitted feature-preprocessing pipeline to Drive.
# NOTE(review): the file has a .pkl extension but is written with dill, so it
# should be read back with dill.load -- confirm that consumers do so.
with open('/content/drive/MyDrive/Churn Prediction/test_pipeline.pkl', 'wb') as file:
    dill.dump(test_pipline, file)

In [34]:
# Serialize the fitted resampling pipeline to Drive.
# NOTE(review): the file has a .pkl extension but is written with dill, so it
# should be read back with dill.load -- confirm that consumers do so.
with open('/content/drive/MyDrive/Churn Prediction/train_pipeline.pkl', 'wb') as file:
    dill.dump(train_pipeline, file)