In [12]:
import pandas as pd
import numpy as np
import os
import sys
from src.entity.config_entity import DataTransformConfig
from src.exception import CustomException
from src.logger import logging
from src.utils import read_yaml, create_directories
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
from src.components.data_transform import DataTransform


In [None]:
# Step up to the parent directory so that the relative "artifacts/..." paths
# used by the loading cell resolve.
# Guarded so the cell is idempotent: if the data-ingestion folder is already
# reachable from the current directory, re-running must not climb another level.
if not os.path.isdir("artifacts/data_ingestion"):
    os.chdir("..")

In [2]:
# Load the four CSV tables written to the data-ingestion artifacts folder.
# The paths are read in the same order as the names on the left-hand side.
members_df, transactions_df, user_logs_df, train_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "artifacts/data_ingestion/members_v3.csv",
        "artifacts/data_ingestion/transactions_v2.csv",
        "artifacts/data_ingestion/user_logs_v2.csv",
        "artifacts/data_ingestion/train_v2.csv",
    )
)

### Members Dataset:
- Drop age (`bd`) and `gender` due to outliers
- Drop registration time (`registration_init_time`)

In [3]:
# Remove the bd, gender and registration_init_time columns; every other
# member column is kept unchanged.
unused_member_cols = ["bd", "gender", "registration_init_time"]
members_df_drop = members_df.drop(columns=unused_member_cols)

### User Logs Dataset
- Sum all log features per user ID (`msno`)

In [4]:
# Collapse the logs to one row per user: every non-key column is summed within
# each "msno" group, then "msno" is turned back into an ordinary column.
# NOTE(review): sum() is applied to *all* remaining columns; if the logs carry
# a date-like column it is summed as well — confirm that is intended.
user_logs_df_summed = user_logs_df.groupby("msno").sum().reset_index()

### Transactions
- Get latest transaction of each user ID
- Add feature num_prev_transactions
- Add feature total_prev_paid
- Add feature total_prev_cancelled
- Add feature num_prev_discounts
- Add feature curr_discount
- Drop membership expiry date and transaction date


In [5]:
# Latest transaction per user: order the rows by transaction_date, then keep
# only the last row seen for each msno. sort_values returns a new frame, so
# transactions_df itself is left untouched.
latest_transactions = (
    transactions_df
    .sort_values(by='transaction_date')
    .drop_duplicates(subset='msno', keep='last')
)

In [6]:
# Previous transactions = every row of transactions_df that does NOT equal a
# user's latest transaction on all columns (an anti-join).
# This is done with a vectorised merge rather than building a Python tuple per
# row with apply(tuple, axis=1), which is very slow on large frames.
all_cols = list(transactions_df.columns)
matched = transactions_df.merge(
    latest_transactions[all_cols],  # raises KeyError if latest_transactions no longer has these columns
    on=all_cols,
    how="left",
    indicator=True,          # adds "_merge": "both" when the row equals a latest-transaction row
    validate="many_to_one",  # fail loudly if latest_transactions is not unique on these columns
)
# A left merge against unique right-hand keys yields exactly one output row per
# input row, in input order, so the flags can be re-attached to the original index.
mask = pd.Series((matched["_merge"] == "both").to_numpy(), index=transactions_df.index)
prev_transactions = transactions_df[~mask]

In [7]:
# --- Per-user aggregates over the previous (non-latest) transactions ---
prev_by_user = prev_transactions.groupby('msno')
num_transactions = prev_by_user.size().reset_index(name='num_prev_transactions')
total_prev_paid = prev_by_user['actual_amount_paid'].sum().reset_index(name='total_prev_paid')
total_prev_cancelled = prev_by_user['is_cancel'].sum().reset_index(name='total_prev_cancelled')

# A previous transaction counts as discounted when the amount paid is below the list price.
was_discounted = prev_transactions['plan_list_price'] > prev_transactions['actual_amount_paid']
num_prev_discounts = (
    prev_transactions[was_discounted]
    .groupby('msno')
    .size()
    .reset_index(name='num_prev_discounts')
)

# --- Attach the aggregates to each user's latest transaction ---
# Left joins keep users that have no previous transactions; their aggregates come back as NaN.
for feature_frame in (num_transactions, total_prev_paid, num_prev_discounts, total_prev_cancelled):
    latest_transactions = latest_transactions.merge(feature_frame, on='msno', how='left')

# Users without a matching aggregate row get 0 instead of NaN.
history_cols = ['num_prev_transactions', 'total_prev_paid', 'num_prev_discounts', 'total_prev_cancelled']
for col in history_cols:
    latest_transactions[col] = latest_transactions[col].fillna(0)

# 1 when the latest transaction itself was paid below list price, else 0.
latest_transactions["curr_discount"] = (
    latest_transactions['plan_list_price'].gt(latest_transactions['actual_amount_paid']).astype(int)
)

# Both date columns are removed from the frame at this point.
latest_transactions = latest_transactions.drop(columns=['membership_expire_date', 'transaction_date'])

### Merge with train dataset

In [24]:
# Starting from train_df, inner-join each feature table on "msno".
# With inner joins, a user missing from any one of the tables is dropped.
final_dataset = (
    train_df
    .merge(latest_transactions, how='inner', on='msno')
    .merge(user_logs_df_summed, how='inner', on='msno')
    .merge(members_df_drop, how='inner', on='msno')
)

In [25]:
# "msno" served as the join key above; remove it from the merged dataset.
final_dataset = final_dataset.drop(columns=["msno"])

In [None]:
# Extra: inspect the categorical columns

In [30]:
# Feature frame X: all columns of final_dataset except "is_churn".
X = final_dataset.drop(columns=["is_churn"])


Unnamed: 0,payment_method_id,city,registered_via
count,725722.0,725722.0,725722.0
mean,38.582516,6.197446,6.907958
std,3.935867,6.503173,1.961436
min,3.0,1.0,3.0
25%,37.0,1.0,7.0
50%,41.0,4.0,7.0
75%,41.0,13.0,9.0
max,41.0,22.0,13.0


In [31]:
# Distinct payment_method_id values, in order of first appearance.
pd.unique(X["payment_method_id"])

array([36, 15, 41, 40, 38, 32, 39, 35, 20, 30, 29, 16, 37, 13, 22, 28, 34,
       10, 31, 26, 27, 21, 33, 17, 12, 18, 23, 19,  3, 14,  6,  8, 11],
      dtype=int64)

In [32]:
# Distinct city values, in order of first appearance.
pd.unique(X["city"])

array([13, 22,  9,  1, 14,  4, 15, 18, 12,  3, 21,  6,  5,  8,  7, 17, 11,
       10, 16, 20, 19], dtype=int64)

In [33]:
# Distinct registered_via values, in order of first appearance.
pd.unique(X["registered_via"])

array([ 3,  7,  9,  4, 13], dtype=int64)

In [20]:
# Build the preprocessing pipeline from the project's DataTransform component
# and apply it to the feature frame.
transformer = DataTransform()
X = final_dataset.drop("is_churn", axis=1)

# These three integer-coded columns are passed as categorical; every other
# column of X (returned by Index.difference in sorted order) is passed as numerical.
categorical_var = ["payment_method_id", "city", "registered_via"]
numerical_var = X.columns.difference(categorical_var)
preprocessor = transformer.get_preprocessor(categorical_var, numerical_var)

# NOTE(review): the preprocessor is fitted on the whole of X, with no
# train/test split beforehand — confirm this is intended for this exploration.
X_transformed = preprocessor.fit_transform(X)

# get_preprocessor presumably returns a scikit-learn ColumnTransformer (TODO confirm);
# that class returns a sparse matrix or a dense ndarray depending on output
# density, so only call .toarray() when the result actually provides it.
if hasattr(X_transformed, "toarray"):
    X_train_final = X_transformed.toarray()
else:
    X_train_final = np.asarray(X_transformed)

In [23]:
# (rows, columns) of the transformed feature array.
np.shape(X_train_final)

(967019, 76)