In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from itertools import combinations
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

### Load data

In [2]:
train_cleaned = pd.read_csv('https://raw.githubusercontent.com/MohamedMostafa259/Customer-Churn-Prediction-and-Analysis/main/Milestone1_DataCollection_Exploration_Preprocessing/data/train_cleaned.csv')
train_cleaned.head()

Unnamed: 0,age,gender,region_category,membership_category,joining_date,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,last_visit_time,days_since_last_login,avg_time_spent,avg_transaction_value,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,35,F,City,Basic Membership,2016-04-08,Yes,Without Offers,Desktop,Mobile_Data,12:51:39,5.0,1175.956109,4248.84,515.15,No,Yes,Yes,Unsolved,No reason specified,5.0
1,34,M,City,Platinum Membership,2017-09-08,No,Gift Vouchers/Coupons,Smartphone,Wi-Fi,14:34:44,14.0,252.02,26946.21,716.692748,No,Yes,Yes,Solved in Follow-up,No reason specified,3.0
2,39,M,City,Platinum Membership,2017-06-23,No,Credit/Debit Card Offers,Smartphone,Fiber_Optic,18:52:18,18.0,258.69,6032.17,796.99,Yes,No,Yes,Solved in Follow-up,User Friendly Website,1.0
3,14,M,Town,No Membership,2017-04-02,No,Gift Vouchers/Coupons,Desktop,Wi-Fi,16:17:42,15.0,307.37,41719.14,698.24,No,Yes,Yes,No Information Available,Poor Product Quality,5.0
4,63,M,Town,Premium Membership,2016-12-17,No,Gift Vouchers/Coupons,Desktop,Wi-Fi,18:02:49,16.0,52.68,1775.36,784.28,Yes,No,Yes,No Information Available,Poor Website,3.0


#### Create a copy for feature engineering

In [3]:
train_cleaned_copy = train_cleaned.copy()

### Explore columns

In [4]:
train_cleaned_copy.columns

Index(['age', 'gender', 'region_category', 'membership_category',
       'joining_date', 'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [5]:
train_cleaned_copy.select_dtypes('number').columns

Index(['age', 'days_since_last_login', 'avg_time_spent',
       'avg_transaction_value', 'points_in_wallet', 'churn_risk_score'],
      dtype='object')

### Extracting features

In [6]:
train_cleaned_copy['points_per_transaction'] = train_cleaned_copy['points_in_wallet'] / train_cleaned_copy['avg_transaction_value']
train_cleaned_copy['transaction_value_per_time_unit'] = train_cleaned_copy['avg_transaction_value'] / train_cleaned_copy['avg_time_spent']

In [7]:
def time_of_day(hour):
	if 5 <= hour < 12:
		return 'Morning'
	elif 12 <= hour < 17:
		return 'Afternoon'
	elif 17 <= hour < 21:
		return 'Evening'
	else:
		return 'Night'

In [8]:
def ampm_mapping(hour):
	if 0 <= hour < 12:
		return 'AM'
	else:
		return 'PM'

In [9]:
train_cleaned_copy['last_visit_hour'] = pd.to_datetime(train_cleaned_copy['last_visit_time']).dt.hour
train_cleaned_copy['last_visit_time_of_day'] = train_cleaned_copy['last_visit_hour'].apply(time_of_day)
train_cleaned_copy['last_visit_AMPM'] = train_cleaned_copy['last_visit_hour'].apply(ampm_mapping)

In [10]:
train_cleaned_copy['joining_date'] = pd.to_datetime(train_cleaned_copy['joining_date'])
train_cleaned_copy['joining_day_name'] = train_cleaned_copy['joining_date'].dt.day_name()

In [11]:
train_cleaned_copy['is_weekend'] = ((train_cleaned_copy['joining_day_name'] == 'Sunday') | 
									(train_cleaned_copy['joining_day_name'] == 'Saturday')).astype(int) 

### Save data (ready for advanced analysis)

In [12]:
train_cleaned_copy.isna().sum()

age                                0
gender                             0
region_category                    0
membership_category                0
joining_date                       0
joined_through_referral            0
preferred_offer_types              0
medium_of_operation                0
internet_option                    0
last_visit_time                    0
days_since_last_login              0
avg_time_spent                     0
avg_transaction_value              0
points_in_wallet                   0
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
churn_risk_score                   0
points_per_transaction             0
transaction_value_per_time_unit    0
last_visit_hour                    0
last_visit_time_of_day             0
last_visit_AMPM                    0
joining_day_name                   0
is_weekend                         0
d

In [None]:
train_cleaned_copy.to_csv('train_cleaned_basicFeatureEng.csv', index=False)

In [14]:
train_cleaned_copy.select_dtypes(np.number).columns.tolist()

['age',
 'days_since_last_login',
 'avg_time_spent',
 'avg_transaction_value',
 'points_in_wallet',
 'churn_risk_score',
 'points_per_transaction',
 'transaction_value_per_time_unit',
 'last_visit_hour',
 'is_weekend']

In [15]:
scale_cols = ['age', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 
			  'points_in_wallet', 'points_per_transaction', 'transaction_value_per_time_unit', 
			  'last_visit_hour']

## Encoding categorical variables

In [16]:
for col in train_cleaned_copy.select_dtypes(exclude='number').columns:
	if col in ['joining_date', 'last_visit_time']:
		continue
	print(f'{col} Column', '-'*50)
	print(train_cleaned_copy[col].unique())

gender Column --------------------------------------------------
['F' 'M' 'Unknown']
region_category Column --------------------------------------------------
['City' 'Town' 'Village']
membership_category Column --------------------------------------------------
['Basic Membership' 'Platinum Membership' 'No Membership'
 'Premium Membership' 'Silver Membership' 'Gold Membership']
joined_through_referral Column --------------------------------------------------
['Yes' 'No']
preferred_offer_types Column --------------------------------------------------
['Without Offers' 'Gift Vouchers/Coupons' 'Credit/Debit Card Offers']
medium_of_operation Column --------------------------------------------------
['Desktop' 'Smartphone' 'Both']
internet_option Column --------------------------------------------------
['Mobile_Data' 'Wi-Fi' 'Fiber_Optic']
used_special_discount Column --------------------------------------------------
['No' 'Yes']
offer_application_preference Column ----------------------

In [17]:
one_hot_cols = list(set(train_cleaned_copy.select_dtypes(exclude='number').columns.tolist()) - 
					{'joining_date', 'last_visit_time'} - {'membership_category', 'feedback'})
one_hot_cols

['preferred_offer_types',
 'joining_day_name',
 'last_visit_time_of_day',
 'past_complaint',
 'medium_of_operation',
 'gender',
 'joined_through_referral',
 'offer_application_preference',
 'region_category',
 'used_special_discount',
 'internet_option',
 'last_visit_AMPM',
 'complaint_status']

#### One-Hot Encoding (includes binary encoding)

In [18]:
train_cleaned_copy = pd.get_dummies(train_cleaned_copy, columns=one_hot_cols, drop_first=True, dtype=int)

#### Ordinal Encoding

In [19]:
# ordinal_cols = ['membership_category', 'feedback']
train_cleaned_copy['membership_category'] = pd.Categorical(train_cleaned_copy['membership_category'], 
														   categories=['No Membership', 'Basic Membership', 'Silver Membership', 
																	   'Gold Membership', 'Platinum Membership', 'Premium Membership'],
															ordered=True).codes

In [20]:
positive_feedback = ['Products always in Stock', 'Quality Customer Care', 'Reasonable Price', 'User Friendly Website']
negative_feedback = ['Poor Website' ,'Poor Customer Service', 'Poor Product Quality', 'Too many ads']
neutral_feedback = ['No reason specified']

def get_sentiment(feedback):
	if feedback in positive_feedback:
		return 1
	elif feedback in negative_feedback:
		return -1
	else:
		return 0 # neutral
	
train_cleaned_copy['feedback'] = train_cleaned_copy['feedback'].transform(get_sentiment)

#### General transformations numeric features

In [21]:
for num_col in train_cleaned_copy.select_dtypes(include=np.number).columns:
	train_cleaned_copy[f'{num_col}_sqrt'] = np.sqrt(train_cleaned_copy[num_col] + 1)
	train_cleaned_copy[f'{num_col}_square'] = np.square(train_cleaned_copy[num_col])
	train_cleaned_copy[f'{num_col}_log'] = np.log1p(train_cleaned_copy[num_col] + 0.01)  

In [22]:
train_cleaned_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28663 entries, 0 to 28662
Columns: 162 entries, age to complaint_status_Unsolved_log
dtypes: datetime64[ns](1), float16(1), float64(93), int32(60), int64(4), int8(2), object(1)
memory usage: 28.3+ MB


In [23]:
pd.set_option('display.max_rows', None)
train_cleaned_copy.corr(numeric_only=True)['churn_risk_score'].sort_values(ascending=False)

churn_risk_score                                      1.000000
churn_risk_score_sqrt                                 0.996109
churn_risk_score_log                                  0.983755
churn_risk_score_square                               0.979840
joined_through_referral_Yes_sqrt                      0.065683
joined_through_referral_Yes_log                       0.065683
joined_through_referral_Yes                           0.065683
joined_through_referral_Yes_square                    0.065683
preferred_offer_types_Without Offers_log              0.061909
preferred_offer_types_Without Offers_sqrt             0.061909
preferred_offer_types_Without Offers                  0.061909
preferred_offer_types_Without Offers_square           0.061909
medium_of_operation_Smartphone_square                 0.051193
medium_of_operation_Smartphone                        0.051193
medium_of_operation_Smartphone_log                    0.051193
medium_of_operation_Smartphone_sqrt                   0

In [24]:
train_cleaned_copy.isna().sum()[train_cleaned_copy.isna().sum() > 0]

Series([], dtype: int64)

features that we may need to transform are:

- `avg_time_spent`:

	avg_time_spent                                       -0.027045

	avg_time_spent_sqrt                                  -0.031828

	avg_time_spent_log                                   -0.032828

- `avg_transaction_value`:

	avg_transaction_value                                -0.362539

	avg_transaction_value_square                         -0.429874

In [25]:
scale_cols.append('avg_transaction_value_square')

In [26]:
pd.reset_option('display.max_rows')

## Build Feature Engineering pipeline

### FeatureEng custom transformer

In [27]:
# X is DataFrame of training features
# r_list is a list of desired combination sizes
# returns DataFrame with new combination features, excluding original categorical columns
def create_categorical_combinations(X, r_list, cat_cols):
	df_str = X[cat_cols].astype(str) # to allow concatenation
	for r in r_list:
		# generate all combinations of length r
		combinations_iter = combinations(cat_cols, r)
		for comb in combinations_iter:
			df_str['+'.join(comb)] = df_str[list(comb)].agg(''.join, axis=1)
	return df_str.drop(columns=cat_cols)

In [28]:
class FeatureEng(BaseEstimator, TransformerMixin):
	def __init__(self):
		self.membership_order = ['No Membership', 'Basic Membership', 'Silver Membership',
								 'Gold Membership', 'Platinum Membership', 'Premium Membership']
		self.positive_feedback = ['Products always in Stock', 'Quality Customer Care', 'Reasonable Price', 'User Friendly Website']
		self.negative_feedback = ['Poor Website', 'Poor Customer Service', 'Poor Product Quality', 'Too many ads']

	def time_of_day(self, hour):
		if 5 <= hour < 12:
			return 'Morning'
		elif 12 <= hour < 17:
			return 'Afternoon'
		elif 17 <= hour < 21:
			return 'Evening'
		else:
			return 'Night'
	
	def ampm_mapping(self, hour):
		return 'AM' if 0 <= hour < 12 else 'PM'

	def get_sentiment(self, feedback):
		if feedback in self.positive_feedback:
			return 1
		elif feedback in self.negative_feedback:
			return -1
		else:
			return 0

	def fit(self, X, y=None):
		return self

	def transform(self, X):
		X = X.copy()
		
		X['points_per_transaction'] = X['points_in_wallet'] / X['avg_transaction_value']
		X['transaction_value_per_time_unit'] = X['avg_transaction_value'] / X['avg_time_spent']
		
		X['last_visit_hour'] = pd.to_datetime(X['last_visit_time']).dt.hour
		X['last_visit_time_of_day'] = X['last_visit_hour'].apply(self.time_of_day)
		X['last_visit_AMPM'] = X['last_visit_hour'].apply(self.ampm_mapping)
		X.drop('last_visit_time', axis=1, inplace=True)
		
		X['joining_date'] = pd.to_datetime(X['joining_date'])
		X['joining_day_name'] = X['joining_date'].dt.day_name()
		X['is_weekend'] = X['joining_day_name'].isin(['Saturday', 'Sunday']).astype(int)
		X.drop('joining_date', axis=1, inplace=True)
		
		cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)
		cat_combos_df = create_categorical_combinations(X, range(2, 3), cat_cols)
		X = pd.concat([X, cat_combos_df], axis=1)

		X['membership_category'] = pd.Categorical( X['membership_category'], 
												  categories=self.membership_order, 
												  ordered=True).codes

		X['feedback'] = X['feedback'].apply(self.get_sentiment)
		 
		X['avg_time_spent_log'] = np.log1p(X['avg_time_spent'])  
		X['avg_transaction_value_square'] = np.square(X['avg_transaction_value'])

		return X
	
	def fit_transform(self, X, y=None):
		X_transformed = self.transform(X)
		self.feature_names_out_ = X_transformed.columns
		return X_transformed
	
	def get_feature_names_out(self, input_features=None):
		return self.feature_names_out_

### encoder_scaler transformer 

In [29]:
encoder_scaler_transformer = ColumnTransformer([
	('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int), 
  make_column_selector(dtype_include=['object'])), 
	('scaler', StandardScaler(), scale_cols)
], remainder='passthrough')

In [30]:
featureEng_encoder_scaler_pipeline = Pipeline([
    ('featureEng', FeatureEng()), 
    ('encoder_scaler', encoder_scaler_transformer)
])

In [31]:
X_train = train_cleaned.drop(columns=['churn_risk_score'])
y_train = train_cleaned['churn_risk_score']

In [32]:
X_train_preprocessed = featureEng_encoder_scaler_pipeline.fit_transform(X_train)

In [33]:
len(featureEng_encoder_scaler_pipeline.get_feature_names_out())

1467

In [34]:
train_cleaned_copy.shape[1]

162

In [35]:
X_train_preprocessed = pd.DataFrame(X_train_preprocessed, 
                                    columns=featureEng_encoder_scaler_pipeline.get_feature_names_out())

In [36]:
np.isinf(X_train_preprocessed.values).sum()

0

#### Feature selection & Importance

In [37]:
y_train.unique()

array([5., 3., 1., 2., 4.])

In [38]:
# target labels (y_train) starts from '1': [1, 2, 3, 4, 5], 
# but XGBoost expects them to start from 0, like [0, 1, 2, 3, 4].
xgb_clf = XGBClassifier(random_state=42).fit(X_train_preprocessed, y_train-1)
# don't forget to add one in the prediction time:
# y_pred = xgb_clf.predict(X_test_preprocessed) + 1

In [39]:
pd.set_option('display.max_rows', None)
importances = xgb_clf.feature_importances_
feature_names = X_train_preprocessed.columns
feature_importance_df = pd.DataFrame({'feature':feature_names, 'importance':importances})
feature_importance_df['importance'] = feature_importance_df['importance'].round(4)
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False).reset_index()
feature_importance_df['cumulative_importance'] = feature_importance_df['importance'].cumsum()
feature_importance_df

Unnamed: 0,index,feature,importance,cumulative_importance
0,1463,remainder__membership_category,0.0847,0.0847
1,1464,remainder__feedback,0.0765,0.1612
2,1458,scaler__points_in_wallet,0.0129,0.1741
3,1338,encoder__feedback+last_visit_AMPM_Reasonable P...,0.0018,0.1759
4,1355,encoder__feedback+joining_day_name_Poor Custom...,0.0016,0.1775
5,1216,encoder__complaint_status+feedback_UnsolvedNo ...,0.0016,0.1791
6,1126,encoder__offer_application_preference+joining_...,0.0015,0.1806
7,126,encoder__gender+complaint_status_MSolved,0.0015,0.1821
8,798,encoder__preferred_offer_types+joining_day_nam...,0.0015,0.1836
9,1023,encoder__used_special_discount+complaint_statu...,0.0015,0.1851


In [40]:
# example
feature_importance_df.iloc[3, 1]

'encoder__feedback+last_visit_AMPM_Reasonable PriceAM'

In [41]:
pd.reset_option('display.max_rows')

In [42]:
# let's select the first 100 features where they have importance of around 30%
# important_features = feature_importance_df.iloc[:99]['index'].tolist()
# X_train_preprocessed_selected = X_train_preprocessed[:, important_features]
# ... let's continue this code in a separate notebook for model development!

### Save preprocessed data (ready for modeling)

In [43]:
train_preprocessed = pd.concat([X_train_preprocessed, y_train], axis=1)
# use .parquet instead of .csv as the csv version is too large (> 100 MB), 
# so it cannot be uploaded to GitHub
train_preprocessed.to_parquet('train_preprocessed.parquet', index=False)

### Save pipelines & models

In [44]:
import joblib
joblib.dump(featureEng_encoder_scaler_pipeline, 'featureEng_encoder_scaler_pipeline.joblib')
joblib.dump(xgb_clf, 'xgb_clf_all_features.joblib')

['xgb_clf_all_features.joblib']