In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from itertools import combinations
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

### Load data

In [None]:
train_cleaned = pd.read_csv('https://raw.githubusercontent.com/MohamedMostafa259/Customer-Churn-Prediction-and-Analysis/main/Milestone1_DataCollection_EDA_DataCleaning/data/train_cleaned_imputed.csv')
train_cleaned.head()

In [None]:
train_cleaned.isna().sum().sum()

#### Create a copy for feature engineering

In [None]:
train_cleaned_copy = train_cleaned.copy()

### Explore columns

In [None]:
train_cleaned_copy.columns

In [None]:
train_cleaned_copy.select_dtypes('number').columns

### Extracting features

In [None]:
# Ratio features: wallet points relative to the average transaction value, and
# average transaction value relative to the average time spent.
# NOTE(review): a zero in either denominator would produce inf/NaN here - confirm
# that the cleaned data contains no zeros in these two columns.
train_cleaned_copy['points_per_transaction'] = train_cleaned_copy['points_in_wallet'] / train_cleaned_copy['avg_transaction_value']
train_cleaned_copy['transaction_value_per_time_unit'] = train_cleaned_copy['avg_transaction_value'] / train_cleaned_copy['avg_time_spent']

In [None]:
def time_of_day(hour):
    """Map an hour of the day to a coarse day-part label.

    Hours in [5, 12) are 'Morning', [12, 17) 'Afternoon', [17, 21) 'Evening';
    every other value (including hours before 5, from 21 on, and NaN) is 'Night'.
    """
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return 'Night'

In [None]:
def ampm_mapping(hour):
    """Return 'AM' for hours in [0, 12) and 'PM' for any other value."""
    return 'AM' if 0 <= hour < 12 else 'PM'

In [None]:
train_cleaned_copy['last_visit_hour'] = pd.to_datetime(train_cleaned_copy['last_visit_time']).dt.hour
train_cleaned_copy['last_visit_time_of_day'] = train_cleaned_copy['last_visit_hour'].apply(time_of_day)
train_cleaned_copy['last_visit_AMPM'] = train_cleaned_copy['last_visit_hour'].apply(ampm_mapping)

In [None]:
train_cleaned_copy['joining_date'] = pd.to_datetime(train_cleaned_copy['joining_date'])
train_cleaned_copy['joining_day_name'] = train_cleaned_copy['joining_date'].dt.day_name()

In [None]:
# 1 when the customer joined on a Saturday or Sunday, 0 otherwise.
train_cleaned_copy['is_weekend'] = (
    train_cleaned_copy['joining_day_name'].isin(['Sunday', 'Saturday']).astype(int)
)

### Save data (ready for advanced analysis)

In [None]:
train_cleaned_copy.isna().sum().sum()

In [None]:
train_cleaned_copy.to_csv('train_basicFeatureEng.csv', index=False)

In [None]:
train_cleaned_copy.select_dtypes(np.number).columns.tolist()

In [None]:
scale_cols = ['age', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 
			  'avg_frequency_login_days', 'points_in_wallet', 'points_per_transaction', 
              'transaction_value_per_time_unit', 'last_visit_hour']

## Encoding categorical variables

In [None]:
for col in train_cleaned_copy.select_dtypes(exclude='number').columns:
	if col in ['joining_date', 'last_visit_time']:
		continue
	print(f'{col} Column', '-'*50)
	print(train_cleaned_copy[col].unique())

In [None]:
# Non-numeric columns that must NOT be one-hot encoded: the raw date/time columns
# and the two columns that are encoded ordinally further down.
non_one_hot_cols = {'joining_date', 'last_visit_time', 'membership_category', 'feedback'}

# Filter the columns in their DataFrame order instead of going through a set:
# set iteration order for strings can change between kernel sessions, which would
# make the order of the dummy columns created below non-reproducible.
one_hot_cols = [
    col for col in train_cleaned_copy.select_dtypes(exclude='number').columns
    if col not in non_one_hot_cols
]
one_hot_cols

#### One-Hot Encoding (includes binary encoding)

In [None]:
train_cleaned_copy = pd.get_dummies(train_cleaned_copy, columns=one_hot_cols, drop_first=True, dtype=int)

#### Ordinal Encoding

In [None]:
# Ordinal columns: 'membership_category' (encoded here) and 'feedback' (encoded in the next cell).
# The membership tiers are replaced by their position in the ordered category list below
# (0 = 'No Membership' ... 5 = 'Premium Membership').
# NOTE(review): pd.Categorical gives code -1 to any value that is not in `categories`
# - confirm that no other membership labels occur in the data.
train_cleaned_copy['membership_category'] = pd.Categorical(train_cleaned_copy['membership_category'], 
														   categories=['No Membership', 'Basic Membership', 'Silver Membership', 
																	   'Gold Membership', 'Platinum Membership', 'Premium Membership'],
															ordered=True).codes

In [None]:
# Feedback labels grouped by sentiment.
positive_feedback = [
    'Products always in Stock',
    'Quality Customer Care',
    'Reasonable Price',
    'User Friendly Website',
]
negative_feedback = [
    'Poor Website',
    'Poor Customer Service',
    'Poor Product Quality',
    'Too many ads',
]
neutral_feedback = ['No reason specified']


def get_sentiment(feedback):
    """Score a feedback label: 1 if positive, -1 if negative, 0 for anything else."""
    if feedback in positive_feedback:
        return 1
    if feedback in negative_feedback:
        return -1
    # Neutral: 'No reason specified' and every label not listed above.
    return 0
	
train_cleaned_copy['feedback'] = train_cleaned_copy['feedback'].transform(get_sentiment)

#### General transformations of numeric features

In [None]:
# Derive sqrt / square / log variants of every numeric feature so that their
# correlation with the target can be compared against the raw feature.
#
# - The target itself is excluded: transforming `churn_risk_score` only adds
#   trivially correlated rows to the correlation table below.
# - Columns that already carry one of the derived suffixes are skipped, so
#   re-running the cell does not create `*_sqrt_sqrt`-style columns.
# - All new columns are built first and attached with a single concat instead of
#   being inserted one at a time.
derived_suffixes = ('_sqrt', '_square', '_log')
base_num_cols = [
    col for col in train_cleaned_copy.select_dtypes(include=np.number).columns
    if col != 'churn_risk_score' and not col.endswith(derived_suffixes)
]

derived_features = {}
for num_col in base_num_cols:
    values = train_cleaned_copy[num_col]
    # NOTE(review): sqrt(x + 1) and log1p(x + 0.01) are only defined for x >= -1 and
    # x > -1.01 respectively - confirm that no numeric column goes below -1.
    derived_features[f'{num_col}_sqrt'] = np.sqrt(values + 1)
    derived_features[f'{num_col}_square'] = np.square(values)
    derived_features[f'{num_col}_log'] = np.log1p(values + 0.01)

derived_df = pd.DataFrame(derived_features, index=train_cleaned_copy.index)
train_cleaned_copy = pd.concat(
    [train_cleaned_copy.drop(columns=derived_df.columns, errors='ignore'), derived_df],
    axis=1,
)

The cell below is taken from the `EDA.ipynb` notebook in Milestone 1.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# log1p(x) = log(x + 1): this avoids errors when x = 0
train_cleaned_copy['log_avg_time_spent'] = np.log1p(train_cleaned_copy['avg_time_spent'])

# Side-by-side distributions of the raw and the log-transformed feature.
fig, axes = plt.subplots(1, 2)
sns.violinplot(train_cleaned_copy['avg_time_spent'], ax=axes[0])
axes[0].set_title('avg_time_spent (raw)')
sns.violinplot(train_cleaned_copy['log_avg_time_spent'], ax=axes[1])
axes[1].set_title('avg_time_spent (log1p)')
plt.tight_layout()
plt.show()

In [None]:
train_cleaned_copy.info()

In [None]:
pd.set_option('display.max_rows', None)
train_cleaned_copy.corr(numeric_only=True)['churn_risk_score'].sort_values(ascending=False)

In [None]:
train_cleaned_copy.isna().sum().sum()

Features that we may need to transform are:

- `avg_time_spent`:

	avg_time_spent                                       -0.027045

	avg_time_spent_sqrt                                  -0.031828

	avg_time_spent_log                                   -0.032828

- `avg_transaction_value`:

	avg_transaction_value                                -0.362539

	avg_transaction_value_square                         -0.429874

In [None]:
# Guarded so that re-running this cell does not add the column a second time;
# a duplicated name in `scale_cols` would make the scaler select that column twice.
if 'avg_transaction_value_square' not in scale_cols:
    scale_cols.append('avg_transaction_value_square')

In [None]:
pd.reset_option('display.max_rows')

In [None]:
train_split_cleaned_imputed = pd.read_csv('https://raw.githubusercontent.com/MohamedMostafa259/Customer-Churn-Prediction-and-Analysis/main/Milestone1_DataCollection_EDA_DataCleaning/data/train_split_cleaned_imputed.csv')

In [None]:
train_split_cleaned_imputed.isna().sum()

In [None]:
X_train = train_split_cleaned_imputed.drop(columns=['churn_risk_score'])
y_train = train_split_cleaned_imputed['churn_risk_score']

### FeatureEng custom transformer

In [None]:
def create_categorical_combinations(X, r_list, cat_cols):
    """Build interaction features by concatenating categorical columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing every column named in ``cat_cols``.
    r_list : iterable of int
        Desired combination sizes (e.g. ``[2]`` for all pairs). Sizes below 2
        produce no output columns.
    cat_cols : list of str
        Names of the categorical columns to combine.

    Returns
    -------
    pandas.DataFrame
        One column per combination, named ``'colA+colB'`` (``'colA+colB+colC'``, ...),
        whose values are the string forms of the source values concatenated
        without a separator. The original categorical columns are not included.
        The result shares its index with ``X``.
    """
    cat_cols = list(cat_cols)
    df_str = X[cat_cols].astype(str)  # to allow concatenation

    # Collect the new columns in a dict and build the frame once, rather than
    # inserting columns one by one into an existing DataFrame.
    combo_features = {}
    for r in r_list:
        # generate all combinations of length r
        for comb in combinations(cat_cols, r):
            if len(comb) < 2:
                # A size-1 "combination" is just an original column, which is not
                # part of the output; size 0 has nothing to join.
                continue
            # Vectorised string concatenation instead of a row-wise ''.join.
            joined = df_str[comb[0]]
            for col in comb[1:]:
                joined = joined + df_str[col]
            combo_features['+'.join(comb)] = joined

    return pd.DataFrame(combo_features, index=X.index)

In [None]:
class FeatureEng(BaseEstimator, TransformerMixin):
    """Feature-engineering step for the churn data, usable inside a scikit-learn Pipeline.

    ``transform`` adds ratio features, last-visit time features, joining-day
    features and pairwise categorical combinations, encodes
    ``membership_category`` ordinally and ``feedback`` as a sentiment score, and
    adds two numeric transformations. Nothing is learned from the data except the
    names of the output columns.

    Attributes
    ----------
    feature_names_out_ : pandas.Index
        Column names of the transformed training frame; set by ``fit`` and
        ``fit_transform``.
    """

    def __init__(self):
        # Membership tiers from lowest to highest; the position is the ordinal code.
        self.membership_order = ['No Membership', 'Basic Membership', 'Silver Membership',
                                 'Gold Membership', 'Platinum Membership', 'Premium Membership']
        # Feedback labels scored +1 and -1 respectively; everything else scores 0.
        self.positive_feedback = ['Products always in Stock', 'Quality Customer Care', 'Reasonable Price', 'User Friendly Website']
        self.negative_feedback = ['Poor Website', 'Poor Customer Service', 'Poor Product Quality', 'Too many ads']

    def time_of_day(self, hour):
        """Map an hour to 'Morning' [5, 12), 'Afternoon' [12, 17), 'Evening' [17, 21) or 'Night'."""
        if 5 <= hour < 12:
            return 'Morning'
        elif 12 <= hour < 17:
            return 'Afternoon'
        elif 17 <= hour < 21:
            return 'Evening'
        else:
            return 'Night'

    def ampm_mapping(self, hour):
        """Return 'AM' for hours in [0, 12) and 'PM' otherwise."""
        return 'AM' if 0 <= hour < 12 else 'PM'

    def get_sentiment(self, feedback):
        """Score a feedback label: 1 if positive, -1 if negative, 0 otherwise."""
        if feedback in self.positive_feedback:
            return 1
        elif feedback in self.negative_feedback:
            return -1
        else:
            return 0

    def fit(self, X, y=None):
        """Record the output column names for ``X`` and return ``self``.

        Delegates to ``fit_transform`` so that ``feature_names_out_`` is available
        after a plain ``fit`` as well (previously only ``fit_transform`` set it).
        """
        self.fit_transform(X, y)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered features added.

        ``last_visit_time`` and ``joining_date`` are consumed and dropped.
        """
        X = X.copy()

        # Ratio features.
        # NOTE(review): a zero denominator would yield inf/NaN - confirm the inputs exclude zeros.
        X['points_per_transaction'] = X['points_in_wallet'] / X['avg_transaction_value']
        X['transaction_value_per_time_unit'] = X['avg_transaction_value'] / X['avg_time_spent']

        # Features derived from the time of the last visit.
        X['last_visit_hour'] = pd.to_datetime(X['last_visit_time']).dt.hour
        X['last_visit_time_of_day'] = X['last_visit_hour'].apply(self.time_of_day)
        X['last_visit_AMPM'] = X['last_visit_hour'].apply(self.ampm_mapping)
        X = X.drop(columns=['last_visit_time'])

        # Features derived from the joining date.
        X['joining_date'] = pd.to_datetime(X['joining_date'])
        X['joining_day_name'] = X['joining_date'].dt.day_name()
        X['is_weekend'] = X['joining_day_name'].isin(['Saturday', 'Sunday']).astype(int)
        X = X.drop(columns=['joining_date'])

        # Pairwise combinations of all categorical columns. This must run before the
        # ordinal/sentiment encoding below, while those columns are still categorical.
        cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)
        cat_combos_df = create_categorical_combinations(X, range(2, 3), cat_cols)
        X = pd.concat([X, cat_combos_df], axis=1)

        # Ordinal code = position in `membership_order`; values outside the list get -1.
        X['membership_category'] = pd.Categorical(X['membership_category'],
                                                  categories=self.membership_order,
                                                  ordered=True).codes

        X['feedback'] = X['feedback'].apply(self.get_sentiment)

        # Numeric transformations chosen in the exploratory section above.
        X['avg_time_spent_log'] = np.log1p(X['avg_time_spent'])
        X['avg_transaction_value_square'] = np.square(X['avg_transaction_value'])

        return X

    def fit_transform(self, X, y=None):
        """Transform ``X`` once and remember the resulting column names."""
        X_transformed = self.transform(X)
        self.feature_names_out_ = X_transformed.columns
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        """Return the output column names recorded during fitting as an array.

        ``input_features`` is accepted for scikit-learn API compatibility and ignored.
        """
        return np.asarray(self.feature_names_out_, dtype=object)

### encoder_scaler transformer 

In [None]:
# Column-wise preprocessing applied after feature engineering:
#   - 'encoder': one-hot encodes every object-dtype column; handle_unknown='ignore'
#     means categories unseen during fit are encoded as all zeros instead of raising.
#   - 'scaler': standardises the numeric columns listed in `scale_cols`.
#   - remainder='passthrough': all other columns are kept unchanged.
encoder_scaler_transformer = ColumnTransformer([
	('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int), 
  make_column_selector(dtype_include=['object'])), 
	('scaler', StandardScaler(), scale_cols)
], remainder='passthrough')

### Merge transformers into a pipeline

In [None]:
featureEng_encoder_scaler_pipeline = Pipeline([
    ('featureEng', FeatureEng()), 
    ('encoder_scaler', encoder_scaler_transformer)
])

In [None]:
X_train_preprocessed = featureEng_encoder_scaler_pipeline.fit_transform(X_train)

In [None]:
len(featureEng_encoder_scaler_pipeline.get_feature_names_out())

In [None]:
np.isinf(X_train_preprocessed).sum()

In [None]:
X_train_preprocessed = pd.DataFrame(X_train_preprocessed, 
                                    columns=featureEng_encoder_scaler_pipeline.get_feature_names_out())

#### Feature selection & Importance

In [None]:
y_train.unique()

In [None]:
class TopImportanceFeatures(BaseEstimator, TransformerMixin):
    """Keep the most important features according to a fitted model.

    Features are ranked by ``model.feature_importances_`` (descending) and the
    shortest prefix of that ranking whose cumulative importance reaches
    ``threshold`` is kept.

    Parameters
    ----------
    model : estimator
        Estimator exposing ``feature_importances_`` after ``fit``. It is fitted
        in place by ``fit``, so the fitted model stays reachable as ``.model``.
    threshold : float, default=0.95
        Cumulative importance to reach.

    Attributes
    ----------
    top_features_ : numpy.ndarray of int
        Positional indices of the selected columns, most important first.
    """

    def __init__(self, model, threshold=0.95):
        self.model = model
        self.threshold = threshold

    def fit(self, X, y=None):
        """Fit ``model`` on ``(X, y)`` and choose the columns to keep."""
        self.model.fit(X, y)
        importances = np.asarray(self.model.feature_importances_)
        sorted_idx = np.argsort(importances)[::-1]
        cumulative_importances = importances[sorted_idx].cumsum()
        # searchsorted gives the first position whose cumulative importance is
        # >= threshold; +1 turns that position into a slice length that includes it.
        cutoff_idx = np.searchsorted(cumulative_importances, self.threshold) + 1
        self.top_features_ = sorted_idx[:cutoff_idx]
        return self

    def transform(self, X):
        """Return only the selected columns of ``X`` (DataFrame or array-like)."""
        if hasattr(X, 'iloc'):
            return X.iloc[:, self.top_features_]
        # Plain arrays have no .iloc; select the same columns positionally.
        return np.asarray(X)[:, self.top_features_]

In [None]:
# Selector + classifier pipeline. The 0.3 passed to the selector is only an initial
# value: the grid below overrides `selector__threshold` for every candidate.
model_pipe = Pipeline([
    ('selector', TopImportanceFeatures(XGBClassifier(random_state=42), 0.3)), 
    ('model', XGBClassifier(random_state=42))
])

# Candidate cumulative-importance thresholds: 0.05 up to (but not including) 1, in steps of 0.05.
param_grid = {
    'selector__threshold': np.arange(0.05, 1, 0.05)
}

# No `scoring` is given, so the pipeline's default score (that of the final classifier) is used.
grid_search = GridSearchCV(model_pipe, param_grid=param_grid, verbose=5, cv=3, return_train_score=True)

# The target labels (y_train) start from 1: [1, 2, 3, 4, 5],
# but XGBoost expects them to start from 0: [0, 1, 2, 3, 4].
grid_search.fit(X_train_preprocessed, y_train-1)
# Remember to add 1 back at prediction time:
# y_pred = xgb_clf.predict(X_test_preprocessed) + 1

In [None]:
grid_search.best_params_

In [None]:
len(grid_search.best_estimator_[1].feature_importances_)

In [None]:
feature_selector = grid_search.best_estimator_[0].model
len(feature_selector.feature_importances_)

In [None]:
# Importance table for the selector's model: one row per preprocessed feature,
# sorted from most to least important, with a running total.
importances = feature_selector.feature_importances_
feature_names = X_train_preprocessed.columns
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .assign(importance=lambda df: df['importance'].round(5))
    .sort_values('importance', ascending=False)
    .reset_index()  # keeps the original feature position in an 'index' column
    .assign(cumulative_importance=lambda df: df['importance'].cumsum())
)
feature_importance_df[:30]

The best model uses only two original features: `membership_category` and `feedback`.

Let's run the grid search again, this time over a narrower threshold range: `np.arange(0.15, 0.21, 0.01)`.

In [None]:
# Same selector + classifier pipeline as before; the 0.3 is only an initial value
# that the grid below overrides.
model_pipe2 = Pipeline([
    ('selector', TopImportanceFeatures(XGBClassifier(random_state=42), 0.3)), 
    ('model', XGBClassifier(random_state=42))
])

# Narrow threshold range: 0.15 up to (nominally excluding) 0.21 in steps of 0.01.
# NOTE(review): with a float step, np.arange may or may not include the end point
# because of rounding - check the array length if the exact grid matters.
param_grid2 = {
    'selector__threshold': np.arange(0.15, 0.21, 0.01)
}

# Search over `model_pipe2`, the pipeline defined in this cell (this previously
# passed `model_pipe` from the first search, leaving `model_pipe2` unused).
grid_search2 = GridSearchCV(model_pipe2, param_grid=param_grid2, verbose=5, cv=4, return_train_score=True)

# The target labels (y_train) start from 1: [1, 2, 3, 4, 5],
# but XGBoost expects them to start from 0: [0, 1, 2, 3, 4].
grid_search2.fit(X_train_preprocessed, y_train-1)
# Remember to add 1 back at prediction time:
# y_pred = xgb_clf.predict(X_test_preprocessed) + 1

In [None]:
grid_search2.best_params_

In [None]:
len(grid_search2.best_estimator_[1].feature_importances_)

As we increase the number of features, the gap between the train and validation scores widens while the validation score barely improves, so there is a risk of overfitting. That's why I will stick with the two best features: `membership_category` and `feedback`.

Let's rebuild our `FeatureEng` class to remove unnecessary features.

## Build Feature Engineering pipeline

In [None]:
# NOTE: this redefines the `FeatureEng` class declared earlier in the notebook;
# from this cell on, the name refers to this reduced version.
class FeatureEng(BaseEstimator, TransformerMixin):
    """Reduced feature-engineering step: encode ``membership_category`` and ``feedback``.

    ``membership_category`` is replaced by its ordinal position in
    ``membership_order`` and ``feedback`` by a sentiment score (1 / -1 / 0).
    Any other column present in the input is returned unchanged.

    Attributes
    ----------
    feature_names_out_ : pandas.Index
        Column names of the transformed training frame; set by ``fit`` and
        ``fit_transform``.
    """

    def __init__(self):
        # Membership tiers from lowest to highest; the position is the ordinal code.
        self.membership_order = ['No Membership', 'Basic Membership', 'Silver Membership',
                                 'Gold Membership', 'Platinum Membership', 'Premium Membership']
        # Feedback labels scored +1 and -1 respectively; everything else scores 0.
        self.positive_feedback = ['Products always in Stock', 'Quality Customer Care', 'Reasonable Price', 'User Friendly Website']
        self.negative_feedback = ['Poor Website', 'Poor Customer Service', 'Poor Product Quality', 'Too many ads']

    def get_sentiment(self, feedback):
        """Score a feedback label: 1 if positive, -1 if negative, 0 otherwise."""
        if feedback in self.positive_feedback:
            return 1
        elif feedback in self.negative_feedback:
            return -1
        else:
            return 0

    def fit(self, X, y=None):
        """Record the output column names for ``X`` and return ``self``.

        Delegates to ``fit_transform`` so that ``feature_names_out_`` is available
        after a plain ``fit`` as well (previously only ``fit_transform`` set it).
        """
        self.fit_transform(X, y)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two columns encoded numerically."""
        X = X.copy()

        # Ordinal code = position in `membership_order`; values outside the list get -1.
        X['membership_category'] = pd.Categorical(X['membership_category'],
                                                  categories=self.membership_order,
                                                  ordered=True).codes

        X['feedback'] = X['feedback'].apply(self.get_sentiment)

        return X

    def fit_transform(self, X, y=None):
        """Transform ``X`` once and remember the resulting column names."""
        X_transformed = self.transform(X)
        self.feature_names_out_ = X_transformed.columns
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        """Return the output column names recorded during fitting as an array.

        ``input_features`` is accepted for scikit-learn API compatibility and ignored.
        """
        return np.asarray(self.feature_names_out_, dtype=object)

In [None]:
X_train_subset = X_train[['membership_category', 'feedback']]

In [None]:
featureEng_trans = FeatureEng()
X_train_preprocessed_subset = featureEng_trans.fit_transform(X_train_subset)

In [None]:
X_train_preprocessed_subset.head()

### Save preprocessed data (ready for modeling)

In [None]:
train_split_preprocessed = pd.concat([X_train_preprocessed_subset, y_train], axis=1)
train_split_preprocessed.to_csv('train_split_preprocessed.csv', index=False)