## Import Libraries & Packages

In [2]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import PCA
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from yellowbrick.cluster import KElbowVisualizer

dataset URL: https://www.kaggle.com/datasets/imakash3011/customer-personality-analysis

# Pipeline

In [3]:
# Data Import Function
def import_data(path):
    """Load the tab-separated marketing campaign file into a DataFrame.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, one row per line of the file.
    """
    raw_frame = pd.read_csv(path, delimiter='\t')
    return raw_frame

In [4]:
# Missing Values Imputation Function
def impute_missing(data):
    """Fill missing ``Income`` values with model-based (iterative) imputation.

    The imputer is fitted on ``Year_Birth``, ``Kidhome``, ``Teenhome`` and
    ``Income`` together with one-hot encoded ``Education`` and
    ``Marital_Status``. Only the missing ``Income`` cells are written back;
    every other column is returned as it came in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the six columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and no missing ``Income``.
        The caller's frame is left untouched.
    """
    # reset_index returns a new object, so nothing below mutates the caller's frame.
    result = data.reset_index(drop=True)

    missing_income = result['Income'].isnull()
    if not missing_income.any():
        # Nothing to fill: the fitted imputer's output would never be used.
        return result

    numeric_columns = ['Year_Birth', 'Kidhome', 'Teenhome', 'Income']
    categorical_columns = ['Education', 'Marital_Status']

    # One-hot encode the categoricals so they can act as predictors for Income.
    encoder = OneHotEncoder(drop='first')
    encoded = pd.DataFrame(
        encoder.fit_transform(result[categorical_columns]).toarray(),
        columns=encoder.get_feature_names_out(categorical_columns),
    )

    # Both parts carry the same 0..n-1 index, so the concat is row-aligned.
    predictors = pd.concat([result[numeric_columns], encoded], axis=1)

    imputer = IterativeImputer()
    imputed = pd.DataFrame(imputer.fit_transform(predictors), columns=predictors.columns)

    result.loc[missing_income, 'Income'] = imputed.loc[missing_income, 'Income']
    return result

In [5]:
# Outliers Removal Function
def remove_outliers(data, invalid_birth_years=(1893, 1899, 1900), income_cap=160000):
    """Drop rows with implausible birth years or extreme income.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Year_Birth`` and ``Income`` columns.
    invalid_birth_years : iterable of int, optional
        Birth years treated as data-entry errors; matching rows are removed.
    income_cap : number, optional
        Exclusive upper bound on ``Income``; rows at or above it are removed.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.
    """
    plausible_birth = ~data['Year_Birth'].isin(list(invalid_birth_years))
    # A missing Income compares as False here, so such rows are dropped as well.
    below_cap = data['Income'] < income_cap
    return data[plausible_birth & below_cap]

In [6]:
# Feature Engineering Function
def feature_engineering(data, reference_year=2016):
    """Derive customer-level features from the cleaned campaign table.

    Added columns, in order: ``Enroll_Year``/``Enroll_Month``/``Enroll_Day``,
    ``Member_Year``, ``Age``, ``Total_amount``, ``Total_purchase``,
    ``Total_Children``, ``Is_Parent``, ``Family_Size``, ``Cmp_Attitude``,
    binary ``R``/``F``/``M`` flags, ``RFM_Score``, and one-hot columns for
    ``RFM_Group``, ``Age_Demographic`` and ``Income_Category``.
    ``Dt_Customer``, ``Z_CostContact`` and ``Z_Revenue`` are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Cleaned table; it is copied, never modified.
    reference_year : int, optional
        Year in which customers are assumed to be evaluated. Used for ``Age``
        and ``Member_Year``. The default of 2016 follows the original
        assumption (latest enrolment year in the data is 2014).

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns.

    Raises
    ------
    ValueError
        If ``Marital_Status`` holds a value outside the eight known statuses
        (the household-size conversion to int fails).
    """
    features = data.copy()

    # Enrolment date: the slices assume a 'dd-mm-yyyy' string.
    signup = features['Dt_Customer']
    features['Enroll_Year'] = signup.str[-4:].astype(int)
    features['Enroll_Month'] = signup.str[3:5].astype(int)
    features['Enroll_Day'] = signup.str[0:2].astype(int)
    features = features.drop(columns=['Dt_Customer'])
    features['Member_Year'] = reference_year - features['Enroll_Year']
    features['Age'] = reference_year - features['Year_Birth']

    # Spending and purchase totals.
    spend_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                     'MntSweetProducts', 'MntGoldProds']
    purchase_columns = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    features['Total_amount'] = features[spend_columns].sum(axis=1)
    features['Total_purchase'] = features[purchase_columns].sum(axis=1)

    # Household composition.
    features['Total_Children'] = features['Kidhome'] + features['Teenhome']
    features['Is_Parent'] = (features['Total_Children'] > 0).astype(int)

    # Number of adults implied by marital status. Using .map instead of
    # .replace avoids pandas' silent-downcasting FutureWarning.
    adults_in_household = {"Married": 2, "Together": 2,
                           "Absurd": 1, "Widow": 1, "YOLO": 1, "Divorced": 1, "Single": 1, "Alone": 1}
    features['Family_Size'] = (
        features['Marital_Status'].map(adults_in_household).astype(int)
        + features['Total_Children']
    )

    # 1 when at least one of the five earlier campaigns was accepted.
    # skipna=False keeps the plain-addition semantics (a missing flag yields 0).
    campaign_columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    features['Cmp_Attitude'] = (
        features[campaign_columns].sum(axis=1, skipna=False) > 0
    ).astype(int)

    # Binary RFM flags, each split at the median of its underlying measure.
    recency_50th = features['Recency'].quantile(0.5)
    frequency_50th = features['Total_purchase'].quantile(0.5)
    monetary_50th = features['Total_amount'].quantile(0.5)
    features['R'] = (features['Recency'] <= recency_50th).astype(int)
    features['F'] = (features['Total_purchase'] > frequency_50th).astype(int)
    features['M'] = (features['Total_amount'] > monetary_50th).astype(int)

    # Concatenate the three flags into a group code such as '101'.
    features['RFM_Score'] = (
        features['R'].astype(str) + features['F'].astype(str) + features['M'].astype(str)
    )
    group_names = {
        '111': 'Loyalist',
        '011': 'Potential Loyalist',
        '101': 'New Customers',
        '001': 'Promising',
        '110': 'At Risk',
        '010': 'Need Attention',
        '100': 'About To Sleep',
        '000': 'Hibernating'
    }
    features['RFM_Group'] = features['RFM_Score'].map(group_names)
    features = pd.get_dummies(features, columns=['RFM_Group'])

    # Generation by birth year. np.select picks the first matching condition;
    # anything unmatched (including a missing year) falls through to the default.
    birth_year = features['Year_Birth'].to_numpy()
    features['Age_Demographic'] = np.select(
        [birth_year <= 1964, birth_year <= 1980, birth_year <= 1996],
        ["Baby Boomer", "Gen X", "Gen Y"],
        default="Gen Z",
    )
    features = pd.get_dummies(features, columns=['Age_Demographic'])

    # Income bracket, same first-match logic; a missing income lands in "High".
    income = features['Income'].to_numpy()
    features['Income_Category'] = np.select(
        [income <= 2333, income <= 12427],
        ["Low", "Medium"],
        default="High",
    )
    features = pd.get_dummies(features, columns=['Income_Category'])

    # Drop the columns that are not needed.
    return features.drop(columns=['Z_CostContact', 'Z_Revenue'])

In [7]:
# One-Hot Encoding Function
def one_hot_encoding(data):
    """One-hot encode ``Education`` and ``Marital_Status``.

    The first level of each categorical is dropped (``drop='first'``). The two
    source columns and the string ``RFM_Score`` column are removed from the
    result, and the encoded indicator columns are appended on the right.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Education``, ``Marital_Status`` and ``RFM_Score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. The caller's frame is left
        untouched.
    """
    categorical_columns = ['Education', 'Marital_Status']

    # Work on a re-indexed copy so the positional concat below is row-aligned
    # without resetting the caller's index in place.
    frame = data.reset_index(drop=True)

    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_df = pd.DataFrame(
        encoder.fit_transform(frame[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
    )

    remaining = frame.drop(columns=categorical_columns + ['RFM_Score'])
    return pd.concat([remaining, encoded_df], axis=1)


In [8]:
# Pipeline Definition
# Wrap each stage function in a FunctionTransformer and chain them in execution
# order; every step is named after the function it wraps.
preprocessing_pipeline = Pipeline(
    steps=[
        (stage.__name__, FunctionTransformer(stage))
        for stage in (
            import_data,
            impute_missing,
            remove_outliers,
            feature_engineering,
            one_hot_encoding,
        )
    ]
)

In [9]:
# Location of the raw tab-separated file. The default is an absolute path on
# the original author's machine; set the MARKETING_CAMPAIGN_CSV environment
# variable to point at a local copy without editing the notebook.
path = os.environ.get(
    'MARKETING_CAMPAIGN_CSV',
    '/Users/qianlou/Documents/GitHub/Customer-Personality-Analysis-2.0/Data/Raw Data/marketing_campaign.csv',
)
# The pipeline's first step reads the file, so the path string is the input.
processed_data = preprocessing_pipeline.transform(path)

  data_newfeature["Family_Size"] = data_newfeature["Marital_Status"].replace({"Married": 2, "Together": 2, \


# Feature Selection

## LassoCV

In [10]:
def feature_selection_lasso(df, target_column):
    """Select features for ``target_column`` with a cross-validated Lasso.

    A fixed set of identifier, raw and redundant engineered columns is dropped
    first. For the ``Total_purchase`` and ``Total_amount`` targets, the
    component columns that sum to the target (and the other total) are dropped
    too. The remaining features are standardized, a 5-fold ``LassoCV`` is
    fitted, and only features with a non-zero coefficient are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully preprocessed frame (output of the preprocessing pipeline).
    target_column : str
        Name of the column to predict; it must survive the fixed drop list.

    Returns
    -------
    pandas.DataFrame
        Standardized selected features followed by the unscaled target, on a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any column in the drop lists, or ``target_column``, is absent.
    """
    # Drop columns that are not needed.
    df = df.drop(columns=['ID', 'Year_Birth', 'Kidhome', 'Teenhome', 'AcceptedCmp1', 'AcceptedCmp2',
                          'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Enroll_Year', 'Enroll_Month',
                          'Enroll_Day', 'Member_Year', 'Total_Children', 'Is_Parent', 'Family_Size',
                          'RFM_Group_About To Sleep', 'RFM_Group_At Risk', 'RFM_Group_Hibernating',
                          'RFM_Group_Loyalist', 'RFM_Group_Need Attention', 'RFM_Group_New Customers',
                          'RFM_Group_Potential Loyalist', 'RFM_Group_Promising', 'Age_Demographic_Baby Boomer',
                          'Age_Demographic_Gen X', 'Age_Demographic_Gen Y', 'Income'])

    # Remove the addends of a summed target so the model cannot rebuild it exactly.
    if target_column == 'Total_purchase':
        df = df.drop(columns=['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumDealsPurchases',
                              'Total_amount'])
    elif target_column == 'Total_amount':
        df = df.drop(columns=['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                              'MntSweetProducts', 'MntGoldProds', 'Total_purchase'])
    # NOTE(review): the binary R/F/M flags are thresholded versions of Recency /
    # Total_purchase / Total_amount and stay in the feature set even when the
    # matching raw column is the target. Confirm this is intended.

    # Separate the features and the target.
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Standardize once; the scaled matrix serves both the fit and the output.
    X_scaled = StandardScaler().fit_transform(X)

    # Features whose Lasso coefficient shrinks to exactly zero are discarded.
    lasso = LassoCV(cv=5).fit(X_scaled, y)
    keep = lasso.coef_ != 0

    # Standardization is per column, so slicing the scaled matrix equals
    # re-scaling the selected subset, and it also works when nothing is selected.
    df_selected = pd.DataFrame(X_scaled[:, keep], columns=X.columns[keep])

    # Attach the target by position: df_selected has a fresh RangeIndex, so a
    # label-aligned assignment would misplace values for a non-default index.
    df_selected[target_column] = y.to_numpy()

    return df_selected

### Target column: 'Response' (whether the customer accepted the offer in the most recent campaign)

In [11]:
target_column = 'Response'
# sort_values returns a new frame, so its result has to be kept for the
# ordering to take effect; a stable sort keeps tied rows in their prior order.
data_selected = (
    feature_selection_lasso(processed_data, target_column)
    .sort_values(by=target_column, ascending=False, kind='stable')
    .reset_index(drop=True)
)
data_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Recency                  2234 non-null   float64
 1   MntFruits                2234 non-null   float64
 2   MntMeatProducts          2234 non-null   float64
 3   MntFishProducts          2234 non-null   float64
 4   MntSweetProducts         2234 non-null   float64
 5   MntGoldProds             2234 non-null   float64
 6   NumDealsPurchases        2234 non-null   float64
 7   NumWebPurchases          2234 non-null   float64
 8   NumCatalogPurchases      2234 non-null   float64
 9   NumStorePurchases        2234 non-null   float64
 10  NumWebVisitsMonth        2234 non-null   float64
 11  Complain                 2234 non-null   float64
 12  Age                      2234 non-null   float64
 13  Total_amount             2234 non-null   float64
 14  Cmp_Attitude            

In [12]:
# Summary statistics: the selected features were standardized by
# feature_selection_lasso, while the target column keeps its original scale.
data_selected.describe()

Unnamed: 0,Recency,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,...,Education_Basic,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO,Response
count,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,...,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0
mean,3.8167020000000006e-17,-4.6913630000000007e-17,1.9083510000000003e-17,2.9420410000000005e-17,-2.2264100000000002e-17,4.2937900000000003e-17,6.997287000000001e-17,8.587580000000001e-17,-1.9083510000000003e-17,-1.196695e-16,...,2.5444680000000002e-17,-3.498644e-17,4.7708780000000007e-17,-6.3611700000000005e-18,1.049593e-16,5.725053000000001e-17,6.281656000000001e-17,0.0,1.2722340000000001e-17,0.149508
std,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,...,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,0.356668
min,-1.697005,-0.6617849,-0.7441592,-0.6872581,-0.6562155,-0.8455772,-1.212761,-1.4728,-0.922995,-1.785309,...,-0.1573869,-0.445531,-0.525207,-0.03666999,-0.7933899,-0.5224313,-0.5901014,-0.188938,-0.02993422,0.0
25%,-0.868136,-0.6366142,-0.6726053,-0.6323672,-0.6320035,-0.6726956,-0.6903338,-0.7528363,-0.922995,-0.8618592,...,-0.1573869,-0.445531,-0.525207,-0.03666999,-0.7933899,-0.5224313,-0.5901014,-0.188938,-0.02993422,0.0
50%,0.01253754,-0.4604191,-0.4445273,-0.4676945,-0.4625199,-0.3845596,-0.1679064,-0.03287213,-0.2272059,-0.2462258,...,-0.1573869,-0.445531,-0.525207,-0.03666999,-0.7933899,-0.5224313,-0.5901014,-0.188938,-0.02993422,0.0
75%,0.8586749,0.1688489,0.2922541,0.2275899,0.142779,0.2301304,0.354521,0.687092,0.4685831,0.6772243,...,-0.1573869,-0.445531,-0.525207,-0.03666999,1.260414,-0.5224313,1.694624,-0.188938,-0.02993422,0.0
max,1.72208,4.347188,6.970244,4.051655,5.711528,6.108104,6.62365,8.246715,8.818051,2.216308,...,6.353768,2.244513,1.904011,27.27025,1.260414,1.914127,1.694624,5.29273,33.40659,1.0


### Target column: 'Total_purchase' (total number of purchases)

In [13]:
target_column = 'Total_purchase'
# sort_values returns a new frame, so its result has to be kept for the
# ordering to take effect; a stable sort keeps tied rows in their prior order.
data_selected2 = (
    feature_selection_lasso(processed_data, target_column)
    .sort_values(by=target_column, ascending=False, kind='stable')
    .reset_index(drop=True)
)
data_selected2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   MntWines               2234 non-null   float64
 1   MntFruits              2234 non-null   float64
 2   MntMeatProducts        2234 non-null   float64
 3   MntFishProducts        2234 non-null   float64
 4   MntSweetProducts       2234 non-null   float64
 5   MntGoldProds           2234 non-null   float64
 6   Response               2234 non-null   float64
 7   Age                    2234 non-null   float64
 8   Cmp_Attitude           2234 non-null   float64
 9   R                      2234 non-null   float64
 10  F                      2234 non-null   float64
 11  M                      2234 non-null   float64
 12  Income_Category_High   2234 non-null   float64
 13  Income_Category_Low    2234 non-null   float64
 14  Education_Basic        2234 non-null   float64
 15  Mari

In [14]:
# Summary statistics: the selected features were standardized by
# feature_selection_lasso, while the target column keeps its original scale.
data_selected2.describe()

Unnamed: 0,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,Response,Age,Cmp_Attitude,R,F,M,Income_Category_High,Income_Category_Low,Education_Basic,Marital_Status_Single,Marital_Status_YOLO,Total_purchase
count,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0
mean,9.223697000000001e-17,-4.6913630000000007e-17,1.9083510000000003e-17,2.9420410000000005e-17,-2.2264100000000002e-17,4.2937900000000003e-17,4.2937900000000003e-17,2.711449e-16,-4.850392e-17,0.0,-5.884083000000001e-17,0.0,1.129108e-16,3.1805850000000002e-18,2.5444680000000002e-17,5.725053000000001e-17,1.2722340000000001e-17,12.544315
std,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,7.197244
min,-0.904167,-0.6617849,-0.7441592,-0.6872581,-0.6562155,-0.8455772,-0.4192726,-2.316854,-0.5106098,-1.0,-0.9884286,-1.0,-7.813805,-0.02116195,-0.1573869,-0.5224313,-0.02993422,0.0
25%,-0.8328644,-0.6366142,-0.6726053,-0.6323672,-0.6320035,-0.6726956,-0.4192726,-0.6931512,-0.5106098,-1.0,-0.9884286,-1.0,0.1279786,-0.02116195,-0.1573869,-0.5224313,-0.02993422,6.0
50%,-0.385738,-0.4604191,-0.4445273,-0.4676945,-0.4625199,-0.3845596,-0.4192726,-0.09494488,-0.5106098,0.0,-0.9884286,0.0,0.1279786,-0.02116195,-0.1573869,-0.5224313,-0.02993422,12.0
75%,0.5954147,0.1688489,0.2922541,0.2275899,0.142779,0.2301304,-0.4192726,0.8450936,-0.5106098,1.0,1.011707,1.0,0.1279786,-0.02116195,-0.1573869,-0.5224313,-0.02993422,18.0
max,3.531445,4.347188,6.970244,4.051655,5.711528,6.108104,2.385083,2.468796,1.958443,1.0,1.011707,1.0,0.1279786,47.25463,6.353768,1.914127,33.40659,32.0


### Target column: 'Total_amount' (total amount spent)

In [15]:
target_column = 'Total_amount'
# sort_values returns a new frame, so its result has to be kept for the
# ordering to take effect; a stable sort keeps tied rows in their prior order.
data_selected3 = (
    feature_selection_lasso(processed_data, target_column)
    .sort_values(by=target_column, ascending=False, kind='stable')
    .reset_index(drop=True)
)
data_selected3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Recency                  2234 non-null   float64
 1   NumDealsPurchases        2234 non-null   float64
 2   NumWebPurchases          2234 non-null   float64
 3   NumCatalogPurchases      2234 non-null   float64
 4   NumStorePurchases        2234 non-null   float64
 5   NumWebVisitsMonth        2234 non-null   float64
 6   Complain                 2234 non-null   float64
 7   Response                 2234 non-null   float64
 8   Age                      2234 non-null   float64
 9   Cmp_Attitude             2234 non-null   float64
 10  M                        2234 non-null   float64
 11  Income_Category_High     2234 non-null   float64
 12  Income_Category_Low      2234 non-null   float64
 13  Education_Basic          2234 non-null   float64
 14  Education_Graduation    

In [16]:
# Summary statistics: the selected features were standardized by
# feature_selection_lasso, while the target column keeps its original scale.
data_selected3.describe()

Unnamed: 0,Recency,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response,Age,Cmp_Attitude,...,Income_Category_High,Income_Category_Low,Education_Basic,Education_Graduation,Education_Master,Marital_Status_Alone,Marital_Status_Married,Marital_Status_Together,Marital_Status_YOLO,Total_amount
count,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,...,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0
mean,3.8167020000000006e-17,6.997287000000001e-17,8.587580000000001e-17,-1.9083510000000003e-17,-1.196695e-16,7.633405000000001e-17,2.2264100000000002e-17,4.2937900000000003e-17,2.711449e-16,-4.850392e-17,...,1.129108e-16,3.1805850000000002e-18,2.5444680000000002e-17,-9.541756e-18,-3.498644e-17,-6.3611700000000005e-18,1.049593e-16,6.281656000000001e-17,1.2722340000000001e-17,605.712623
std,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,...,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,601.582708
min,-1.697005,-1.212761,-1.4728,-0.922995,-1.785309,-2.196857,-0.09504432,-0.4192726,-2.316854,-0.5106098,...,-7.813805,-0.02116195,-0.1573869,-1.00809,-0.445531,-0.03666999,-0.7933899,-0.5901014,-0.02993422,5.0
25%,-0.868136,-0.6903338,-0.7528363,-0.922995,-0.8618592,-0.9587694,-0.09504432,-0.4192726,-0.6931512,-0.5106098,...,0.1279786,-0.02116195,-0.1573869,-1.00809,-0.445531,-0.03666999,-0.7933899,-0.5901014,-0.02993422,69.0
50%,0.01253754,-0.1679064,-0.03287213,-0.2272059,-0.2462258,0.2793178,-0.09504432,-0.4192726,-0.09494488,-0.5106098,...,0.1279786,-0.02116195,-0.1573869,0.9919749,-0.445531,-0.03666999,-0.7933899,-0.5901014,-0.02993422,396.5
75%,0.8586749,0.354521,0.687092,0.4685831,0.6772243,0.6920135,-0.09504432,-0.4192726,0.8450936,-0.5106098,...,0.1279786,-0.02116195,-0.1573869,0.9919749,-0.445531,-0.03666999,1.260414,1.694624,-0.02993422,1044.75
max,1.72208,6.62365,8.246715,8.818051,2.216308,6.057058,10.52141,2.385083,2.468796,1.958443,...,0.1279786,47.25463,6.353768,0.9919749,2.244513,27.27025,1.260414,1.694624,33.40659,2525.0


### Target column: 'Cmp_Attitude' (whether the customer accepted the offer in at least one of campaigns 1–5)

In [17]:
target_column = 'Cmp_Attitude'
# sort_values returns a new frame, so its result has to be kept for the
# ordering to take effect; a stable sort keeps tied rows in their prior order.
data_selected4 = (
    feature_selection_lasso(processed_data, target_column)
    .sort_values(by=target_column, ascending=False, kind='stable')
    .reset_index(drop=True)
)
data_selected4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Recency                  2234 non-null   float64
 1   MntWines                 2234 non-null   float64
 2   MntFruits                2234 non-null   float64
 3   MntGoldProds             2234 non-null   float64
 4   NumDealsPurchases        2234 non-null   float64
 5   NumCatalogPurchases      2234 non-null   float64
 6   NumStorePurchases        2234 non-null   float64
 7   Complain                 2234 non-null   float64
 8   Response                 2234 non-null   float64
 9   Age                      2234 non-null   float64
 10  Education_Basic          2234 non-null   float64
 11  Education_Master         2234 non-null   float64
 12  Education_PhD            2234 non-null   float64
 13  Marital_Status_Alone     2234 non-null   float64
 14  Marital_Status_Divorced 

In [18]:
# Summary statistics: the selected features were standardized by
# feature_selection_lasso, while the target column keeps its original scale.
data_selected4.describe()

Unnamed: 0,Recency,MntWines,MntFruits,MntGoldProds,NumDealsPurchases,NumCatalogPurchases,NumStorePurchases,Complain,Response,Age,Education_Basic,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_YOLO,Cmp_Attitude
count,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0
mean,3.8167020000000006e-17,9.223697000000001e-17,-4.6913630000000007e-17,4.2937900000000003e-17,6.997287000000001e-17,-1.9083510000000003e-17,-1.196695e-16,2.2264100000000002e-17,4.2937900000000003e-17,2.711449e-16,2.5444680000000002e-17,-3.498644e-17,4.7708780000000007e-17,-6.3611700000000005e-18,-3.0215560000000005e-17,1.049593e-16,5.725053000000001e-17,1.2722340000000001e-17,0.206804
std,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,0.405104
min,-1.697005,-0.904167,-0.6617849,-0.8455772,-1.212761,-0.922995,-1.785309,-0.09504432,-0.4192726,-2.316854,-0.1573869,-0.445531,-0.525207,-0.03666999,-0.3395983,-0.7933899,-0.5224313,-0.02993422,0.0
25%,-0.868136,-0.8328644,-0.6366142,-0.6726956,-0.6903338,-0.922995,-0.8618592,-0.09504432,-0.4192726,-0.6931512,-0.1573869,-0.445531,-0.525207,-0.03666999,-0.3395983,-0.7933899,-0.5224313,-0.02993422,0.0
50%,0.01253754,-0.385738,-0.4604191,-0.3845596,-0.1679064,-0.2272059,-0.2462258,-0.09504432,-0.4192726,-0.09494488,-0.1573869,-0.445531,-0.525207,-0.03666999,-0.3395983,-0.7933899,-0.5224313,-0.02993422,0.0
75%,0.8586749,0.5954147,0.1688489,0.2301304,0.354521,0.4685831,0.6772243,-0.09504432,-0.4192726,0.8450936,-0.1573869,-0.445531,-0.525207,-0.03666999,-0.3395983,1.260414,-0.5224313,-0.02993422,0.0
max,1.72208,3.531445,4.347188,6.108104,6.62365,8.818051,2.216308,10.52141,2.385083,2.468796,6.353768,2.244513,1.904011,27.27025,2.944655,1.260414,1.914127,33.40659,1.0


### Target column: 'Recency' (Number of days since customer's last purchase)

In [19]:
target_column = 'Recency'
# sort_values returns a new frame, so its result has to be kept for the
# ordering to take effect; a stable sort keeps tied rows in their prior order.
data_selected5 = (
    feature_selection_lasso(processed_data, target_column)
    .sort_values(by=target_column, ascending=False, kind='stable')
    .reset_index(drop=True)
)
data_selected5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   MntMeatProducts       2234 non-null   float64
 1   MntGoldProds          2234 non-null   float64
 2   NumWebVisitsMonth     2234 non-null   float64
 3   Complain              2234 non-null   float64
 4   Response              2234 non-null   float64
 5   Age                   2234 non-null   float64
 6   Cmp_Attitude          2234 non-null   float64
 7   R                     2234 non-null   float64
 8   F                     2234 non-null   float64
 9   Income_Category_High  2234 non-null   float64
 10  Education_Graduation  2234 non-null   float64
 11  Education_Master      2234 non-null   float64
 12  Marital_Status_Alone  2234 non-null   float64
 13  Marital_Status_Widow  2234 non-null   float64
 14  Marital_Status_YOLO   2234 non-null   float64
 15  Recency              

In [20]:
# Summary statistics: the selected features were standardized by
# feature_selection_lasso, while the target column keeps its original scale.
data_selected5.describe()

Unnamed: 0,MntMeatProducts,MntGoldProds,NumWebVisitsMonth,Complain,Response,Age,Cmp_Attitude,R,F,Income_Category_High,Education_Graduation,Education_Master,Marital_Status_Alone,Marital_Status_Widow,Marital_Status_YOLO,Recency
count,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0
mean,1.9083510000000003e-17,4.2937900000000003e-17,7.633405000000001e-17,2.2264100000000002e-17,4.2937900000000003e-17,2.711449e-16,-4.850392e-17,0.0,-5.884083000000001e-17,1.129108e-16,-9.541756e-18,-3.498644e-17,-6.3611700000000005e-18,0.0,1.2722340000000001e-17,49.136974
std,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,1.000224,28.961593
min,-0.7441592,-0.8455772,-2.196857,-0.09504432,-0.4192726,-2.316854,-0.5106098,-1.0,-0.9884286,-7.813805,-1.00809,-0.445531,-0.03666999,-0.188938,-0.02993422,0.0
25%,-0.6726053,-0.6726956,-0.9587694,-0.09504432,-0.4192726,-0.6931512,-0.5106098,-1.0,-0.9884286,0.1279786,-1.00809,-0.445531,-0.03666999,-0.188938,-0.02993422,24.0
50%,-0.4445273,-0.3845596,0.2793178,-0.09504432,-0.4192726,-0.09494488,-0.5106098,0.0,-0.9884286,0.1279786,0.9919749,-0.445531,-0.03666999,-0.188938,-0.02993422,49.5
75%,0.2922541,0.2301304,0.6920135,-0.09504432,-0.4192726,0.8450936,-0.5106098,1.0,1.011707,0.1279786,0.9919749,-0.445531,-0.03666999,-0.188938,-0.02993422,74.0
max,6.970244,6.108104,6.057058,10.52141,2.385083,2.468796,1.958443,1.0,1.011707,0.1279786,0.9919749,2.244513,27.27025,5.29273,33.40659,99.0


# Export Preprocessed data

In [21]:
# Write the full preprocessed frame (not the Lasso-selected subsets) to a CSV
# in the current working directory, omitting the index column.
processed_data.to_csv('Final Preprocessed Data.csv', index=False)