In [131]:
import pandas as pd
import numpy as np

## Exploration

In [132]:
# Read the customer CSV, then report its dimensions followed by the first five rows.
df = pd.read_csv('../data/Small.csv')
print(df.shape, df.head(), sep='\n')

(1623, 11)
       ID  Gender Ever_Married  Age Graduated     Profession  Work_Experience  \
0  462809    Male           No   22        No     Healthcare              1.0   
1  462643  Female          Yes   38       Yes       Engineer              NaN   
2  466315  Female          Yes   67       Yes       Engineer              1.0   
3  461735    Male          Yes   67       Yes         Lawyer              0.0   
4  462669  Female          Yes   40       Yes  Entertainment              NaN   

  Spending_Score  Family_Size  Var_1 Segmentation  
0            Low          4.0  Cat_4            D  
1        Average          3.0  Cat_4            A  
2            Low          1.0  Cat_6            B  
3           High          2.0  Cat_6            B  
4           High          6.0  Cat_6            A  


In [133]:
# Number of missing entries in each column (summing the null mask down the rows).
df.isna().sum(axis=0)

ID                   0
Gender               0
Ever_Married        29
Age                  0
Graduated           19
Profession          32
Work_Experience    185
Spending_Score       0
Family_Size         79
Var_1               17
Segmentation         0
dtype: int64

In [134]:
# Number of distinct values in each column; missing values are not counted.
df.nunique(axis=0, dropna=True)

ID                 1558
Gender                2
Ever_Married          2
Age                  67
Graduated             2
Profession            9
Work_Experience      15
Spending_Score        3
Family_Size           9
Var_1                 7
Segmentation          4
dtype: int64

In [135]:
# Frequency of each profession, most common first.
df['Profession'].value_counts(ascending=False)

Profession
Artist           481
Healthcare       266
Entertainment    207
Engineer         151
Doctor           134
Lawyer           115
Executive        110
Marketing         84
Homemaker         43
Name: count, dtype: int64

In [136]:
# Frequency of each Var_1 category, most common first.
df['Var_1'].value_counts(ascending=False)

Var_1
Cat_6    978
Cat_4    264
Cat_3    192
Cat_2     77
Cat_7     46
Cat_1     33
Cat_5     16
Name: count, dtype: int64

In [137]:
# Frequency of each Segmentation value, most common first.
df['Segmentation'].value_counts(ascending=False)

Segmentation
D    486
A    475
B    340
C    322
Name: count, dtype: int64

## Preprocessing

In [138]:
### Drop rows with missing values ###

# Keep only the records in which every column is populated, then show the new size.
df = df.dropna(axis='index', how='any')
df.shape

(1303, 11)

In [139]:
### Drop ID ###

# NOTE(review): the exploration cells report 1558 distinct IDs across 1623 rows,
# so some IDs repeat -- confirm whether those rows are true duplicates before the
# identifier is discarded here.
# errors='ignore' keeps the cell re-runnable: once ID is gone, a second execution
# is a no-op instead of a KeyError.
df = df.drop(columns=['ID'], errors='ignore')
df.columns

Index(['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation'],
      dtype='object')

In [140]:
### Group numerical variables based on quartiles ###

# Function to categorize based on quartiles
def categorize_quartiles(series):
    """Bin a numeric Series into the ordered labels 'Q1'..'Q4' by its own quartiles.

    The 25th, 50th and 75th percentiles of ``series`` are used as right-inclusive
    cut points: values up to the 25th percentile become 'Q1', and so on, with
    everything above the 75th percentile labelled 'Q4'. Missing values stay
    missing.

    When two neighbouring cut points coincide (common for small-integer columns
    where many rows share one value) the interval between them is empty.
    ``pd.cut`` rejects repeated edges with a ValueError, so such empty intervals
    are dropped before cutting; their labels simply never occur in the result.

    Parameters
    ----------
    series : pd.Series
        Numeric values to categorise.

    Returns
    -------
    pd.Series
        Ordered categorical Series on the same index, whose categories are
        always ['Q1', 'Q2', 'Q3', 'Q4'].
    """
    labels = ['Q1', 'Q2', 'Q3', 'Q4']
    quartiles = series.quantile([0.25, 0.5, 0.75])
    upper_edges = [quartiles[0.25], quartiles[0.5], quartiles[0.75], np.inf]

    # Walk the intervals from the bottom, keeping only those whose upper edge
    # lies strictly above the previous kept edge (i.e. the non-empty ones).
    kept_edges = [-np.inf]
    kept_labels = []
    for upper, label in zip(upper_edges, labels):
        if upper > kept_edges[-1]:
            kept_edges.append(upper)
            kept_labels.append(label)

    binned = pd.cut(series, bins=kept_edges, labels=kept_labels)
    # Restore the full label set so every binned column shares the same dtype,
    # even when an empty quartile was dropped above.
    return binned.cat.set_categories(labels, ordered=True)

# Bin each numeric column into quartile labels. Columns that are no longer numeric
# (i.e. already binned by a previous run of this cell) are skipped, so re-running
# the cell does not fail when it meets the categorical columns.
quartile_columns = ['Age', 'Work_Experience', 'Family_Size']
for column in quartile_columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = categorize_quartiles(df[column])

print(df[quartile_columns].nunique())

Age                4
Work_Experience    4
Family_Size        4
dtype: int64


## Check the final DataFrame

In [141]:
# Final sanity check: preview the first five preprocessed rows.
print(df.head(n=5))

   Gender Ever_Married Age Graduated  Profession Work_Experience  \
0    Male           No  Q1        No  Healthcare              Q2   
2  Female          Yes  Q4       Yes    Engineer              Q2   
3    Male          Yes  Q4       Yes      Lawyer              Q1   
5    Male          Yes  Q4        No      Artist              Q1   
6    Male          Yes  Q4        No      Artist              Q1   

  Spending_Score Family_Size  Var_1 Segmentation  
0            Low          Q3  Cat_4            D  
2            Low          Q1  Cat_6            B  
3           High          Q1  Cat_6            B  
5        Average          Q1  Cat_6            C  
6            Low          Q1  Cat_6            A  
