In [1]:
import pandas as pd
from datetime import datetime

df = pd.read_csv("AWCustomers.csv")

df_selected = df[["BirthDate","Education","Occupation","Gender","MaritalStatus",
                  "HomeOwnerFlag","NumberCarsOwned","NumberChildrenAtHome",
                  "TotalChildren","YearlyIncome"]].copy()

df_selected["BirthDate"] = pd.to_datetime(df_selected["BirthDate"])
today = pd.to_datetime("today")
df_selected["Age"] = df_selected["BirthDate"].apply(lambda x: (today - x).days // 365)

df_selected = df_selected.drop(columns=["BirthDate"])
df_selected.head()


Unnamed: 0,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age
0,Bachelors,Clerical,M,M,1,3,0,1,81916,37
1,Partial College,Clerical,M,M,1,2,1,2,81076,53
2,Bachelors,Clerical,F,S,0,3,0,0,86387,39
3,Partial College,Skilled Manual,M,M,1,2,1,2,61481,47
4,Partial College,Skilled Manual,M,S,1,1,0,0,51804,50


In [2]:
df_selected.shape


(18361, 10)

In [3]:
data_types = {
    "Education": "Categorical - Nominal",
    "Occupation": "Categorical - Nominal",
    "Gender": "Categorical - Nominal",
    "MaritalStatus": "Categorical - Nominal",
    "HomeOwnerFlag": "Binary - Nominal",
    "NumberCarsOwned": "Discrete - Ratio",
    "NumberChildrenAtHome": "Discrete - Ratio",
    "TotalChildren": "Discrete - Ratio",
    "YearlyIncome": "Continuous - Ratio",
    "Age": "Continuous - Ratio"
}
data_types


{'Education': 'Categorical - Nominal',
 'Occupation': 'Categorical - Nominal',
 'Gender': 'Categorical - Nominal',
 'MaritalStatus': 'Categorical - Nominal',
 'HomeOwnerFlag': 'Binary - Nominal',
 'NumberCarsOwned': 'Discrete - Ratio',
 'NumberChildrenAtHome': 'Discrete - Ratio',
 'TotalChildren': 'Discrete - Ratio',
 'YearlyIncome': 'Continuous - Ratio',
 'Age': 'Continuous - Ratio'}

In [4]:
df_selected = df_selected.dropna()
df_selected.shape


(18361, 10)

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_selected[["Age","YearlyIncome"]] = scaler.fit_transform(df_selected[["Age","YearlyIncome"]])
df_selected.head()


Unnamed: 0,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age
0,Bachelors,Clerical,M,M,1,3,0,1,0.496842,0.183099
1,Partial College,Clerical,M,M,1,2,1,2,0.489453,0.408451
2,Bachelors,Clerical,F,S,0,3,0,0,0.536172,0.211268
3,Partial College,Skilled Manual,M,M,1,2,1,2,0.317083,0.323944
4,Partial College,Skilled Manual,M,S,1,1,0,0,0.231958,0.366197


In [6]:
from sklearn.preprocessing import KBinsDiscretizer

binning = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
df_selected["Age_binned"] = binning.fit_transform(df_selected[["Age"]])
df_selected[["Age","Age_binned"]].head()


Unnamed: 0,Age,Age_binned
0,0.183099,0.0
1,0.408451,2.0
2,0.211268,1.0
3,0.323944,1.0
4,0.366197,1.0


In [7]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()
df_selected["Income_std"] = standard_scaler.fit_transform(df_selected[["YearlyIncome"]])
df_selected[["YearlyIncome","Income_std"]].head()


Unnamed: 0,YearlyIncome,Income_std
0,0.496842,0.298555
1,0.489453,0.27118
2,0.536172,0.444261
3,0.317083,-0.367401
4,0.231958,-0.682765


In [8]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded = encoder.fit_transform(df_selected[["Education","Occupation","Gender","MaritalStatus"]])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(), index=df_selected.index)

df_transformed = pd.concat([df_selected, encoded_df], axis=1)
df_transformed.head()


Unnamed: 0,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age,...,Education_Graduate Degree,Education_High School,Education_Partial College,Education_Partial High School,Occupation_Management,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,Gender_M,MaritalStatus_S
0,Bachelors,Clerical,M,M,1,3,0,1,0.496842,0.183099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Partial College,Clerical,M,M,1,2,1,2,0.489453,0.408451,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Bachelors,Clerical,F,S,0,3,0,0,0.536172,0.211268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Partial College,Skilled Manual,M,M,1,2,1,2,0.317083,0.323944,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,Partial College,Skilled Manual,M,S,1,1,0,0,0.231958,0.366197,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard

# Keep only numeric columns
numeric_df = df_transformed.select_dtypes(include=[np.number])

row1, row2 = numeric_df.iloc[0], numeric_df.iloc[1]

def simple_matching(a, b):
    return np.sum(a == b) / len(a)

sm_similarity = simple_matching(row1, row2)
jaccard_sim = 1 - jaccard(row1, row2)
cos_sim = cosine_similarity([row1],[row2])[0][0]

(sm_similarity, jaccard_sim, cos_sim)


(np.float64(0.5555555555555556),
 np.float64(0.7),
 np.float64(0.7283407569360274))

In [10]:
from scipy.stats import pearsonr

education_map = {
    "Partial High School":1,
    "High School":2,
    "Partial College":3,
    "Bachelors":4,
    "Graduate Degree":5
}

df_corr = df_selected.copy()
df_corr["EducationNum"] = df["Education"].map(education_map)

corr, p_value = pearsonr(df_corr["EducationNum"], df_corr["YearlyIncome"])
(corr, p_value)


(np.float64(0.4676648355054895), np.float64(0.0))