In [10]:
import pandas as pd;
df=pd.read_csv("AWCustomers.csv")
df.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,LastUpdated
0,21173,,Chad,C,Yuan,,7090 C. Mount Hood,,Wollongong,New South Wales,...,Bachelors,Clerical,M,M,1,3,0,1,81916,2017-03-06
1,13249,,Ryan,,Perry,,3651 Willow Lake Rd,,Shawnee,British Columbia,...,Partial College,Clerical,M,M,1,2,1,2,81076,2017-03-06
2,29350,,Julia,,Thompson,,1774 Tice Valley Blvd.,,West Covina,California,...,Bachelors,Clerical,F,S,0,3,0,0,86387,2017-03-06
3,13503,,Theodore,,Gomez,,2103 Baldwin Dr,,Liverpool,England,...,Partial College,Skilled Manual,M,M,1,2,1,2,61481,2017-03-06
4,22803,,Marshall,J,Shan,,Am Gallberg 234,,Werne,Nordrhein-Westfalen,...,Partial College,Skilled Manual,M,S,1,1,0,0,51804,2017-03-06


In [11]:
newdf=df.iloc[:,[0,2,11,13,14,15,16,17,18,19,20,21,22]]
newdf.head()

Unnamed: 0,CustomerID,FirstName,PostalCode,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,21173,Chad,2500,1987-11-13,Bachelors,Clerical,M,M,1,3,0,1,81916
1,13249,Ryan,V9B 2C3,1972-07-21,Partial College,Clerical,M,M,1,2,1,2,81076
2,29350,Julia,91791,1985-11-09,Bachelors,Clerical,F,S,0,3,0,0,86387
3,13503,Theodore,L4 4HB,1977-10-18,Partial College,Skilled Manual,M,M,1,2,1,2,61481
4,22803,Marshall,59368,1975-02-05,Partial College,Skilled Manual,M,S,1,1,0,0,51804


In [12]:
types = {
    "CustomerID": "Discrete and Nominal",
    "FirstName": "Discrete and Nominal",
    "PostalCode": "Discrete and Nominal",
    "BirthDate": "Continuous and Interval",
    "Education": "Discrete and Ordinal",
    "Occupation": "Discrete and Nominal",
    "Gender": "Discrete and Nominal",
    "MaritalStatus": "Discrete and Nominal",
    "HomeOwnerFlag": "Discrete and Nominal",
    "NumberCarsOwned": "Discrete and Ratio",
    "NumberChildrenAtHome": "Discrete and Ratio",
    "TotalChildren": "Discrete and Ratio",
    "YearlyIncome": "Continuous and Ratio"
}

In [13]:
newdf.isnull().sum()

CustomerID              0
FirstName               0
PostalCode              0
BirthDate               0
Education               0
Occupation              0
Gender                  0
MaritalStatus           0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
dtype: int64

In [14]:
# Handling Nulls
newdf = newdf.copy()

for col in newdf.columns:
    if newdf[col].dtype in ['float64', 'int64']:
        newdf[col] = newdf[col].fillna(newdf[col].median())
    else:
        newdf[col] = newdf[col].fillna(newdf[col].mode()[0])

#Normalization

from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
numeric_cols = newdf.select_dtypes(include=[np.number]).columns

newdf[numeric_cols] = scaler.fit_transform(newdf[numeric_cols])


#Discretization(Binning)
newdf['Income_bin'] = pd.cut(newdf['YearlyIncome'], bins=3, labels=['Low','Medium','High'])



#Standardization
from sklearn.preprocessing import StandardScaler

scaler_std = StandardScaler()
newdf[numeric_cols] = scaler_std.fit_transform(newdf[numeric_cols])

#Binarization
newdf = pd.get_dummies(newdf, drop_first=True)
print(newdf)

       CustomerID  HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  \
0        0.174472       0.798603         1.892524             -0.594371   
1       -1.310484       0.798603         0.798389              1.163279   
2        1.706839      -1.252187         1.892524             -0.594371   
3       -1.262884       0.798603         0.798389              1.163279   
4        0.479933       0.798603        -0.295746             -0.594371   
...           ...            ...              ...                   ...   
18356    0.969234      -1.252187        -0.295746             -0.594371   
18357   -1.645929      -1.252187         0.798389             -0.594371   
18358   -1.514562      -1.252187         0.798389             -0.594371   
18359   -1.103595      -1.252187        -1.389881             -0.594371   
18360   -0.668265       0.798603         0.798389             -0.594371   

       TotalChildren  YearlyIncome  FirstName_Abby  FirstName_Abhijit  \
0           0.161342      

In [15]:
#Calculation proximity
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard

obj1 = newdf.iloc[0].values.reshape(1,-1)
obj2 = newdf.iloc[1].values.reshape(1,-1)

# Cosine Similarity
cos_sim = cosine_similarity(obj1, obj2)[0][0]

# Jaccard Similarity (binary columns only)
jac_sim = 1 - jaccard(newdf.iloc[0].values, newdf.iloc[1].values)

# Simple Matching Coefficient (SMC)
smc = (newdf.iloc[0] == newdf.iloc[1]).sum() / len(newdf.columns)

print("Cosine:", cos_sim)
print("Jaccard:", jac_sim)
print("SMC:", smc)


#Correlation analysis - numeric
corr = newdf['TotalChildren'].corr(newdf['YearlyIncome'])
print("Correlation:", corr)

Cosine: 0.32559043220453193
Jaccard: 0.5333333333333333
SMC: 0.9987010175362633
Correlation: 0.022013822892024196
