In [11]:
import pandas as pd

data_sales = pd.read_csv("C:/Users/kanav/OneDrive/Desktop/ML/AWCustomers.csv")
# print(data_sales)

selected_columns = [
    'CustomerID', 'City', 'StateProvinceName', 'CountryRegionName','BirthDate' ,'Education', 'Occupation',
    'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned',
    'TotalChildren', 'YearlyIncome'
]
df = data_sales[selected_columns]
print(df)
data_types={
    'CustomerID':'Discrete',
    'City':'Nominal',
    'StateProvinceName':'Nominal',
    'CountryRegionName':'Nominal',
    'BirthDate':'Discrete',
    'Education':'Ordinal',
    'Occupation':'Nominal',
    'Gender':'Nominal',
    'MaritalStatus':'Nominal',
    'HomeOwnerFlag':'Discrete',
    'NumberCarsOwned':'Discrete Ratio',
    'TotalChildren':'Discrete Ratio',
    'YearlyIncome':'Continous'
}
print(data_types)

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import datetime as dt

df = df.ffill()

today = pd.to_datetime("today")
df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors='coerce')
df['Age'] = (today.year - df['BirthDate'].dt.year) - ((today.month < df['BirthDate'].dt.month) | ((today.month == df['BirthDate'].dt.month) & (today.day < df['BirthDate'].dt.day)))
df['Age'] = df['Age'].fillna(df['Age'].median()).astype(int)

num_cols = ['Age','YearlyIncome','NumberCarsOwned','TotalChildren']
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

scaler = MinMaxScaler()
if 'Age' in df.columns:
    df['Age_norm'] = scaler.fit_transform(df[['Age']])
if 'YearlyIncome' in df.columns:
    df['YearlyIncome_norm'] = scaler.fit_transform(df[['YearlyIncome']])

df['Age_disc'] = pd.cut(df['Age'], bins=[0,25,40,60,120], labels=['Youth','YoungAdult','Adult','Senior'])

std = StandardScaler()
if 'Age' in df.columns:
    df['Age_std'] = std.fit_transform(df[['Age']])
if 'YearlyIncome' in df.columns:
    df['YearlyIncome_std'] = std.fit_transform(df[['YearlyIncome']])

cat_cols = ['City','StateProvinceName','CountryRegionName','Education','Occupation','Gender','MaritalStatus','HomeOwnerFlag','Age_disc']
existing_cat_cols = [c for c in cat_cols if c in df.columns]
if existing_cat_cols:
    df = pd.get_dummies(df, columns=existing_cat_cols)

binary_cols = [c for c in df.columns if set(df[c].dropna().unique()).issubset({0,1})]
numeric_cols = [c for c in df.columns if np.issubdtype(df[c].dtype, np.number) and c not in binary_cols]

if len(binary_cols) >= 1:
    b1 = df.iloc[0][binary_cols].astype(int)
    b2 = df.iloc[1][binary_cols].astype(int)
    smc = np.sum(b1.values == b2.values) / len(b1)
    inter = np.sum((b1.values & b2.values) == 1)
    union = np.sum(((b1.values | b2.values) == 1))
    jaccard = inter / union if union != 0 else 0.0
else:
    smc = None
    jaccard = None

if len(numeric_cols) >= 1:
    v1 = df.iloc[0][numeric_cols].fillna(0).astype(float).values
    v2 = df.iloc[1][numeric_cols].fillna(0).astype(float).values
    cos_sim = cosine_similarity([v1],[v2])[0][0]
else:
    all_cols = [c for c in df.columns if np.issubdtype(df[c].dtype, np.number)]
    v1 = df.iloc[0][all_cols].fillna(0).astype(float).values
    v2 = df.iloc[1][all_cols].fillna(0).astype(float).values
    cos_sim = cosine_similarity([v1],[v2])[0][0] if len(all_cols)>0 else None

if 'CommuteDistance' in data_sales.columns:
    cd = data_sales['CommuteDistance'].copy()
    yi = pd.to_numeric(data_sales['YearlyIncome'], errors='coerce')
    codes, uniques = pd.factorize(cd)
    cd_series = pd.Series(codes, index=data_sales.index)
    mask = (~pd.isna(yi)) & (cd_series != -1)
    if mask.sum() > 0:
        corr = cd_series[mask].corr(yi[mask])
    else:
        corr = None
else:
    if 'NumberCarsOwned' in df.columns and 'YearlyIncome' in df.columns:
        corr = df['NumberCarsOwned'].corr(df['YearlyIncome'])
    else:
        corr = None

print("SMC:", smc)
print("Jaccard:", jaccard)
print("Cosine:", cos_sim)
print("Correlation:", corr)


       CustomerID            City    StateProvinceName CountryRegionName  \
0           21173      Wollongong      New South Wales         Australia   
1           13249         Shawnee     British Columbia            Canada   
2           29350     West Covina           California     United States   
3           13503       Liverpool              England    United Kingdom   
4           22803           Werne  Nordrhein-Westfalen           Germany   
...           ...             ...                  ...               ...   
18356       25414        Coronado           California     United States   
18357       11459  Port Macquarie      New South Wales         Australia   
18358       12160       Beaverton               Oregon     United States   
18359       14353       Vancouver     British Columbia            Canada   
18360       16676       Grossmont           California     United States   

        BirthDate        Education      Occupation Gender MaritalStatus  \
0      1987-

  v1 = df.iloc[0][numeric_cols].fillna(0).astype(float).values
  v2 = df.iloc[1][numeric_cols].fillna(0).astype(float).values
