# Adventure Works Customer Data Analysis

This notebook solves the assignment step by step (Part I, II, III).

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr

## Part I: Feature Selection, Cleaning, Preprocessing

In [6]:
# Load dataset (update path with actual csv file downloaded)
df = pd.read_csv('C:/Users/aniru/Downloads/AWCustomers.csv')
df.head()

PermissionError: [Errno 13] Permission denied: 'C:/Users/aniru/Downloads/AWCustomers.csv'

In [None]:
# Select relevant features for predicting Bike Buyer
selected_features = ['Age','Gender','MaritalStatus','Income','CommuteDistance','Occupation',
                     'NumberCarsOwned','NumberChildrenAtHome','Region','TotalChildren','HouseOwnerFlag','BikeBuyer']
df_selected = df[selected_features]
df_selected.head()

In [None]:
# Check data types
df_selected.dtypes

## Part II: Data Preprocessing and Transformation

In [None]:
# Handle missing values
df_selected = df_selected.dropna()
df_selected.isnull().sum()

In [None]:
# Normalization example
scaler = MinMaxScaler()
df_selected['Income_norm'] = scaler.fit_transform(df_selected[['Income']])
df_selected['Age_norm'] = scaler.fit_transform(df_selected[['Age']])
df_selected.head()

In [None]:
# Discretization example (bin Age)
kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
df_selected['Age_binned'] = kbins.fit_transform(df_selected[['Age']])
df_selected.head()

In [None]:
# Standardization example
std_scaler = StandardScaler()
df_selected['Income_std'] = std_scaler.fit_transform(df_selected[['Income']])
df_selected.head()

In [None]:
# One Hot Encoding categorical features
encoded_df = pd.get_dummies(df_selected, columns=['Gender','MaritalStatus','Occupation','Region','CommuteDistance'])
encoded_df.head()

## Part III: Proximity / Correlation Analysis

In [None]:
# Select two rows for similarity
x = encoded_df.iloc[0].values.reshape(1,-1)
y = encoded_df.iloc[1].values.reshape(1,-1)

# Cosine similarity
cos_sim = cosine_similarity(x,y)

# Jaccard similarity on binary attributes
bin_cols = ['HouseOwnerFlag','BikeBuyer']
jaccard_sim = 1 - jaccard(encoded_df[bin_cols].iloc[0], encoded_df[bin_cols].iloc[1])

cos_sim, jaccard_sim

In [None]:
# Correlation between CommuteDistance and Income
df_selected['CommuteDistance_codes'] = df_selected['CommuteDistance'].astype('category').cat.codes
corr, pval = pearsonr(df_selected['CommuteDistance_codes'], df_selected['Income'])
corr