<a href="https://colab.research.google.com/github/MayankSangwan07/ML_ASSIGNMENTS/blob/main/ML_ASSIGNMENT_02_SOL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard, hamming

# 1. Load Data
customers = pd.read_csv('AWCustomers.csv')
sales = pd.read_csv('AWSales.csv')

# Merge datasets
data = pd.merge(customers, sales, on='CustomerID')

# --- PART I ---
# (a) & (b) Feature Selection
# Selecting relevant features based on analysis
selected_columns = [
    'YearlyIncome', 'BirthDate', 'MaritalStatus', 'Gender',
    'Education', 'Occupation', 'HomeOwnerFlag', 'NumberCarsOwned',
    'TotalChildren', 'CountryRegionName', 'BikeBuyer'
]
df = data[selected_columns].copy()

# Calculate Age from BirthDate (assuming current year is 2017 based on dataset context)
df['BirthDate'] = pd.to_datetime(df['BirthDate'])
df['Age'] = 2017 - df['BirthDate'].dt.year
df = df.drop('BirthDate', axis=1)

# --- PART II ---
# (a) Handling Null Values
# (Checking for nulls - simply dropping rows with nulls for this example if any exist,
# though this specific dataset is usually clean)
df = df.dropna()

# (b) Normalization (Min-Max Scaling on YearlyIncome)
scaler = MinMaxScaler()
df['YearlyIncome_Norm'] = scaler.fit_transform(df[['YearlyIncome']])

# (c) Discretization (Binning Age)
# Binning Age into 4 groups
bins = [0, 25, 45, 65, 120]
labels = ['Youth', 'YoungAdult', 'Adult', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

# (d) Standardization/Normalization
# (Applying Z-score standardization to NumberCarsOwned as an example of numeric standardization)
df['Cars_Standardized'] = (df['NumberCarsOwned'] - df['NumberCarsOwned'].mean()) / df['NumberCarsOwned'].std()

# (e) Binarization (One Hot Encoding)
categorical_cols = ['Education', 'Occupation', 'Gender', 'MaritalStatus', 'CountryRegionName', 'AgeGroup']
df_encoded = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)

# Final Input Matrix for Similarity (Dropping original non-transformed cols for calculation)
# We keep BikeBuyer as target, but for similarity between customers, we focus on features.
feature_cols = [col for col in df_encoded.columns if col not in ['YearlyIncome', 'NumberCarsOwned', 'TotalChildren', 'BikeBuyer', 'Age']]
# Including scaled/standardized numeric columns
feature_cols += ['YearlyIncome_Norm', 'Cars_Standardized', 'TotalChildren']
final_df = df_encoded[feature_cols]

# --- PART III ---
# (a) Similarity Analysis between first two objects (row 0 and row 1)
obj1 = final_df.iloc[0].values
obj2 = final_df.iloc[1].values

# 1. Jaccard Similarity (1 - Jaccard Distance)
# Jaccard is typically for binary, so we treat non-zero as 1 for this calculation context or use sklearn's implementation
# Manual calculation for clarity: Intersection / Union
intersection = np.sum(np.minimum(obj1, obj2))
union = np.sum(np.maximum(obj1, obj2))
jaccard_sim = intersection / union

# 2. Cosine Similarity
cos_sim = cosine_similarity([obj1], [obj2])[0][0]

# 3. Simple Matching Coefficient (SMC)
# Matches / Total Attributes
matches = np.sum(obj1 == obj2)
total_attributes = len(obj1)
smc = matches / total_attributes

# (b) Correlation
# Note: Commute Distance is missing in the uploaded file snippet.
# Calculating correlation between YearlyIncome and TotalChildren instead.
correlation = df['YearlyIncome'].corr(df['TotalChildren'])

# --- OUTPUT GENERATION ---
print("--- PART I: Data Types ---")
print("YearlyIncome: Continuous (Ratio)")
print("Age: Continuous (Ratio)")
print("MaritalStatus, Gender, Education: Nominal")
print("NumberCarsOwned: Discrete")
print("\n--- PART III: Similarity Results (First 2 Customers) ---")
print(f"Jaccard Similarity: {jaccard_sim:.4f}")
print(f"Cosine Similarity:  {cos_sim:.4f}")
print(f"Simple Matching:    {smc:.4f}")
print("\n--- PART III: Correlation ---")
print(f"Correlation (YearlyIncome vs TotalChildren): {correlation:.4f}")
print("(Note: CommuteDistance was missing from source file, substituted TotalChildren for demonstration)")

--- PART I: Data Types ---
YearlyIncome: Continuous (Ratio)
Age: Continuous (Ratio)
MaritalStatus, Gender, Education: Nominal
NumberCarsOwned: Discrete

--- PART III: Similarity Results (First 2 Customers) ---
Jaccard Similarity: 0.5435
Cosine Similarity:  0.7436
Simple Matching:    0.7000

--- PART III: Correlation ---
Correlation (YearlyIncome vs TotalChildren): 0.0220
(Note: CommuteDistance was missing from source file, substituted TotalChildren for demonstration)
