# Clustering and Fitting on Mall Dataset
### Muhammad Hamza
### 23068870
### Data Handling and Visualisation

### Importing Necessary Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

## Loading Dataset

In [None]:
# Read the customer records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')

# Preview the top five records as a quick sanity check of the load
df.head(n=5)

### Data Cleaning

In [None]:
# --- Duplicates: count fully identical rows and drop them if any exist ---
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    df = df.drop_duplicates()
    print("Duplicates removed. Remaining rows:", len(df))

# --- Missing values: report the per-column count, then drop incomplete rows ---
missing_values = df.isna().sum()
print("\nMissing Values Count:\n", missing_values)
if missing_values.any():
    df = df.dropna()
    print("Missing values removed. Remaining rows:", len(df))

# --- Outliers: apply the 1.5 * IQR fence rule to every numeric column ---
# Columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the previous columns' filters.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

for col in numerical_columns:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # True for rows lying inside the (inclusive) fences
    in_range = df[col].between(lower_bound, upper_bound)
    outliers = (~in_range).sum()
    print(f"Outliers detected in {col}: {outliers}")
    df = df[in_range]

print("\nData after cleaning:")
print(df.describe())


## Exploratory Data Analysis

In [None]:
# Display basic statistics.
# A notebook cell only renders the value of its *last* expression, so the
# summaries are printed explicitly; as bare expressions in the middle of the
# cell their results would be computed and silently discarded.
print(df.describe())

# Check for missing values (per-column count)
print(df.isnull().sum())

# Distribution of Gender
plt.figure(figsize=(8, 6))
sns.countplot(x='Gender', data=df)
plt.title('Gender Distribution')
plt.show()

### Histogram

In [None]:
# Histogram of Spending Score: 20 bins with a kernel-density overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='Spending Score (1-100)', bins=20, kde=True, ax=ax)
ax.set(
    title='Distribution of Spending Score',
    xlabel='Spending Score',
    ylabel='Frequency',
)
plt.show()

### Scatter plot: Annual Income vs Spending Score

In [None]:
# Annual income against spending score, one point per row, coloured by gender
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Gender',
    ax=ax,
)
ax.set_title('Annual Income vs Spending Score')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
plt.show()

### Correlation Matrix

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the 'Gender' column to integer codes so it can take part in the
# correlation matrix. LabelEncoder numbers the classes in sorted order, so
# (assuming the labels are 'Female' and 'Male') Female -> 0 and Male -> 1.
# NOTE: the column is overwritten in place, so any later plot of 'Gender'
# shows these codes rather than the original labels.
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

# Print the mapping actually learned, so the codes can be read back reliably
print("Gender encoding:", {str(label): code for code, label in enumerate(le.classes_)})

# Compute the correlation matrix and show it as an annotated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

### Box Plot

In [None]:
# Spread of spending score within each gender group, shown as box plots
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Gender', y='Spending Score (1-100)', ax=ax)
ax.set_title('Spending Score by Gender')
plt.show()

# K-MEANS Clustering

In [None]:
# -----------------------------------------------
# KMeans Clustering
# -----------------------------------------------

# Select features for clustering
X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the data so that no feature dominates the Euclidean distances
# purely because of its scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Find the optimal number of clusters using the Elbow Method.
# n_init is passed explicitly: its default changed between scikit-learn
# releases (10 -> 'auto'), so leaving it out emits a FutureWarning on some
# versions and yields a different number of restarts -- and potentially
# different clusters -- on others.
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Elbow Method for Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Using 5 clusters (as seen from the Elbow plot)
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Scatter plot: Clusters based on Annual Income vs Spending Score.
# The clustering also used Age, so this is a 2-D view of 3-D clusters.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', hue='Cluster', data=df, palette='tab10')
plt.title('Clustering of Customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()

# Linear Regression (Fitting)

In [None]:
# -----------------------------------------------
# Linear Regression (Fitting)
# -----------------------------------------------
from sklearn.metrics import mean_squared_error, r2_score

# Single predictor (annual income) and the target (spending score)
X_reg = df[['Annual Income (k$)']]
y_reg = df['Spending Score (1-100)']

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training rows, then predict the held-out rows
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Held-out observations (points) against the fitted line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', label='Predicted')
ax.set_title('Linear Regression: Annual Income vs Spending Score')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.legend()
plt.show()

# Score the predictions on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

### Additional Plots

In [None]:
# Box plot of spending score for each gender group
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Gender', y='Spending Score (1-100)', ax=ax)
ax.set_title('Spending Score by Gender')
plt.show()

# Pie chart of how many rows fall into each gender group
gender_counts = df['Gender'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
gender_counts.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    colors=['skyblue', 'salmon'],
    startangle=90,
    textprops={'fontsize': 12},
)
ax.set_title('Gender Distribution')
ax.set_ylabel('')  # suppress the default y-axis label on the pie
plt.show()