In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

###  1. Generate two datasets, X (training set) and X1 (test set), each consisting of N = 1000 3-dimensional vectors that stem from three classes, ω1, ω2, and ω3, with prior-probabilities P(ω1)=P(ω2)=P(ω3)=1/3. The classes are modeled by Gaussian distributions with means m1 = [0, 0, 0]T , m2 = [1, 2, 2]T , and m3 = [3, 3, 4]T respectively; their covariance matrices are given

In [None]:
# (a) Use the Euclidean distance classifier to classify the points of X1.
# (b) Use the Mahalanobis distance classifier to classify the points of X1.
# (c) Use the Bayesian classifier to classify the points of X1.
# (d) For each class, compute the error probability and compare the results.
# (e) Experiment with the mean values (bringing them closer or taking them farther away)
# and the a prior-probabilities. Comment on the results.


In [9]:
m1 = np.array([0, 0, 0])
m2 = np.array([1, 2, 2])
m3 = np.array([3, 3, 4])


S1 = np.array([[0.8,0.2,0.1],
              [0.2,0.8,0.2],
              [0.1,0.2,0.8]])

S2 = np.array([[0.6,0.01,0.01],
              [0.01,0.8,0.01],
              [0.01,0.01,0.6]])

S3 = np.array([[0.6,0.1,0.1],
              [0.1,0.6,0.1],
              [0.1,0.1,0.6]])

In [10]:
N = 1000

prior_probs = [1/3, 1/3, 1/3]

X = np.vstack([
    np.random.multivariate_normal(m1,S1,N),
    np.random.multivariate_normal(m2,S2,N),
    np.random.multivariate_normal(m3,S3,N)
])

X1 = np.vstack([
    np.random.multivariate_normal(m1,S1,N),
    np.random.multivariate_normal(m2,S2,N),
    np.random.multivariate_normal(m3,S3,N)
])

In [31]:
def euclidean_distance(x, y):
    return np.linalg.norm(x - y)
def euclidean_classifier(x, means):
    distances = [euclidean_distance(x, m) for m in means]
    return np.argmin(distances)
means = [m1, m2, m3]
predicted_euclidean = [euclidean_classifier(test, means) for test in X1]


In [33]:
actual_labels = np.repeat(np.arange(3), N)
euclidean_error = np.mean(predicted_euclidean != actual_labels)
print("Euclidean Classifier   ", euclidean_error)

Euclidean Classifier    0.059333333333333335


In [34]:
#Use the Bayesian classifier to classify the points of X1.

In [35]:
def multivariate_normal_pdf(x, mean, cov):
    d = len(x)
    coefficient = 1 / (np.sqrt((2 * np.pi)**d * np.linalg.det(cov)))
    exponent = -0.5 * np.dot(np.dot((x - mean).T, np.linalg.inv(cov)), (x-mean))
    return coefficient * np.exp(exponent)

def bayesian_classifier(point, means, covariances, priors):
    likelihoods = [multivariate_normal_pdf(point, mean, cov) for mean, cov in zip(means, covariances)]
    posterior_probs = [likelihood * prior for likelihood, prior in zip(likelihoods, prior_probs)]
    return np.argmax(posterior_probs)

predicted_bayesian = [bayesian_classifier(test, [m1, m2, m3], [S1, S2, S3], prior_probs) for test in X1]

In [44]:
actual_labels = np.repeat(np.arange(3), N)
predicted_bayesian_error = np.mean( predicted_bayesian!= actual_labels)
print("Bayesian classifier   ",predicted_bayesian_error)

Bayesian classifier    0.05733333333333333


In [None]:
#(b) Use the Mahalanobis distance classifier to classify the points of X1.

In [38]:
def mahalanobis_distance(x, mean, cov):
    inv_cov = np.linalg.inv(cov)
    diff = x - mean
    mahalano = np.sqrt(np.dot(np.dot(diff.T, inv_cov), diff))
    return mahalano

def mahalanobis_classifier(x, means, covs):
    distances = [mahalanobis_distance(x, mean, cov) for mean, cov in zip(means, covariances)]
    return np.argmin(distances)
covariances = [S1, S2, S3]

predicted_mahalanobis = [mahalanobis_classifier(test,[m1, m2, m3], [S1, S2, S3]) for test in X1]

In [42]:
actual_labels = np.repeat(np.arange(3), N)
predicted_mahalanobis_error = np.mean( predicted_mahalanobis!= actual_labels)
print("Mahalanobis distance  ",predicted_mahalanobis_error)

Mahalanobis distance   0.059


In [None]:
 #For each class, compute the error probability and compare the results

In [46]:
print("Mahalanobis distance ",predicted_mahalanobis_error)
print("Bayesian classifier  ",predicted_bayesian_error)
print("Euclidean Classifier   ", euclidean_error)

Mahalanobis distance  0.059
Bayesian classifier   0.05733333333333333
Euclidean Classifier    0.059333333333333335


In [None]:
# Experiment with the mean values (bringing them closer or taking them farther away) and the a prior-probabilities. Comment on the results.

In [48]:
m1 = np.array([1, 1, 1])
m2 = np.array([6, 6, 2])
m3 = np.array([6, 6, 4])


S11 = np.array([[0.8,0.2,0.1],
              [0.2,0.8,0.2],
              [0.1,0.2,0.8]])

S12 = np.array([[0.6,0.01,0.01],
              [0.01,0.8,0.01],
              [0.01,0.01,0.6]])

S13 = np.array([[0.6,0.1,0.1],
              [0.1,0.6,0.1],
              [0.1,0.1,0.6]])

In [52]:
N = 1000

prior_probs = [1/3, 1/3, 1/3]

X_new = np.vstack([
    np.random.multivariate_normal(m1,S1,N),
    np.random.multivariate_normal(m2,S2,N),
    np.random.multivariate_normal(m3,S3,N)
])

X1_new = np.vstack([
    np.random.multivariate_normal(m1,S1,N),
    np.random.multivariate_normal(m2,S2,N),
    np.random.multivariate_normal(m3,S3,N)
])

In [55]:

predicted_bayesian = [bayesian_classifier(test, [m1, m2, m3], [S1, S2, S3], prior_probs) for test in X1_new]
predicted_mahalanobis = [mahalanobis_classifier(test,[m1, m2, m3], [S1, S2, S3]) for test in X1_new]
predicted_euclidean = [euclidean_classifier(test, means) for test in X1]

In [56]:
actual_labels = np.repeat(np.arange(3), N)

euclidean_error = np.mean(predicted_euclidean != actual_labels)

mahalanobis_error = np.mean(predicted_mahalanobis != actual_labels)

bayesian_error = np.mean(predicted_bayesian != actual_labels)

print("Error Occured due to the, ")
print("Euclidean Classifier is:   ", euclidean_error)
print("Mahalanobis Classifier is: ", mahalanobis_error)
print("Bayesian Classifier is:    ", bayesian_error)

Error Occured due to the, 
Euclidean Classifier is:    0.5143333333333333
Mahalanobis Classifier is:  0.06533333333333333
Bayesian Classifier is:     0.065


### Considering the California Housing dataset, design a linear regression model considering each feature with non zero values, and report the best feature and model accordng to the R2 metric. (Evaluate your linear regression model using sum of squares due to regression (SSR), sum of squares error (SSE), sum of squares total (SST) and coefficient of determination R2 metric and adjusted R2 metric.)


In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

caldata = datasets.fetch_california_housing()
X = caldata.data
y = caldata.target
feature_names = caldata.feature_names

In [67]:
print("Data shape:", X.shape)
print("Target shape:", y.shape)
print("Feature names:", feature_names)

Data shape: (20640, 8)
Target shape: (20640,)
Feature names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [59]:
results = {}

for i, feature_name in enumerate(feature_names):

    X_single_feature = X[:, [i]]
    X_train, X_test, y_train, y_test = train_test_split(X_single_feature, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    y_mean = np.mean(y_test)
    SST = np.sum((y_test - y_mean) ** 2)
    SSR = np.sum((y_pred - y_mean) ** 2)
    SSE = np.sum((y_test - y_pred) ** 2)

    R2 = SSR / SST
    n = len(y_test)
    p = 1  
    adjusted_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)

    results[feature_name] = {
        'R2': R2,
        'Adjusted R2': adjusted_R2,
        'SST': SST,
        'SSR': SSR,
        'SSE': SSE
    }


In [64]:
results_df = pd.DataFrame(results).T
print(results_df)


best_feature = results_df['R2'].idxmax()
best_results = results_df.loc[best_feature]

print("\nBest Feature:", best_feature)
print("R^2:", best_results['R2'])
print("Adjusted R^2:", best_results['Adjusted R2'])
print("SST:", best_results['SST'])
print("SSR:", best_results['SSR'])
print("SSE:", best_results['SSE'])

                  R2  Adjusted R2          SST          SSR          SSE
MedInc      0.474947     0.474820  5409.368262  2569.163353  2927.229928
HouseAge    0.011083     0.010843  5409.368262    59.949346  5341.474007
AveRooms    0.035268     0.035034  5409.368262   190.776231  5334.744201
AveBedrms   0.005555     0.005314  5409.368262    30.049652  5411.343822
Population  0.000886     0.000644  5409.368262     4.792260  5408.865012
AveOccup    0.000238    -0.000004  5409.368262     1.287342  5406.214294
Latitude    0.021122     0.020885  5409.368262   114.258037  5290.818102
Longitude   0.002344     0.002103  5409.368262    12.681961  5399.860942

Best Feature: MedInc
R^2: 0.4749470231159636
Adjusted R^2: 0.47481976839543905
SST: 5409.368262178434
SSR: 2569.1633530596205
SSE: 2927.229928184818
