<a href="https://colab.research.google.com/github/MamidalaSaiPranathi/MSP/blob/lab2/module1_lab_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

# Notebook-wide random generator; the fixed seed makes any draws from it repeatable
rng = np.random.default_rng(42)

In [None]:
# Fetch the California housing dataset and print the description text bundled with it
dataset = datasets.fetch_california_housing()
housing_description = dataset.DESCR
print(housing_description)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [None]:
# Per the dataset description, the target is the median house value expressed
# in hundreds of thousands of dollars, i.e. a continuous quantity.
raw_targets = dataset.target
print("Orignal target values:", raw_targets)

# astype(int) drops the fractional part (4.526 -> 4, 0.923 -> 0), turning the
# continuous target into a handful of integer classes.
# NOTE: dataset.target is overwritten in place, so re-running this cell prints
# the already-truncated values as the "original" ones.
dataset.target = raw_targets.astype(int)

print("Target values after conversion:", dataset.target)
print("Input variables shape:", dataset.data.shape)
print("Output variables shape:", dataset.target.shape)

Orignal target values: [4.526 3.585 3.521 ... 0.923 0.847 0.894]
Target values after conversion: [4 3 3 ... 0 0 0]
Input variables shape: (20640, 8)
Output variables shape: (20640,)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Iris data: feature matrix in X, class labels in y
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 1-nearest neighbour: fit on the training part, score on the held-out part
knn_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred_1 = knn_1.predict(X_test)
accuracy_1 = accuracy_score(y_test, y_pred_1)
print(f"Accuracy of 1-Nearest Neighbor Classifier: {accuracy_1:.2f}")

# 3-nearest neighbour: same split, same scoring
knn_3 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred_3 = knn_3.predict(X_test)
accuracy_3 = accuracy_score(y_test, y_pred_3)
print(f"Accuracy of 3-Nearest Neighbor Classifier: {accuracy_3:.2f}")

# Pick the verdict from the unrounded accuracies (not the two-decimal values printed above)
if accuracy_1 > accuracy_3:
    verdict = "The 1-Nearest Neighbor Classifier performed better."
elif accuracy_3 > accuracy_1:
    verdict = "The 3-Nearest Neighbor Classifier performed better."
else:
    verdict = "Both classifiers performed equally well."
print(verdict)


Accuracy of 1-Nearest Neighbor Classifier: 1.00
Accuracy of 3-Nearest Neighbor Classifier: 1.00
Both classifiers performed equally well.


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset and unpack the feature matrix and the class labels
data = load_iris()
X, y = data.data, data.target

def evaluate_knn_with_splits(test_sizes, features=None, labels=None, random_state=42):
    """Compare 1-NN and 3-NN test accuracy over several train/test split sizes.

    For every entry in ``test_sizes`` the data is split once with
    ``train_test_split``; a 1-nearest-neighbour and then a 3-nearest-neighbour
    classifier are fitted on the same training part and scored on the same
    held-out part with ``accuracy_score``.

    Parameters
    ----------
    test_sizes : iterable
        Values forwarded, one at a time, as ``test_size`` to
        ``train_test_split``.
    features, labels : array-like, optional
        Samples and their targets. When omitted, the notebook-level ``X`` and
        ``y`` are looked up at call time, so the one-argument call used in
        this notebook keeps working. Passing them explicitly avoids silently
        picking up whatever a later cell has rebound ``X`` / ``y`` to.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42, the value that
        used to be hard-coded.

    Returns
    -------
    list of tuple
        One ``(test_size, accuracy_1nn, accuracy_3nn)`` tuple per entry of
        ``test_sizes``, in input order.
    """
    # Fall back to the notebook globals only when the caller gave no data
    if features is None:
        features = X
    if labels is None:
        labels = y

    results = []
    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )

        # Both classifiers see exactly the same split, so their scores are comparable
        accuracy_by_k = []
        for k in (1, 3):
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)
            accuracy_by_k.append(accuracy_score(y_test, knn.predict(X_test)))

        results.append((test_size, *accuracy_by_k))

    return results
# Held-out fractions to try; each one is passed to train_test_split as test_size
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]
results = evaluate_knn_with_splits(test_sizes)

# Accuracy table: one row per split size
print("Test Size | 1-NN Accuracy | 3-NN Accuracy")
print("----------------------------------------")
for test_size, accuracy_1, accuracy_3 in results:
    print(f"  {test_size:.2f}   |     {accuracy_1:.2f}     |     {accuracy_3:.2f}")

# Per-split verdict, printed after the full table. The comparison uses the
# unrounded accuracies, not the two-decimal values shown in the table.
for test_size, accuracy_1, accuracy_3 in results:
    if accuracy_1 > accuracy_3:
        verdict = f"At test size {test_size:.2f}, 1-NN performed better."
    elif accuracy_3 > accuracy_1:
        verdict = f"At test size {test_size:.2f}, 3-NN performed better."
    else:
        verdict = f"At test size {test_size:.2f}, both classifiers performed equally well."
    print(verdict)


Test Size | 1-NN Accuracy | 3-NN Accuracy
----------------------------------------
  0.10   |     1.00     |     1.00
  0.20   |     1.00     |     1.00
  0.30   |     1.00     |     1.00
  0.40   |     0.98     |     0.98
  0.50   |     0.97     |     0.97
At test size 0.10, both classifiers performed equally well.
At test size 0.20, both classifiers performed equally well.
At test size 0.30, both classifiers performed equally well.
At test size 0.40, both classifiers performed equally well.
At test size 0.50, both classifiers performed equally well.


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the digits dataset: feature rows in X, digit labels in y
data = load_digits()
X, y = data.data, data.target

# For each label 0..9 keep at most its first 50 samples, ordered label by label.
# Selecting all kept rows with one index array gives the same rows, in the same
# order, as appending them to Python lists one class at a time.
kept_indices = np.concatenate(
    [np.where(y == digit)[0][:50] for digit in range(10)]
)
X_limited = X[kept_indices]
y_limited = y[kept_indices]

def evaluate_knn_with_splits(test_sizes, features=None, labels=None, random_state=42):
    """Compare 1-NN and 3-NN test accuracy over several train/test split sizes.

    For every entry in ``test_sizes`` the data is split once with
    ``train_test_split``; a 1-nearest-neighbour and then a 3-nearest-neighbour
    classifier are fitted on the same training part and scored on the same
    held-out part with ``accuracy_score``.

    Parameters
    ----------
    test_sizes : iterable
        Values forwarded, one at a time, as ``test_size`` to
        ``train_test_split``.
    features, labels : array-like, optional
        Samples and their targets. When omitted, the notebook-level
        ``X_limited`` and ``y_limited`` (the per-digit subsample built in this
        cell) are looked up at call time, so the one-argument call used in
        this notebook keeps working.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42, the value that
        used to be hard-coded.

    Returns
    -------
    list of tuple
        One ``(test_size, accuracy_1nn, accuracy_3nn)`` tuple per entry of
        ``test_sizes``, in input order.
    """
    # Fall back to the notebook globals only when the caller gave no data
    if features is None:
        features = X_limited
    if labels is None:
        labels = y_limited

    results = []
    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )

        # Both classifiers see exactly the same split, so their scores are comparable
        accuracy_by_k = []
        for k in (1, 3):
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)
            accuracy_by_k.append(accuracy_score(y_test, knn.predict(X_test)))

        results.append((test_size, *accuracy_by_k))

    return results

# Held-out fractions to evaluate on the per-digit subsample
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]
results = evaluate_knn_with_splits(test_sizes)

# Accuracy table: one row per split size
print("Test Size | 1-NN Accuracy | 3-NN Accuracy")
print("----------------------------------------")
for test_size, accuracy_1, accuracy_3 in results:
    print(f"  {test_size:.2f}   |     {accuracy_1:.2f}     |     {accuracy_3:.2f}")

# Per-split verdict, decided on the unrounded accuracies; two rows that look
# equal at two decimals can therefore still name a winner.
for test_size, accuracy_1, accuracy_3 in results:
    if accuracy_1 > accuracy_3:
        verdict = f"At test size {test_size:.2f}, 1-NN performed better."
    elif accuracy_3 > accuracy_1:
        verdict = f"At test size {test_size:.2f}, 3-NN performed better."
    else:
        verdict = f"At test size {test_size:.2f}, both classifiers performed equally well."
    print(verdict)


Test Size | 1-NN Accuracy | 3-NN Accuracy
----------------------------------------
  0.10   |     1.00     |     0.98
  0.20   |     1.00     |     0.99
  0.30   |     0.99     |     0.98
  0.40   |     0.98     |     0.98
  0.50   |     0.99     |     0.99
At test size 0.10, 1-NN performed better.
At test size 0.20, 1-NN performed better.
At test size 0.30, 1-NN performed better.
At test size 0.40, both classifiers performed equally well.
At test size 0.50, both classifiers performed equally well.
