# 1-Nearest-Neighbour Classifier on the MAGIC Gamma Telescope Dataset

## Balancing the g (gamma) and h (hadron) Classes

In [2]:
import random

# Balance the 'g' and 'h' classes by randomly undersampling the larger class,
# write the shuffled result to a new file, then re-read that file to verify
# that both classes now have the same number of records.

BALANCE_SEED = 42  # fixed seed so the sampled subset and row order are reproducible
balance_rng = random.Random(BALANCE_SEED)  # private RNG; global `random` state is untouched


def _read_records(path):
    """Return the lines of `path`, each guaranteed to end with a newline.

    A final line without a trailing newline would otherwise be fused with
    the following record once the lines are re-ordered and written back
    with `writelines`.
    """
    with open(path, 'r') as f:
        return [line if line.endswith('\n') else line + '\n' for line in f]


def _split_by_class(records):
    """Return `(g_records, h_records)`: lines whose stripped text ends in ',g' / ',h'."""
    g = [line for line in records if line.strip().endswith(',g')]
    h = [line for line in records if line.strip().endswith(',h')]
    return g, h


file_path = 'magic.txt'
lines = _read_records(file_path)

g_records, h_records = _split_by_class(lines)
g_count = len(g_records)
h_count = len(h_records)

print(f"Original 'g' records: {g_count}")
print(f"Original 'h' records: {h_count}")

# Undersample to the size of the smaller class. Sampling more items than a
# population holds raises ValueError, so the target must be the minimum.
target_count = min(g_count, h_count)
balanced_g_records = balance_rng.sample(g_records, target_count)
balanced_h_records = balance_rng.sample(h_records, target_count)

balanced_dataset = balanced_g_records + balanced_h_records

# Interleave the two classes so the file is not ordered by label.
balance_rng.shuffle(balanced_dataset)

output_file = 'balanced_dataset.txt'
with open(output_file, 'w') as f:
    f.writelines(balanced_dataset)

print(f"Balanced dataset saved to {output_file}")

# Verification pass: count the classes in the file that was just written.
lines = _read_records(output_file)
g_records, h_records = _split_by_class(lines)

g_count = len(g_records)
h_count = len(h_records)

assert g_count == h_count == target_count, "classes are not balanced after undersampling"

# NOTE: the labels below still read "Original", but these counts come from
# the balanced file.
print(f"Original 'g' records: {g_count}")
print(f"Original 'h' records: {h_count}")

Original 'g' records: 12332
Original 'h' records: 6688
Balanced dataset saved to balanced_dataset.txt
Original 'g' records: 6688
Original 'h' records: 6688


## Splitting Dataset

In [3]:
# Split the balanced dataset into training / validation / test partitions
# and write each partition to its own file. The test partition receives
# whatever remains after the training and validation slices are taken.

SPLIT_SEED = 42        # fixed seed so the partition is identical on every run
TRAIN_FRACTION = 0.7   # share of records used for training
VAL_FRACTION = 0.15    # share of records used for validation

file_path = 'balanced_dataset.txt'
with open(file_path, 'r') as f:
    # Guarantee every record ends with a newline so that no two records are
    # fused together once the lines are re-ordered and written back.
    lines = [line if line.endswith('\n') else line + '\n' for line in f]

# Private RNG: reproducible shuffle that does not disturb the global state.
random.Random(SPLIT_SEED).shuffle(lines)

total_records = len(lines)
train_size = int(TRAIN_FRACTION * total_records)
val_size = int(VAL_FRACTION * total_records)
val_end = train_size + val_size

train_set = lines[:train_size]
val_set = lines[train_size:val_end]
test_set = lines[val_end:]

# The three slices must cover every record exactly once.
assert len(train_set) + len(val_set) + len(test_set) == total_records

for subset_path, subset in (
    ('train_set.txt', train_set),
    ('validation_set.txt', val_set),
    ('test_set.txt', test_set),
):
    with open(subset_path, 'w') as f:
        f.writelines(subset)

print("Dataset split completed:")
print(f"Training set: {len(train_set)} records")
print(f"Validation set: {len(val_set)} records")
print(f"Test set: {len(test_set)} records")

Dataset split completed:
Training set: 9363 records
Validation set: 2006 records
Test set: 2007 records


## Classifier Training

In [5]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Downloading numpy-2.1.2-cp312-cp312-win_amd64.whl.metadata (59 kB)
     ---------------------------------------- 0.0/59.7 kB ? eta -:--:--
     ---------------------------------------- 59.7/59.7 kB 1.6 MB/s eta 0:00:00
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ---------------------------------------- 60.8/60.8 kB 1.6 MB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   -


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.5 MB 2.0 MB/s eta 0:00:06
   ---------------------------------------- 0.1/11.5 MB 1.8 MB/s eta 0:00:07
    --------------------------------------- 0.2/11.5 MB 1.8 MB/s eta 0:00:07
   - -------------------------------------- 0.3/11.5 MB 1.8 MB/s eta 0:00:07
   - -------------------------------------- 0.5/11.5 MB 2.0 MB/s eta 0:00:06
   -- ------------------------------------- 0.6/11.5 MB 2.2 MB/s eta 0:00:06
   -- ------------------------------------- 0.7/11.5 MB 2.3 MB/s eta 0:00:05
   -- ---------------------------


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def load_data(file_path):
    """Read a header-less CSV file and separate it into features and labels.

    Every column except the last one is treated as a feature; the last
    column is treated as the label.

    Parameters
    ----------
    file_path
        Path handed directly to ``pandas.read_csv`` (read with ``header=None``).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the values of all columns but the last
        and ``y`` holds the values of the last column.
    """
    table = pd.read_csv(file_path, header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    return features.values, labels.values

# Load the three partitions (train, validation, test) in that order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    load_data(partition_path)
    for partition_path in ('train_set.txt', 'validation_set.txt', 'test_set.txt')
)

# Encode the class labels as integers: 'g' becomes 1, anything else becomes 0.
y_train, y_val, y_test = (
    np.where(labels == 'g', 1, 0) for labels in (y_train, y_val, y_test)
)

# A single neighbour: each sample is assigned the label of its closest
# training point.
knn = KNeighborsClassifier(n_neighbors=1)

# Kept as the cell's final bare expression so the fitted estimator is displayed.
knn.fit(X_train, y_train)

## Predicting on Validation Set

In [12]:
# Predict labels for the validation partition with the fitted 1-NN model.
val_predictions = knn.predict(X_val)

# target_names follow the sorted integer labels: 0 is 'h', 1 is 'g'.
val_report = classification_report(y_val, val_predictions, target_names=['h', 'g'])
print("Classification Report on Validation Set:", val_report, sep="\n")

Classification Report on Validation Set:
              precision    recall  f1-score   support

           h       0.74      0.73      0.73       969
           g       0.75      0.76      0.75      1037

    accuracy                           0.74      2006
   macro avg       0.74      0.74      0.74      2006
weighted avg       0.74      0.74      0.74      2006



## Predicting on Test Set

In [13]:
# Predict labels for the held-out test partition with the fitted 1-NN model.
test_predictions = knn.predict(X_test)

# target_names follow the sorted integer labels: 0 is 'h', 1 is 'g'.
test_report = classification_report(y_test, test_predictions, target_names=['h', 'g'])
print("Classification Report on Test Set:", test_report, sep="\n")

Classification Report on Test Set:
              precision    recall  f1-score   support

           h       0.74      0.71      0.72       977
           g       0.73      0.76      0.75      1030

    accuracy                           0.73      2007
   macro avg       0.73      0.73      0.73      2007
weighted avg       0.73      0.73      0.73      2007

