In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the crime records and build the modelling frame in a single pipeline:
#   1. parse the 'Dates' column into timestamps,
#   2. expand each timestamp into its calendar/clock components,
#   3. drop the raw timestamp and the free-text columns,
#   4. one-hot encode the weekday and police-district columns,
#   5. keep a reproducible 1% random sample of the rows.
df = (
    pd.read_csv("crime.csv")
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
    .assign(
        Year=lambda frame: frame['Dates'].dt.year,
        Month=lambda frame: frame['Dates'].dt.month,
        Day=lambda frame: frame['Dates'].dt.day,
        Hour=lambda frame: frame['Dates'].dt.hour,
        Minute=lambda frame: frame['Dates'].dt.minute,
        Second=lambda frame: frame['Dates'].dt.second,
    )
    .drop(columns=['Dates', 'Resolution', 'Descript', 'Address'])
    .pipe(pd.get_dummies, columns=['DayOfWeek', 'PdDistrict'])
    .sample(frac=0.01, random_state=42)
)

In [3]:
# Feature matrix is every column except the label; the label is 'Category'
X = df.drop(columns='Category')
y = df['Category']

In [4]:
# Partition the sampled rows into training and evaluation subsets; the fixed
# random_state makes the split repeatable across runs.
# NOTE(review): test_size=0.8 assigns 80% of the rows to the test subset, so
# the model is fitted on the remaining 20% — confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=42,
)

In [5]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Each query row is labelled with the most frequent label among its ``k``
    closest training rows. When several labels are equally frequent, the
    first entry returned by ``pandas.Series.mode`` is used.

    Features are converted to float ndarrays, so array-likes such as lists,
    DataFrames and object-dtype arrays (for example the ``.values`` of a
    frame that mixes boolean and numeric columns) are accepted.
    """

    def __init__(self, k):
        """Store the number of neighbours to vote with.

        Parameters
        ----------
        k : int
            Number of nearest training rows consulted per prediction.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 (no neighbours means no vote).
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training data; no model parameters are learned.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Numeric training features.
        y_train : array-like of shape (n_samples,)
            Training labels, aligned with the rows of ``X_train`` by position.

        Raises
        ------
        ValueError
            If the number of feature rows and labels differ.
        """
        features = np.asarray(X_train, dtype=float)
        labels = np.asarray(y_train)
        if len(features) != len(labels):
            raise ValueError(
                f"X_train has {len(features)} rows but y_train has {len(labels)} labels"
            )
        self.X_train = features
        self.y_train = labels

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features)
            Numeric query features with the same columns as the training data.

        Returns
        -------
        numpy.ndarray
            Object-dtype array of length ``n_queries`` with the predicted labels.
        """
        # Re-normalise here as well so the method still works if the
        # attributes were assigned directly instead of through fit();
        # asarray is a no-op for arrays that are already float ndarrays.
        train = np.asarray(self.X_train, dtype=float)
        queries = np.asarray(X_test, dtype=float)
        # Build the label Series once instead of once per query row.
        labels = pd.Series(np.asarray(self.y_train))

        y_pred = np.empty(queries.shape[0], dtype=object)
        for i, query in enumerate(queries):
            distances = np.sqrt(np.sum((train - query) ** 2, axis=1))
            nearest_indices = np.argsort(distances)[:self.k]
            # mode() returns the most frequent labels in sorted order;
            # taking element 0 gives a deterministic winner on ties.
            y_pred[i] = labels.iloc[nearest_indices].mode()[0]
        return y_pred

In [6]:
# Inclusive lower and upper bounds of the neighbour counts to evaluate
start_k, end_k = 1, 1000

In [7]:
# Running record of the best score seen so far and the k that produced it
best_accuracy, best_k = 0, None

In [8]:
# Evaluate every k in [start_k, end_k] and remember the best-scoring one.
# The trackers are reset in this cell so that re-running it on its own (for
# example after changing the data or the k range) never compares against a
# stale best value left over from an earlier run.
# NOTE(review): k is selected by accuracy on the test split, so the reported
# score is an optimistic estimate — consider a separate validation split.
best_accuracy = 0
best_k = None

# The array conversions do not depend on k, so perform them once up front
train_features = X_train.values
train_labels = y_train.values
test_features = X_test.values

for k in range(start_k, end_k + 1):
    # Instantiate and train the KNN model
    knn = KNN(k=k)
    knn.fit(train_features, train_labels)

    # Make predictions
    predictions = knn.predict(test_features)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, predictions)

    # Strict '>' keeps the smallest k when several values tie for the best score
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

In [9]:
# Report the winning configuration found by the search, one line per value
for caption, result in (
    ("Highest accuracy:", best_accuracy),
    ("Corresponding k value:", best_k),
):
    print(caption, result)

Highest accuracy: 0.21127562642369022
Corresponding k value: 50
