In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Step 1: Data Collection
# Assuming we have a list of policies and their corresponding labels
df = pd.read_csv('dataset_testv2.csv', delimiter = ',')

policies = df['original_text']
labels = df['acceptable']

# Step 2: Data Preprocessing and Labeling
# No preprocessing needed as an example, but we can add text cleaning if required

# Step 3: Feature Extraction
vectorizer = TfidfVectorizer(decode_error='replace', encoding='utf-8')
X = vectorizer.fit_transform(policies.values.astype('U'))

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=31)

# Create a KNN classifier
knn = KNeighborsClassifier()

# Define the parameter grid for grid search
param_grid = {
              'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 
              'weights': ['uniform', 'distance'],
              'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
              'leaf_size':[30],
              'p':[1, 2],
              'metric':['minkowski','manhattan']
             }

# Perform grid search using cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test accuracy of the best model: ", test_accuracy)






























































































































































Best parameters found:  {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'n_neighbors': 16, 'p': 2, 'weights': 'uniform'}
Best accuracy found:  0.7511278195488722
Test accuracy of the best model:  0.7535211267605634
