In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split,StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Read the article dataset from the CSV that sits next to the notebook.
data_path = 'CrimeVsNoCrimeArticles.csv'
df = pd.read_csv(data_path)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Remove rows whose title is missing. `dropna` returns a new frame and does
# not modify `df`, so the result has to be bound to a name; features and
# labels are then taken from that filtered frame so they stay aligned.
df_clean = df.dropna(subset=['title'])
x = df_clean['title']
y = df_clean['is_crime_report']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = .2, random_state = 42)

print(f'the length of the training data is : {len(x_train)}')
print(f'the length of the testing data is :{len(x_test)}')


In [None]:
# Two-step pipeline: TF-IDF features over the titles (English stop words
# removed) feeding a k-nearest-neighbours classifier.
vectorizer = TfidfVectorizer(stop_words = 'english')
model = KNeighborsClassifier()
pipeline = Pipeline(steps = [('vectorizer', vectorizer), ('model', model)])

# Search space; keys use the `<step name>__<parameter>` form so they are
# routed to the matching pipeline step.
# 2 * 2 * 2 * 3 * 2 * 2 = 96 candidate settings.
param_grid = {
    'vectorizer__max_features': [3000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__sublinear_tf': [True, False],
    'model__n_neighbors': [3, 5, 7],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan'],
}

# Shuffled, seeded 5-fold stratified cross-validation.
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

grid_search = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    cv = cv,
    verbose = 1,
    n_jobs = 1,
)
grid_search.fit(x_train, y_train)

print(f'best parameters found: {grid_search.best_params_}')
print(f'best score : {grid_search.best_score_}')

In [None]:
# Make predictions on the held-out titles with the tuned model.
# The split cell defines the lowercase name `x_test`; `X_test` does not
# exist and would raise NameError on a fresh kernel.
y_pred = grid_search.predict(x_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Improved Accuracy with Hyperparameter Tuning: {accuracy * 100:.2f}%')

# Print the per-class classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

In [None]:
import pickle

# Persist the best pipeline found by the grid search so it can be reloaded
# later without re-running the search.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
model_path = 'best_knn_model.pkl'
best_pipeline = grid_search.best_estimator_

with open(model_path, 'wb') as model_file:
    pickle.dump(best_pipeline, model_file)