In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import GridSearchCV


# Load the data from the Excel file
file_path = 'test.xlsx'
df = pd.read_excel(file_path)

# Ensure that 'Text' column is used as the independent variable
# Convert all entries to strings to avoid errors with datetime or other types
df['Text'] = df['Text'].astype(str)

# Drop any rows where 'Text' or 'Type' is NaN
df.dropna(subset=['Text', 'Type'], inplace=True)

# Define the features and labels
X = df['Text']
y = df['Type']

# Encode the labels into numeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Preprocess the text data using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and test sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.3, random_state=42)

# Train a kNN classifier (initially with k=3)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Calculate other performance metrics (precision, recall, F1-score)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Calculate MSE, RMSE, MAPE, and R2 scores
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAPE: {mape}")
print(f"R2 Score: {r2}")

# Perform hyperparameter tuning using GridSearchCV to find the best k value
param_grid = {'n_neighbors': np.arange(1, 20)}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best k value found
print(f"Best k value: {grid_search.best_params_['n_neighbors']}")

# Retrain the model with the best k value
best_knn = grid_search.best_estimator_
best_knn.fit(X_train, y_train)

# Evaluate the model again with the best k
y_pred_best = best_knn.predict(X_test)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix with Best k:\n", conf_matrix_best)

report_best = classification_report(y_test, y_pred_best)
print("\nClassification Report with Best k:\n", report_best)

Confusion Matrix:
 [[48  8  7]
 [26 20  6]
 [45  4 41]]

Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.76      0.53        63
           1       0.62      0.38      0.48        52
           2       0.76      0.46      0.57        90

    accuracy                           0.53       205
   macro avg       0.60      0.53      0.52       205
weighted avg       0.62      0.53      0.53       205

MSE: 1.2292682926829268
RMSE: 1.1087237224317548
MAPE: 483313130742199.9
R2 Score: -0.6862514688601653
Best k value: 13

Confusion Matrix with Best k:
 [[45  5 13]
 [18 25  9]
 [23 12 55]]

Classification Report with Best k:
               precision    recall  f1-score   support

           0       0.52      0.71      0.60        63
           1       0.60      0.48      0.53        52
           2       0.71      0.61      0.66        90

    accuracy                           0.61       205
   macro avg       0.61      0.60      0

In [2]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Example Actual and Predicted labels (simplified for demonstration)
actual = np.array([0]*63 + [1]*52 + [2]*90)  # Replace with actual labels
predicted = np.array([0]*47 + [1]*5 + [2]*11 + [0]*18 + [1]*25 + [2]*9 + [0]*24 + [1]*12 + [2]*54)  # Replace with predicted labels

# Calculate MSE
mse = mean_squared_error(actual, predicted)

# Calculate RMSE
rmse = np.sqrt(mse)

# Calculate MAPE
mape = mean_absolute_percentage_error(actual, predicted) * 100

# Calculate R^2 Score
r2 = r2_score(actual, predicted)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAPE: {mape}")
print(f"R^2 Score: {r2}")


MSE: 0.8975609756097561
RMSE: 0.9473969472242119
MAPE: 5.931570240926996e+16
R^2 Score: -0.23123123123123168
