In [55]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
# Function to load datasets from CSV files and handle NaN values
def load_datasets(train_file, label_file, test_file):
    """Read the training vectors, training labels and test vectors from CSV.

    Every file is parsed with ``pandas.read_csv`` and missing values are
    replaced by 0 before the data is converted to a NumPy array.

    Parameters
    ----------
    train_file : path to the CSV holding the training vectors.
    label_file : path to the CSV holding the training labels.
    test_file : path to the CSV holding the test vectors.

    Returns
    -------
    tuple
        ``(train_data, train_labels, test_data)`` where ``train_labels`` has
        been flattened to one dimension and the other two keep the row/column
        layout of their files.
    """
    def _read_filled(path):
        # NaN -> 0, then drop the pandas wrapper.
        return pd.read_csv(path).fillna(0).values

    train_data = _read_filled(train_file)
    train_labels = _read_filled(label_file).flatten()
    test_data = _read_filled(test_file)
    return train_data, train_labels, test_data

In [57]:
# Function to calculate the cosine similarity between every training vector and a single test vector
def calculate_cosine_similarity(train_vectors, test_vector):
    """Return the cosine similarity of ``test_vector`` to each training vector.

    ``test_vector`` is reshaped into a single-row matrix so it can be passed
    to ``cosine_similarity``; the resulting matrix is flattened into a 1-D
    array before it is returned.
    """
    query_row = test_vector.reshape(1, -1)
    pairwise = cosine_similarity(train_vectors, query_row)
    return pairwise.flatten()

In [58]:
# Function to perform KNN with improved steps and save results to CSV
def knn_improved(train_data, train_labels, test_vector, k, save_steps=True):
    """Classify one test vector by similarity-weighted k-nearest-neighbour voting.

    The ``k`` training rows with the highest similarity to ``test_vector``
    (as computed by ``calculate_cosine_similarity``) are selected, their
    similarities are summed per category, and the category holding the
    largest share of the summed similarity is returned.

    Parameters
    ----------
    train_data : 2-D array
        Training vectors, one per row.
    train_labels : numpy.ndarray
        Label of each row of ``train_data``.
    test_vector : numpy.ndarray
        The vector to classify.
    k : int
        Number of neighbours to use; must be at least 1. A value larger than
        the number of training rows simply uses every row.
    save_steps : bool, default True
        When true, the intermediate result of every step is written to
        ``step1_similarities.csv`` ... ``step5_max_comparison_ratio.csv`` in
        the current working directory. The files are overwritten on each
        call, so they only ever describe the most recent call; pass ``False``
        to skip the writes (e.g. when calling this function in a loop).

    Returns
    -------
    tuple
        ``(predicted_label, max_comparison_ratio)`` where
        ``max_comparison_ratio`` maps every category found in
        ``train_labels`` to its share of the neighbours' summed similarity.
        ``(None, None)`` is returned when ``train_data`` or ``test_vector``
        is empty.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. A negative ``k`` would otherwise be
        interpreted by the slice below as "all but the last |k| rows".
    """
    if len(train_data) == 0 or len(test_vector) == 0:
        return None, None
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Step 1: similarity between the test vector and every training vector
    similarities = calculate_cosine_similarity(train_data, test_vector)
    if save_steps:
        pd.DataFrame({"similarities": similarities}).to_csv("step1_similarities.csv", index=False)

    # Step 2: keep the k most similar training rows (descending similarity)
    sorted_indices = np.argsort(similarities)[::-1]
    top_k_indices = sorted_indices[:k]
    top_k_similarities = similarities[top_k_indices]
    top_k_labels = train_labels[top_k_indices]
    if save_steps:
        pd.DataFrame({"top_k_similarities": top_k_similarities, "top_k_labels": top_k_labels}).to_csv("step2_top_k.csv", index=False)

    # Steps 3 and 4: per category, count the neighbours (n) and add up their
    # similarities. Categories absent from the neighbourhood keep a 0 entry.
    unique_labels = np.unique(train_labels)
    n_values = {label: 0 for label in unique_labels}
    similarity_sums = {label: 0 for label in unique_labels}
    for label, similarity in zip(top_k_labels, top_k_similarities):
        n_values[label] += 1
        similarity_sums[label] += similarity
    if save_steps:
        pd.DataFrame.from_dict(n_values, orient='index', columns=['n_values']).to_csv("step3_n_values.csv")
        pd.DataFrame.from_dict(similarity_sums, orient='index', columns=['similarity_sums']).to_csv("step4_similarity_sums.csv")

    # Step 5: each category's share of the neighbours' total similarity
    total_similarity = sum(top_k_similarities)
    if total_similarity > 0:
        max_comparison_ratio = {
            label: similarity_sums[label] / total_similarity for label in unique_labels
        }
    else:
        # No positive similarity mass: every ratio is 0, so max() below
        # returns the first category in np.unique's sorted order.
        max_comparison_ratio = {label: 0 for label in unique_labels}
    if save_steps:
        pd.DataFrame.from_dict(max_comparison_ratio, orient='index', columns=['max_comparison_ratio']).to_csv("step5_max_comparison_ratio.csv")

    # With k >= 1 and non-empty training data the neighbourhood is non-empty,
    # so train_labels (and therefore the ratio table) cannot be empty here.
    predicted_label = max(max_comparison_ratio, key=max_comparison_ratio.get)
    return predicted_label, max_comparison_ratio

In [59]:
# Example usage: classify every test vector with one fixed k
train_file, label_file, test_file = "train_tfidf.csv", "train_label.csv", "test_tfidf.csv"
k = 12

train_vectors, train_labels, test_vectors = load_datasets(train_file, label_file, test_file)

predicted_labels = []
for test_vector in test_vectors:
    result = knn_improved(train_vectors, train_labels, test_vector, k)
    # A None label means knn_improved could not produce a prediction.
    predicted_labels.append("Error" if result[0] is None else result[0])

print("Predicted Labels:", predicted_labels)

# Persist the predictions, one per row
predictions_frame = pd.DataFrame({"predicted_labels": predicted_labels})
predictions_frame.to_csv("predicted_labels.csv", index=False)

Predicted Labels: [np.int64(1), np.int64(1), np.int64(-1), np.int64(-1), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(-1), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(-1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(-1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(-1), np.int64(-1), np.int64(0), np.int64(-1), np.int64(-1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(-1), np.int64(-1), np.int64(-1), np.int64(1), np.int64(-1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(-1), np.int64(-1), np.int64(0), np.i

In [65]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate model performance
def evaluate_model(predicted_labels, true_labels):
    """Score predictions against the true labels.

    Precision, recall and F1 are computed with ``average='weighted'`` and
    ``zero_division=0``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    weighted = {"average": 'weighted', "zero_division": 0}
    return (
        accuracy_score(true_labels, predicted_labels),
        precision_score(true_labels, predicted_labels, **weighted),
        recall_score(true_labels, predicted_labels, **weighted),
        f1_score(true_labels, predicted_labels, **weighted),
    )

In [66]:
# Ground-truth labels of the test set, flattened to 1-D (missing entries become 0)
true_labels = (
    pd.read_csv("test_label.csv")
    .fillna(0)
    .values
    .flatten()
)

In [67]:
# Neighbourhood sizes to evaluate, plus the containers the evaluation loop fills
k_values = [1, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60]
results = list()  # one metrics record per k
all_predictions = dict(true_labels=true_labels)  # column name -> label sequence

In [68]:
# Start from an empty summary so re-running this cell does not duplicate rows.
results.clear()

# Marker for an undefined prediction. It has to differ from every real class
# label: a fixed -1 would be scored as a genuine prediction whenever -1 is
# itself one of the classes. For numeric labels use "smallest label - 1";
# for anything else fall back to -1.
numeric_label_sets = [
    labels for labels in (train_labels, true_labels)
    if labels.size and labels.dtype.kind in "iuf"
]
if numeric_label_sets:
    undefined_label = min(labels.min() for labels in numeric_label_sets) - 1
else:
    undefined_label = -1

for k in k_values:
    predicted_labels = []
    n_values_list = []  # To store the n value of each test vector

    # The per-category neighbour counts of a test vector always add up to the
    # number of neighbours selected, i.e. k capped at the training-set size,
    # so the total does not require recomputing the similarities.
    # NOTE(review): as a consequence this column simply mirrors k -- confirm
    # whether the count for the predicted category was intended instead.
    neighbours_used = min(k, len(train_vectors))

    for test_vector in test_vectors:
        result = knn_improved(train_vectors, train_labels, test_vector, k)

        if result[0] is not None:
            predicted_labels.append(result[0])
            n_values_list.append(neighbours_used)
        else:
            predicted_labels.append(undefined_label)  # Error or undefined prediction
            n_values_list.append(0)  # Assign 0 if undefined

    # Add predictions for the current k to the dictionary
    all_predictions[f"predictions_k_{k}"] = predicted_labels

    # Evaluate the model
    accuracy, precision, recall, f1 = evaluate_model(predicted_labels, true_labels)

    # Calculate average n value
    avg_n_value = np.mean(n_values_list)

    # Store results
    results.append({
        "k": k,
        "n (nilai k baru)": avg_n_value,
        "Akurasi": accuracy,
        "Presisi": precision,
        "Recall": recall,
        "Skor F1": f1
    })

In [69]:
# Turn the per-k metric records into a table and persist it as CSV
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="model_evaluation_results_with_n.csv", index=False)

# Echo the table for verification
print(results_df)

     k  n (nilai k baru)   Akurasi   Presisi    Recall   Skor F1
0    1               1.0  0.837748  0.841654  0.837748  0.838765
1    5               5.0  0.897351  0.902071  0.897351  0.897716
2   10              10.0  0.947020  0.947905  0.947020  0.947182
3   15              15.0  0.960265  0.962621  0.960265  0.960165
4   20              20.0  0.917219  0.919610  0.917219  0.916146
5   25              25.0  0.903974  0.907117  0.903974  0.902417
6   30              30.0  0.890728  0.894940  0.890728  0.888261
7   35              35.0  0.877483  0.881325  0.877483  0.874576
8   40              40.0  0.857616  0.860629  0.857616  0.853693
9   50              50.0  0.844371  0.848458  0.844371  0.839412
10  60              60.0  0.827815  0.833695  0.827815  0.821567


In [70]:
# Write the predictions collected for every k to their own CSV file
predictions_df = pd.DataFrame(data=all_predictions)
predictions_df.to_csv(path_or_buf="all_predictions_by_k.csv", index=False)

In [71]:
# Show the evaluation summary followed by the per-k predictions
print(
    "Model Evaluation Results:",
    results_df,
    "\nAll Predictions by k:",
    predictions_df,
    sep="\n",
)

Model Evaluation Results:
     k  n (nilai k baru)   Akurasi   Presisi    Recall   Skor F1
0    1               1.0  0.837748  0.841654  0.837748  0.838765
1    5               5.0  0.897351  0.902071  0.897351  0.897716
2   10              10.0  0.947020  0.947905  0.947020  0.947182
3   15              15.0  0.960265  0.962621  0.960265  0.960165
4   20              20.0  0.917219  0.919610  0.917219  0.916146
5   25              25.0  0.903974  0.907117  0.903974  0.902417
6   30              30.0  0.890728  0.894940  0.890728  0.888261
7   35              35.0  0.877483  0.881325  0.877483  0.874576
8   40              40.0  0.857616  0.860629  0.857616  0.853693
9   50              50.0  0.844371  0.848458  0.844371  0.839412
10  60              60.0  0.827815  0.833695  0.827815  0.821567

All Predictions by k:
     true_labels  predictions_k_1  predictions_k_5  predictions_k_10  \
0              1                1                1                 1   
1              1           