In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Function to load datasets from CSV files and handle NaN values
def load_datasets(train_file, label_file, test_file):
    """Load the training features, training labels and test features from CSV.

    Each file is read with ``pandas.read_csv`` using its default options,
    every missing cell is replaced by 0, and the result is converted to a
    NumPy array.

    Args:
        train_file: Path of the CSV holding the training feature rows.
        label_file: Path of the CSV holding the training labels.
        test_file: Path of the CSV holding the test feature rows.

    Returns:
        A ``(train_data, train_labels, test_data)`` tuple of NumPy arrays.
        The label array is flattened to one dimension; the two feature
        arrays keep the row/column layout of their files.
    """
    def _read_filled(csv_path):
        # Missing cells become 0 before the frame is turned into an array.
        frame = pd.read_csv(csv_path)
        return frame.fillna(0).values

    features = _read_filled(train_file)
    # Labels are collapsed to 1-D regardless of the file's column count.
    labels = _read_filled(label_file).flatten()
    queries = _read_filled(test_file)
    return features, labels, queries

# Function to calculate cosine similarity between every training vector and a single test vector
def calculate_cosine_similarity(train_vectors, test_vector):
    """Return the cosine similarity of each training row to one test vector.

    Args:
        train_vectors: 2-D array-like passed as the first argument to
            ``cosine_similarity``.
        test_vector: Array that is reshaped into a single row before the
            comparison.

    Returns:
        A flattened (1-D) array of the similarity scores.
    """
    # cosine_similarity needs a 2-D input, so present the vector as one row.
    query_row = test_vector.reshape(1, -1)
    score_column = cosine_similarity(train_vectors, query_row)
    # Collapse the single-column result into a flat vector.
    return score_column.flatten()

# Function to perform KNN with improved steps and save results to CSV
def knn_improved(train_data, train_labels, test_vector, k):
    """Classify one test vector with a similarity-weighted k-NN vote.

    The k training rows most similar to ``test_vector`` (by cosine
    similarity) are selected; each label's score is the sum of its
    neighbours' similarities divided by the total similarity of all k
    neighbours. The label with the highest score is predicted. When several
    labels share the highest score, the first one in the sorted order of
    ``np.unique(train_labels)`` wins.

    Side effects:
        Intermediate results are written to ``step1_similarities.csv``,
        ``step2_top_k.csv``, ``step3_n_values.csv``,
        ``step4_similarity_sums.csv`` and ``step5_max_comparison_ratio.csv``
        in the current working directory. The file names are fixed, so every
        call overwrites the files written by the previous call.

    Args:
        train_data: Training feature rows, forwarded to
            ``calculate_cosine_similarity``.
        train_labels: NumPy array of training labels, indexed in the same
            order as the rows of ``train_data``.
        test_vector: The single vector to classify.
        k: Number of nearest neighbours to take into account.

    Returns:
        A ``(predicted_label, max_comparison_ratio)`` tuple, where
        ``max_comparison_ratio`` maps every distinct training label to its
        share of the top-k similarity mass (0 for labels without neighbours).
    """
    # Step 1: Calculate similarity between test and train data
    similarities = calculate_cosine_similarity(train_data, test_vector)

    # Save step 1 results
    pd.DataFrame({"similarities": similarities}).to_csv("step1_similarities.csv", index=False)

    # Step 2: Sort similarities and get top k neighbors
    sorted_indices = np.argsort(similarities)[::-1]  # Descending order
    top_k_indices = sorted_indices[:k]
    top_k_similarities = similarities[top_k_indices]
    top_k_labels = train_labels[top_k_indices]

    # Save step 2 results
    pd.DataFrame({"top_k_similarities": top_k_similarities, "top_k_labels": top_k_labels}).to_csv("step2_top_k.csv", index=False)

    # Step 3: Count how many of the k neighbours fall into each category
    unique_labels = np.unique(train_labels)
    n_values = {label: 0 for label in unique_labels}
    for label in top_k_labels:
        n_values[label] += 1

    # Save step 3 results
    pd.DataFrame.from_dict(n_values, orient='index', columns=['n_values']).to_csv("step3_n_values.csv")

    # Step 4: Sum the neighbour similarities per category
    similarity_sums = {label: 0 for label in unique_labels}
    for label, similarity in zip(top_k_labels, top_k_similarities):
        similarity_sums[label] += similarity

    # Save step 4 results
    pd.DataFrame.from_dict(similarity_sums, orient='index', columns=['similarity_sums']).to_csv("step4_similarity_sums.csv")

    # Step 5: Calculate each category's share of the total top-k similarity.
    # The total is loop-invariant, so compute it once; a non-positive total
    # would make the ratio undefined or sign-flipped, so fall back to 0.
    total_similarity = top_k_similarities.sum()
    max_comparison_ratio = {}
    for label in unique_labels:
        if total_similarity > 0 and similarity_sums[label] > 0:
            max_comparison_ratio[label] = similarity_sums[label] / total_similarity
        else:
            max_comparison_ratio[label] = 0

    # Save step 5 results
    pd.DataFrame.from_dict(max_comparison_ratio, orient='index', columns=['max_comparison_ratio']).to_csv("step5_max_comparison_ratio.csv")

    # Return the category with the highest ratio together with all ratios;
    # the caller unpacks exactly these two values.
    predicted_label = max(max_comparison_ratio, key=max_comparison_ratio.get)
    return predicted_label, max_comparison_ratio

# Example usage: classify every row of the test set and store the predictions
train_file, label_file, test_file = "train_tfidf.csv", "output.csv", "test_tfidf.csv"
k = 2

train_vectors, train_labels, test_vectors = load_datasets(
    train_file, label_file, test_file
)

# The result of knn_improved is unpacked as a (label, ratios) pair; only the
# label is collected.
predicted_labels = []
for test_vector in test_vectors:
    predicted_label, ratios = knn_improved(train_vectors, train_labels, test_vector, k)
    predicted_labels += [predicted_label]

print("Predicted Labels:", predicted_labels)
pd.DataFrame({"predicted_labels": predicted_labels}).to_csv(
    "predicted_labels.csv", index=False
)


ValueError: Input contains NaN.