In [70]:
#Importing Data from disease_features.csv

import pandas as pd
import numpy as np
import ast
import streamlit as st
df = pd.read_csv('disease_features.csv')


In [71]:
#Task 1: TF-IDF Feature Extraction

#Step 1 #########################################################################################
columns_to_parse=['Risk Factors', 'Symptoms', 'Signs', 'Subtypes']

for col in columns_to_parse:
    df[col]=df[col].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

#print(df['Symptoms'][5])             #Test if Data Loaded      

#Step 2 #########################################################################################
# Create new string columns for TF-IDF processing
df['Risk Factors_str'] = df['Risk Factors'].apply(lambda x: ' '.join(x))
df['Symptoms_str'] = df['Symptoms'].apply(lambda x: ' '.join(x))
df['Signs_str'] = df['Signs'].apply(lambda x: ' '.join(x))

#print(df[['Risk Factors_str', 'Symptoms_str', 'Signs_str']].head(1))         #Just for Testing

#Step 3 #########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizers = {}                        # Store vectorizers and resulting matrices
tfidf_matrices = {}

# Apply TF-IDF separately to each feature column
for col in ['Risk Factors_str', 'Symptoms_str', 'Signs_str']:
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(df[col])
    
    vectorizers[col] = vectorizer
    tfidf_matrices[col] = matrix
    
    #print(f"✅ TF-IDF done for '{col}' → Shape: {matrix.shape}")

#Step 4 #########################################################################################
from scipy.sparse import hstack

# Combine TF-IDF matrices column-wise
combined_tfidf = hstack([
    tfidf_matrices['Risk Factors_str'],
    tfidf_matrices['Symptoms_str'],
    tfidf_matrices['Signs_str']
])

dense_matrix = combined_tfidf.toarray()
#print(dense_matrix)                     #Checking Ohhh Got Big help                                            
#print("✅ Combined TF-IDF Matrix shape:", combined_tfidf.shape)

#Step 5 #########################################################################################

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
onehot_matrices = {}

for col in ['Risk Factors', 'Symptoms', 'Signs']:
    onehot_matrix = mlb.fit_transform(df[col])
    onehot_matrices[col] = onehot_matrix
    print(f"One-hot encoding for {col}: {onehot_matrix.shape}")

# Combine one-hot matrices
combined_onehot = np.hstack([
    onehot_matrices['Risk Factors'],
    onehot_matrices['Symptoms'],
    onehot_matrices['Signs']
])

# Compare sparsity
tfidf_sparsity = 1.0 - (np.count_nonzero(dense_matrix) / dense_matrix.size)
onehot_sparsity = 1.0 - (np.count_nonzero(combined_onehot) / combined_onehot.size)

print("\nSparsity Comparison:")
print(f"TF-IDF Sparsity: {tfidf_sparsity:.2%}")
print(f"One-hot Sparsity: {onehot_sparsity:.2%}")

# 1. Dimensionality comparison
print("\nDimensionality Comparison:")
print(f"TF-IDF features: {combined_tfidf.shape[1]}")
print(f"One-hot features: {combined_onehot.shape[1]}")

# 2. Density and statistics
print("\nMatrix Statistics:")
print("TF-IDF:")
print(f"- Non-zero elements: {combined_tfidf.nnz}")
print(f"- Mean value: {combined_tfidf.mean():.4f}")

print("\nOne-hot:")
print(f"- Non-zero elements: {np.count_nonzero(combined_onehot)}")
print(f"- Mean value: {combined_onehot.mean():.4f}")

# 3. Feature Distribution Analysis
from scipy import stats

# For TF-IDF
tfidf_values = combined_tfidf.data  # Get non-zero values
print("\nFeature Value Distribution:")
print("TF-IDF:")
print(f"- Min: {tfidf_values.min():.4f}")
print(f"- Max: {tfidf_values.max():.4f}")
print(f"- Mean: {tfidf_values.mean():.4f}")
print(f"- Median: {np.median(tfidf_values):.4f}")
print(f"- Std Dev: {tfidf_values.std():.4f}")

# For One-hot
onehot_values = combined_onehot.flatten()
print("\nOne-hot:")
print(f"- Unique values: {np.unique(combined_onehot)}")
print(f"- Min: {onehot_values.min():.4f}")
print(f"- Max: {onehot_values.max():.4f}")
#print(f"- Mean: {onehot_values.mean():.4f}")                #Irrelevant as One-hot matrixes only have 0 and 1s
#print(f"- Median: {np.median(onehot_values):.4f}")
#print(f"- Std Dev: {onehot_values.std():.4f}")

# 4. Memory usage
print("\nMemory Usage:")
print(f"TF-IDF: {combined_tfidf.data.nbytes / 1024:.2f} KB")
print(f"One-hot: {combined_onehot.nbytes / 1024:.2f} KB")

# 5. Information density (average features per sample)
print("\nInformation Density:")
print(f"TF-IDF avg features per sample: {combined_tfidf.nnz / combined_tfidf.shape[0]:.2f}")
print(f"One-hot avg features per sample: {np.count_nonzero(combined_onehot) / combined_onehot.shape[0]:.2f}")



One-hot encoding for Risk Factors: (25, 170)
One-hot encoding for Symptoms: (25, 189)
One-hot encoding for Signs: (25, 62)

Sparsity Comparison:
TF-IDF Sparsity: 92.96%
One-hot Sparsity: 95.15%

Dimensionality Comparison:
TF-IDF features: 1020
One-hot features: 421

Matrix Statistics:
TF-IDF:
- Non-zero elements: 1795
- Mean value: 0.0119

One-hot:
- Non-zero elements: 510
- Mean value: 0.0485

Feature Value Distribution:
TF-IDF:
- Min: 0.0239
- Max: 0.6903
- Mean: 0.1687
- Median: 0.1618
- Std Dev: 0.0743

One-hot:
- Unique values: [0 1]
- Min: 0.0000
- Max: 1.0000

Memory Usage:
TF-IDF: 14.02 KB
One-hot: 41.11 KB

Information Density:
TF-IDF avg features per sample: 71.80
One-hot avg features per sample: 20.40


In [72]:
#Task 2: Dimensionality Reduction

#Step 1 #########################################################################################
# Apply PCA and Truncated SVD to both matrices
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# For TF-IDF matrix (use TruncatedSVD as it works with sparse matrices)
n_components = 3  # You can try 3 as well
svd = TruncatedSVD(n_components=n_components)
tfidf_reduced = svd.fit_transform(combined_tfidf)

# For one-hot encoded matrix (use PCA)
pca = PCA(n_components=n_components)
onehot_reduced = pca.fit_transform(combined_onehot)

# Compare explained variance ratios
print("Explained Variance Ratios:\n")
print("TF-IDF (TruncatedSVD):")
print(f"Total variance explained: {svd.explained_variance_ratio_.sum():.4f}")
for i, ratio in enumerate(svd.explained_variance_ratio_):
    print(f"Component {i+1}: {ratio:.4f}")

print("\nOne-hot (PCA):")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.4f}")
for i, ratio in enumerate(pca.explained_variance_ratio_):
    print(f"Component {i+1}: {ratio:.4f}")

#Step 2 #########################################################################################
# Visualize the reduced dimensions (2D plots)
# First, let's create a disease category column for color-coding

category_mapping = {
    "Acute Coronary Syndrome": "Cardiovascular",
    "Adrenal Insufficiency": "Endocrine",
    "Alzheimer": "Neurological",
    "Aortic Dissection": "Cardiovascular",
    "Asthma": "Respiratory",
    "Atrial Fibrillation": "Cardiovascular",
    "Cardiomyopathy": "Cardiovascular",
    "COPD": "Respiratory",
    "Diabetes": "Endocrine",
    "Epilepsy": "Neurological",
    "Gastritis": "Gastrointestinal",
    "Gastro-oesophageal Reflux Disease": "Gastrointestinal",
    "Heart Failure": "Cardiovascular",
    "Hyperlipidemia": "Cardiovascular",
    "Hypertension": "Cardiovascular",
    "Migraine": "Neurological",
    "Multiple Sclerosis": "Neurological",
    "Peptic Ulcer Disease": "Gastrointestinal",
    "Pituitary Disease": "Endocrine",
    "Pneumonia": "Respiratory",
    "Pulmonary Embolism": "Cardiovascular",
    "Stroke": "Neurological",
    "Thyroid Disease": "Endocrine",
    "Tuberculosis": "Infectious",
    "Upper Gastrointestinal Bleeding": "Gastrointestinal"
}

# Assign categories based on the mapping
df['Category'] = df['Disease'].map(category_mapping)

# Create a numeric mapping for categories
unique_categories = df['Category'].unique()
category_to_num = {category: i for i, category in enumerate(unique_categories)}
category_nums = df['Category'].map(category_to_num)

# Create a figure with two subplots
plt.figure(figsize=(16, 7))

# Plot TF-IDF reduced dimensions
plt.subplot(1, 2, 1)
scatter = plt.scatter(tfidf_reduced[:, 0], tfidf_reduced[:, 1], c=category_nums, cmap='viridis', alpha=0.8)
plt.title('TF-IDF Vectorization (Reduced to 2D)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')

# Create custom legend
from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], marker='o', color='w', 
                         markerfacecolor=plt.cm.viridis(category_to_num[cat]/len(category_to_num)), 
                         markersize=10, label=cat) 
                  for cat in unique_categories]
plt.legend(handles=legend_elements, title="Disease Categories")
plt.grid(True, linestyle='--', alpha=0.7)

# Plot One-hot reduced dimensions
plt.subplot(1, 2, 2)
scatter = plt.scatter(onehot_reduced[:, 0], onehot_reduced[:, 1], c=category_nums, cmap='viridis', alpha=0.8)
plt.title('One-hot Encoding (Reduced to 2D)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend(handles=legend_elements, title="Disease Categories")
plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

# Optional: 3D visualization if you used 3 components
if n_components == 3:
    from mpl_toolkits.mplot3d import Axes3D
    
    fig = plt.figure(figsize=(16, 7))
    
    # 3D plot for TF-IDF
    ax1 = fig.add_subplot(121, projection='3d')
    scatter1 = ax1.scatter(tfidf_reduced[:, 0], tfidf_reduced[:, 1], tfidf_reduced[:, 2], 
                          c=category_nums, cmap='viridis', alpha=0.8)
    ax1.set_title('TF-IDF Vectorization (3D)')
    ax1.set_xlabel('Component 1')
    ax1.set_ylabel('Component 2')
    ax1.set_zlabel('Component 3')
    
    # Add a legend for the 3D plot
    legend1 = ax1.legend(handles=legend_elements, title="Disease Categories")
    
    # 3D plot for One-hot
    ax2 = fig.add_subplot(122, projection='3d')
    scatter2 = ax2.scatter(onehot_reduced[:, 0], onehot_reduced[:, 1], onehot_reduced[:, 2], 
                          c=category_nums, cmap='viridis', alpha=0.8)
    ax2.set_title('One-hot Encoding (3D)')
    ax2.set_xlabel('Component 1')
    ax2.set_ylabel('Component 2')
    ax2.set_zlabel('Component 3')
    
    # Add a legend for the 3D plot
    legend2 = ax2.legend(handles=legend_elements, title="Disease Categories")
    
    plt.tight_layout()
    plt.show()




#Discussion: 

#TF-IDF Encoding:
#The TF-IDF scatter plot shows distinctly separated clusters with less overlap among data points. Each cluster is clearly defined,
# indicating better separability of diseases into categories.

#Reason: TF-IDF probably captures the importance of individual terms within the context of their occurrence across different rows.
# It assigns higher weights to terms that are more unique to specific rows. By emphasizing distinguishing features (risk factors, symptoms, etc.),
#TF-IDF creates a richer representation, enabling algorithms to identify more nuanced patterns in the data.


#One-hot Encoding:
#The One-hot encoded scatter plot exhibits overlapping clusters where the boundaries between disease categories are less defined.
#The data points are grouped more tightly, showing less separability.

#Reason: One-hot encoding treats every feature equally without weighing its significance. It lacks the ability as only 0s and 1s
# are used, to capture relative importance or contextual nuances, resulting in less informative features.

Explained Variance Ratios:

TF-IDF (TruncatedSVD):
Total variance explained: 0.1315
Component 1: 0.0089
Component 2: 0.0656
Component 3: 0.0570

One-hot (PCA):
Total variance explained: 0.2801
Component 1: 0.1106
Component 2: 0.0951
Component 3: 0.0744


In [73]:
#Task 3: Train KNN Models and Logistic Regression

#Step 1 #########################################################################################
# Prepare for KNN modeling with different k values and distance metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # Added for normalization
from sklearn.pipeline import Pipeline  # Added for creating pipelines with normalization
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings

# Define the target variable (disease categories)
target = df['Category']

# Define k values and distance metrics to test
k_values = [3, 5, 7]
metrics = ['euclidean', 'manhattan', 'cosine']

# Define scoring metrics for cross-validation
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=0),
    'recall': make_scorer(recall_score, average='weighted', zero_division=0),
    'f1': make_scorer(f1_score, average='weighted', zero_division=0)
}

# Create DataFrames to store results
results_df = pd.DataFrame(columns=['Model', 'Feature', 'Normalization', 'k', 'Metric', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

#Step 2 #########################################################################################
# Perform 3-fold cross-validation for KNN with different configurations
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Prepare data
tfidf_array = combined_tfidf.toarray()
onehot_array = combined_onehot

# Define normalization methods to test
normalizers = {
    'None': None,
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

# For TF-IDF features
for norm_name, normalizer in normalizers.items():
    for k in k_values:
        for metric in metrics:
            try:
                if normalizer is None:
                    # No normalization
                    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance')
                    cv_results = cross_validate(knn, tfidf_array, target, cv=cv, scoring=scoring)
                else:
                    # With normalization using pipeline
                    pipeline = Pipeline([
                        ('normalizer', normalizer),
                        ('knn', KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance'))
                    ])
                    cv_results = cross_validate(pipeline, tfidf_array, target, cv=cv, scoring=scoring)
                
                # Store results
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'TF-IDF',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': cv_results['test_accuracy'].mean(),
                    'Precision': cv_results['test_precision'].mean(),
                    'Recall': cv_results['test_recall'].mean(),
                    'F1-Score': cv_results['test_f1'].mean()
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)
            except Exception as e:
                # Handle any errors
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'TF-IDF',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': np.nan,
                    'Precision': np.nan,
                    'Recall': np.nan,
                    'F1-Score': np.nan
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)

# For One-hot encoded features
for norm_name, normalizer in normalizers.items():
    for k in k_values:
        for metric in metrics:
            try:
                if normalizer is None:
                    # No normalization
                    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance')
                    cv_results = cross_validate(knn, onehot_array, target, cv=cv, scoring=scoring)
                else:
                    # With normalization using pipeline
                    pipeline = Pipeline([
                        ('normalizer', normalizer),
                        ('knn', KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance'))
                    ])
                    cv_results = cross_validate(pipeline, onehot_array, target, cv=cv, scoring=scoring)
                
                # Store results
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'One-hot',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': cv_results['test_accuracy'].mean(),
                    'Precision': cv_results['test_precision'].mean(),
                    'Recall': cv_results['test_recall'].mean(),
                    'F1-Score': cv_results['test_f1'].mean()
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)
            except Exception as e:
                # Handle any errors
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'One-hot',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': np.nan,
                    'Precision': np.nan,
                    'Recall': np.nan,
                    'F1-Score': np.nan
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)

#Step 3 #########################################################################################
# Train Logistic Regression models on both matrices with and without normalization
for norm_name, normalizer in normalizers.items():
    # For TF-IDF features
    try:
        if normalizer is None:
            # No normalization
            lr = LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced')
            lr_results = cross_validate(lr, tfidf_array, target, cv=cv, scoring=scoring)
        else:
            # With normalization using pipeline
            pipeline = Pipeline([
                ('normalizer', normalizer),
                ('lr', LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced'))
            ])
            lr_results = cross_validate(pipeline, tfidf_array, target, cv=cv, scoring=scoring)
        
        # Store results
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'TF-IDF',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': lr_results['test_accuracy'].mean(),
            'Precision': lr_results['test_precision'].mean(),
            'Recall': lr_results['test_recall'].mean(),
            'F1-Score': lr_results['test_f1'].mean()
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)
    except Exception as e:
        # Handle any errors
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'TF-IDF',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': np.nan,
            'Precision': np.nan,
            'Recall': np.nan,
            'F1-Score': np.nan
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)

    # For One-hot encoded features
    try:
        if normalizer is None:
            # No normalization
            lr = LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced')
            lr_results = cross_validate(lr, onehot_array, target, cv=cv, scoring=scoring)
        else:
            # With normalization using pipeline
            pipeline = Pipeline([
                ('normalizer', normalizer),
                ('lr', LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced'))
            ])
            lr_results = cross_validate(pipeline, onehot_array, target, cv=cv, scoring=scoring)
        
        # Store results
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'One-hot',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': lr_results['test_accuracy'].mean(),
            'Precision': lr_results['test_precision'].mean(),
            'Recall': lr_results['test_recall'].mean(),
            'F1-Score': lr_results['test_f1'].mean()
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)
    except Exception as e:
        # Handle any errors
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'One-hot',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': np.nan,
            'Precision': np.nan,
            'Recall': np.nan,
            'F1-Score': np.nan
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)

#Step 4 #########################################################################################
# Display results in smaller, more focused tables
# Format numeric columns to 4 decimal places
numeric_cols = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
for col in numeric_cols:
    results_df[col] = results_df[col].apply(lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A")

# 1. Compare KNN with different normalizations
print("\n1. KNN Model Comparison by Normalization Method:")
for norm_name in results_df['Normalization'].unique():
    print(f"\n--- KNN with {norm_name} Normalization ---")
    
    # Filter data for KNN with this normalization
    knn_norm_data = results_df[(results_df['Model'] == 'KNN') & 
                              (results_df['Normalization'] == norm_name)]
    
    # Create a pivot table to compare TF-IDF vs One-hot
    pivot_table = pd.pivot_table(
        knn_norm_data,
        values=['Accuracy', 'F1-Score'],
        index=['k', 'Metric'],
        columns=['Feature'],
        aggfunc='first'  # Just take the first value since there should be only one
    )
    
    # Reorder columns for better readability
    if ('Accuracy', 'TF-IDF') in pivot_table.columns and ('Accuracy', 'One-hot') in pivot_table.columns:
        pivot_table = pivot_table[[('Accuracy', 'TF-IDF'), ('Accuracy', 'One-hot'), 
                                  ('F1-Score', 'TF-IDF'), ('F1-Score', 'One-hot')]]
    
    # Display the table
    print(pivot_table)

# 2. Compare Logistic Regression with different normalizations
print("\n2. Logistic Regression Model Comparison by Normalization Method:")
lr_data = results_df[results_df['Model'] == 'Logistic Regression']

# Create a pivot table to compare TF-IDF vs One-hot across normalizations
lr_pivot = pd.pivot_table(
    lr_data,
    values=['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    index=['Normalization'],
    columns=['Feature'],
    aggfunc='first'  # Just take the first value since there should be only one
)

# Display the table
print(lr_pivot)

# 3. Best Models by Feature Type
print("\n3. Best Models by Feature Type:")
# Convert F1-Score to float for finding the best models
results_df['F1-Score_float'] = results_df['F1-Score'].apply(lambda x: float(x) if x != "N/A" else 0)

# Find best model for TF-IDF
best_tfidf = results_df[results_df['Feature'] == 'TF-IDF'].loc[results_df[results_df['Feature'] == 'TF-IDF']['F1-Score_float'].idxmax()]
best_onehot = results_df[results_df['Feature'] == 'One-hot'].loc[results_df[results_df['Feature'] == 'One-hot']['F1-Score_float'].idxmax()]

# Create a DataFrame with the best models
best_models = pd.DataFrame([best_tfidf, best_onehot])
best_models = best_models.drop('F1-Score_float', axis=1)
best_models.index = ['Best TF-IDF Model', 'Best One-hot Model']

# Display the table
print(best_models[['Model', 'Normalization', 'k', 'Metric', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# 4. Top 5 Models Overall
print("\n4. Top 5 Models Overall:")
top_models = results_df.sort_values(by='F1-Score_float', ascending=False).head(5)
top_models = top_models.drop('F1-Score_float', axis=1)
top_models.index = range(1, len(top_models) + 1)  # Reset index to start from 1
print(top_models[['Model', 'Feature', 'Normalization', 'k', 'Metric', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# 5. Effect of k Value on KNN Performance (Best Normalization and Metric)
print("\n5. Effect of k Value on KNN Performance:")
# Get the best normalization and metric from the top model
best_norm = top_models[top_models['Model'] == 'KNN']['Normalization'].iloc[0] if not top_models[top_models['Model'] == 'KNN'].empty else results_df['Normalization'].iloc[0]
best_metric = top_models[top_models['Model'] == 'KNN']['Metric'].iloc[0] if not top_models[top_models['Model'] == 'KNN'].empty else results_df['Metric'].iloc[0]

# Filter data for the best normalization and metric
k_effect_data = results_df[(results_df['Model'] == 'KNN') & 
                          (results_df['Normalization'] == best_norm) &
                          (results_df['Metric'] == best_metric)]

# Create a pivot table to compare k values
k_pivot = pd.pivot_table(
    k_effect_data,
    values=['Accuracy', 'F1-Score'],
    index=['k'],
    columns=['Feature'],
    aggfunc='first'
)

# Display the table
print(f"Using {best_norm} normalization and {best_metric} metric:")
print(k_pivot)

# Clean up temporary column
results_df = results_df.drop('F1-Score_float', axis=1)


1. KNN Model Comparison by Normalization Method:

--- KNN with None Normalization ---
            Accuracy         F1-Score        
Feature       TF-IDF One-hot   TF-IDF One-hot
k Metric                                     
3 cosine      0.6806  0.5972   0.6362  0.5574
  euclidean   0.4861  0.2731   0.3804  0.1616
  manhattan   0.3194  0.2731   0.1608  0.1616
5 cosine      0.7222  0.3981   0.6778  0.3178
  euclidean   0.4815  0.2731   0.3343  0.1505
  manhattan   0.3611  0.2731   0.2163  0.1505
7 cosine      0.5972  0.4444   0.5197  0.3986
  euclidean   0.3981  0.2778   0.2509  0.1545
  manhattan   0.3194  0.2778   0.1571  0.1545

--- KNN with StandardScaler Normalization ---
            Accuracy         F1-Score        
Feature       TF-IDF One-hot   TF-IDF One-hot
k Metric                                     
3 cosine      0.6806  0.5139   0.6245  0.4730
  euclidean   0.3194  0.2731   0.1571  0.1590
  manhattan   0.3194  0.2731   0.1571  0.1590
5 cosine      0.6019  0.3565   0.5111 