In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def _load_feature_table(path):
    """Read a headerless numeric table, choosing the delimiter from its content.

    The first non-blank line decides the delimiter: a comma selects
    comma-separated parsing, anything else is parsed as runs of whitespace
    (spaces and/or tabs). Cells that cannot be converted to a number become
    NaN so they can be imputed later.
    """
    # Peek at the first non-blank line only; the real parsing is left to pandas.
    with open(path, encoding="utf-8", errors="replace") as handle:
        first_line = next((line for line in handle if line.strip()), "")

    if "," in first_line:
        table = pd.read_csv(path, header=None, sep=",", skipinitialspace=True)
    else:
        # sep=r"\s+" is the supported spelling of the deprecated
        # delim_whitespace=True option.
        table = pd.read_csv(path, header=None, sep=r"\s+")

    return table.apply(pd.to_numeric, errors="coerce")


# Function to handle preprocessing, train Logistic Regression, and predict
def classify_with_logistic_and_save(train_data_path, train_label_path, test_data_path, output_file):
    """Train a tuned logistic regression on one dataset and save test predictions.

    Steps: load the three files, coerce features to numeric, fill missing
    values with a 5-neighbour KNN imputer, standardize, grid-search the
    logistic-regression hyperparameters with 3-fold cross-validation, and
    write the best model's predictions for the test rows.

    Args:
        train_data_path: Path to the headerless training feature table
            (comma- or whitespace-delimited).
        train_label_path: Path to the headerless, tab-delimited label file.
        test_data_path: Path to the headerless test feature table
            (comma- or whitespace-delimited; need not match the training
            file's delimiter).
        output_file: Path the predictions are written to, one per line,
            without index or header.

    Raises:
        ValueError: If the training and test tables have different numbers of
            columns, or the number of labels differs from the number of
            training rows.
    """
    # Load data. Each feature file gets its own delimiter detection, because a
    # whitespace-only parser turns a differently delimited file into a single
    # column and the imputer then rejects it.
    train_data = _load_feature_table(train_data_path)
    train_labels = pd.read_csv(train_label_path, delimiter='\t', header=None)
    test_data = _load_feature_table(test_data_path)

    # Fail early with a message that names the files instead of surfacing a
    # feature-count error from deep inside the imputer.
    n_train_cols = train_data.shape[1]
    n_test_cols = test_data.shape[1]
    if n_train_cols != n_test_cols:
        raise ValueError(
            f"Feature count mismatch: {train_data_path!r} has {n_train_cols} columns "
            f"but {test_data_path!r} has {n_test_cols} columns; "
            "check that both files use the same layout."
        )

    labels = train_labels.values.ravel()
    if len(labels) != len(train_data):
        raise ValueError(
            f"Label count mismatch: {train_label_path!r} has {len(labels)} labels "
            f"but {train_data_path!r} has {len(train_data)} rows."
        )

    # Handle missing values with KNN Imputer (fitted on training rows only)
    imputer = KNNImputer(n_neighbors=5)
    train_features = imputer.fit_transform(train_data)
    test_features = imputer.transform(test_data)

    # Standardize features using the training statistics
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    test_features = scaler.transform(test_features)

    # Hyperparameter tuning for Logistic Regression
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'saga'],
        'max_iter': [200, 500]
    }
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases -- confirm against the installed version.
    grid_search = GridSearchCV(
        LogisticRegression(random_state=42, multi_class='multinomial'),
        param_grid,
        cv=3,
        scoring='accuracy'
    )

    # Train model
    grid_search.fit(train_features, labels)
    best_model = grid_search.best_estimator_

    # Predict on test data
    predictions = best_model.predict(test_features)

    # Save predictions to file
    pd.DataFrame(predictions).to_csv(output_file, index=False, header=False)


# File names for datasets 1 through 6, as
# (train data, train labels, test data, prediction output) tuples
datasets = [
    (
        f"TrainData{index}.txt",
        f"TrainLabel{index}.txt",
        f"TestData{index}.txt",
        f"AlternativeMethodClassification{index}.txt",
    )
    for index in range(1, 7)
]

# Train, predict and save for every dataset in turn
for train_data, train_label, test_data, output_file in datasets:
    classify_with_logistic_and_save(
        train_data,
        train_label,
        test_data,
        output_file,
    )

print("Predictions saved for all datasets using Logistic Regression.")


  train_data = pd.read_csv(train_data_path, header=None, delim_whitespace=True)
  test_data = pd.read_csv(test_data_path, header=None, delim_whitespace=True)
  train_data = pd.read_csv(train_data_path, header=None, delim_whitespace=True)
  test_data = pd.read_csv(test_data_path, header=None, delim_whitespace=True)
  train_data = pd.read_csv(train_data_path, header=None, delim_whitespace=True)
  test_data = pd.read_csv(test_data_path, header=None, delim_whitespace=True)


ValueError: X has 1 features, but KNNImputer is expecting 13 features as input.