In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

def load_data(file_path):
    """Load dataset from CSV file."""
    indexes = list(range(10))
    dataset = pd.read_csv(file_path, sep=",", header=None, names=indexes)
    return dataset

def encode_categorical_features(dataset):
    """Encode categorical features using Label Encoding."""
    le = LabelEncoder()
    for column in dataset.columns:
        if dataset[column].dtype == "object":
            dataset[column] = le.fit_transform(dataset[column].astype(str))
    return dataset

def impute_missing_values(dataset):
    """Impute missing values using the mean strategy."""
    imputer = SimpleImputer(strategy='mean')
    return pd.DataFrame(imputer.fit_transform(dataset), columns=dataset.columns)

def train_mlps(dataset):
    """Train MLP classifiers for each feature in the dataset."""
    MLPs = []
    train_scores = []
    test_scores = []

    pbar = tqdm(total=len(dataset.columns), desc="Fitting Models", position=0, leave=True)
    for column in dataset.columns:
        X = dataset.drop(columns=[column])
        Y = dataset[column]
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

        param_grid = {
            'hidden_layer_sizes': [(10,), (20,), (10, 10)],
            'activation': ['tanh', 'relu'],
            'solver': ['adam', 'sgd'],
            'max_iter': [300, 600]
        }
        
        mlp = GridSearchCV(MLPClassifier(random_state=1), param_grid, cv=3)
        mlp.fit(X_train, y_train)

        train_scores.append(mlp.score(X_train, y_train))
        test_scores.append(mlp.score(X_test, y_test))
        MLPs.append(mlp.best_estimator_)

        pbar.update(1)
    pbar.close()
    
    return MLPs, train_scores, test_scores

def joined_arch(classifiers, query, imputer):
    """Predict missing values using trained classifiers."""
    prediction = []
    
    for i, col in enumerate(query.columns):
        if query[col].isnull().values[0]:  # If the value is missing
            query_no_target = query.drop(columns=[col])
            query_no_target = imputer.transform(query_no_target)
            prediction.append(classifiers[i].predict([query_no_target])[0])
        else:
            prediction.append(int(query[col].iloc[0]))
    
    return prediction

def predict_missing_values(query, MLPs, imputer):
    """Fill missing values in the query DataFrame."""
    filled_query = query.copy()
    
    for column in filled_query.columns:
        if filled_query[column].isnull().any():
            query_to_predict = filled_query[filled_query[column].isnull()].drop(columns=[column])
            query_to_predict = imputer.transform(query_to_predict)
            
            if not query_to_predict.empty:
                predictions = joined_arch(MLPs, query_to_predict, imputer)
                filled_query.loc[filled_query[column].isnull(), column] = predictions

    return filled_query

def correct_error_queries(query, MLPs, imputer):
    """Correct errors in the query DataFrame."""
    for column in query.columns:
        predictions = joined_arch(MLPs, query, imputer)
        query[column] = np.where(query[column].isnull(), predictions, query[column])

    return query

def main():
    # Load and preprocess dataset
    dataset = load_data('HW1DATA.csv')
    dataset = encode_categorical_features(dataset)
    dataset = impute_missing_values(dataset)

    # Train MLP classifiers
    MLPs, train_scores, test_scores = train_mlps(dataset)
    print("Training Scores:", train_scores)
    print("Test Scores:", test_scores)

    # Handle HW1_QUERY_MISSING.csv
    query_missing = pd.read_csv('HW1_QUERY_MISSING.csv', sep=",", header=None, names=range(10))
    query_missing.replace('?', np.nan, inplace=True)
    query_missing = query_missing.apply(pd.to_numeric, errors='coerce')
    query_missing = pd.DataFrame(imputer.transform(query_missing), columns=query_missing.columns)
    
    predicted_missing = predict_missing_values(query_missing, MLPs, imputer)
    print("Predictions for HW1_QUERY_MISSING.csv:")
    for i in range(len(predicted_missing)):
        print(f"Query {i+1}: Actual: {query_missing.iloc[i].to_numpy()}, Predicted: {predicted_missing.iloc[i].to_numpy()}")
    
    predicted_missing.to_csv('predicted_missing_values.csv', index=False)

    # Handle HW1_QUERY_W_ERROR.csv
    query_errors = pd.read_csv('HW1_QUERY_W_ERROR.csv', sep=",", header=None, names=range(10))
    query_errors.replace('?', np.nan, inplace=True)
    query_errors = query_errors.apply(pd.to_numeric, errors='coerce')
    
    corrected_query_errors = correct_error_queries(query_errors, MLPs, imputer)
    print("Corrected predictions for HW1_QUERY_W_ERROR.csv:")
    for i in range(len(corrected_query_errors)):
        print(f"Error Query {i+1}: Actual: {query_errors.iloc[i].to_numpy()}, Corrected: {corrected_query_errors.iloc[i].to_numpy()}")

    corrected_query_errors.to_csv('corrected_values.csv', index=False)

if __name__ == "__main__":
    main()

Fitting Models: 100%|██████████| 10/10 [01:30<00:00,  9.06s/it]

Training Scores: [0.46808510638297873, 0.7524177949709865, 0.7794970986460348, 0.7117988394584139, 0.7021276595744681, 0.7678916827852998, 0.5609284332688588, 0.7504835589941973, 0.8897485493230174, 0.965183752417795]
Test Scores: [0.3153846153846154, 0.7153846153846154, 0.7153846153846154, 0.7, 0.6461538461538462, 0.7846153846153846, 0.5153846153846153, 0.7538461538461538, 0.8692307692307693, 0.9538461538461539]
Predictions for HW1_QUERY_MISSING.csv:
Query 1: Actual: [5.42658423 6.36321484 6.35548686 6.69706337 6.77279753 6.78979907
 6.40340031 6.53941267 6.63833076 0.66924266], Predicted: [5.42658423 6.36321484 6.35548686 6.69706337 6.77279753 6.78979907
 6.40340031 6.53941267 6.63833076 0.66924266]
Query 2: Actual: [5.42658423 6.36321484 6.35548686 6.69706337 6.77279753 6.78979907
 6.40340031 6.53941267 6.63833076 0.66924266], Predicted: [5.42658423 6.36321484 6.35548686 6.69706337 6.77279753 6.78979907
 6.40340031 6.53941267 6.63833076 0.66924266]
Query 3: Actual: [5.42658423 6.363


