# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

# Read the semicolon-separated training table
file_path = "C:\\Users\\mjbab\\OneDrive\\Desktop\\df_reduced.csv"
df = pd.read_csv(file_path, sep=";", low_memory=False)

# Number the distinct Info_cluster values; each row receives the id of its cluster
df['group_ids'] = df.groupby('Info_cluster').ngroup()

# Number of rows per group id
group_distribution = df['group_ids'].value_counts()

# Keep only the rows whose group holds at least `min_group_samples` rows,
# so that the groups later handed to GroupKFold are not tiny
min_group_samples = 10  # Adjust as needed
is_large_enough = (group_distribution >= min_group_samples).to_numpy()
valid_groups = group_distribution.index[is_large_enough]
df_valid_groups = df.loc[df['group_ids'].isin(valid_groups)]

# EDA Part
# A) Summary statistics for numerical features
summary_stats = df_valid_groups.describe()

# C) Handling missing values
# Drop the Info_* identifier/bookkeeping columns listed below before imputation
non_numeric_columns = ['Info_PepID', 'Info_organism_id', 'Info_protein_id', 'Info_pos', 'Info_AA'
                       , 'Info_epitope_id', 'Info_nPos', 'Info_nNeg']
df_numeric = df_valid_groups.drop(columns=non_numeric_columns)

# Impute missing values (using IterativeImputer).
# fit_transform returns a bare ndarray, so the frame is rebuilt with the row
# labels of df_numeric. Without `index=` the result would get a fresh
# 0..n-1 RangeIndex and any row index derived from df_imputed would no longer
# refer to the same rows in df_valid_groups.
imputer = IterativeImputer(random_state=42)
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

# D) Outlier detection using IQR method
def detect_outliers_iqr(data):
    """Return the index labels of rows that are IQR outliers in any numeric column.

    For every numeric column the 25th and 75th percentiles (Q1, Q3) are
    computed over the non-missing values. A row is flagged when its value in
    that column lies below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to scan. Non-numeric columns are ignored.

    Returns
    -------
    list
        Unique index labels of the flagged rows, in no particular order.
        Empty when nothing is flagged or when there is no numeric data.
    """
    flagged = set()
    for _, values in data.select_dtypes(include="number").items():
        # Missing values would turn the percentiles (and thus both bounds)
        # into NaN, silently disabling detection for the whole column.
        observed = values.dropna()
        if observed.empty:
            continue

        observed_values = observed.to_numpy(dtype=float)
        q1, q3 = np.percentile(observed_values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (observed_values < lower_bound) | (observed_values > upper_bound)
        flagged.update(observed.index[is_outlier])
    return list(flagged)

# Flag outliers on the feature columns only. The label and the grouping keys
# are left out: if most rows share one 'Class' value the IQR of that column
# can be 0, and every row of the other class would be flagged as an outlier.
non_feature_columns = ['Class', 'group_ids', 'Info_cluster']
outlier_candidates = df_imputed.drop(columns=non_feature_columns, errors='ignore')

# df_imputed holds the same rows, in the same order, as df_valid_groups, but
# its index is not guaranteed to carry the same labels. Work with row
# POSITIONS so the flagged rows map onto df_valid_groups unambiguously.
outlier_positions = np.asarray(
    detect_outliers_iqr(outlier_candidates.reset_index(drop=True)), dtype=int
)
keep_mask = np.ones(len(df_valid_groups), dtype=bool)
keep_mask[outlier_positions] = False

# Index labels (in df_valid_groups) of the rows being removed
outliers_indices = df_valid_groups.index[~keep_mask].tolist()
outliers_indices_filtered = outliers_indices

# Remove outliers from df_valid_groups. The result is intentionally shorter
# than df_valid_groups; it is NOT padded back to the original length, since
# all-NaN filler rows would re-enter the pipeline as fake samples.
df_valid_groups_no_outliers = df_valid_groups.loc[keep_mask].copy()
print(f"Removed {len(outliers_indices)} outlier rows out of {len(df_valid_groups)}.")

if df_valid_groups_no_outliers.empty:
    raise ValueError(
        "IQR outlier removal discarded every row; relax the outlier rule "
        "or restrict it to fewer columns."
    )

# E) Class distribution
class_distribution = df_valid_groups_no_outliers['Class'].value_counts()

# Bar chart with one bar per class showing how many rows carry that label
plt.figure(figsize=(8, 6))
sns.countplot(data=df_valid_groups_no_outliers, x='Class').set(
    title='Class Distribution',
    xlabel='Class',
    ylabel='Count',
)
plt.show()

# Label Encoding
# Rows without a label cannot be used for supervised training; left in place,
# the encoder would either fail on them or treat NaN as a class of its own.
# The .copy() makes the frame independent of the one it was sliced from, so
# the column assignment below writes to this frame only.
df_valid_groups_no_outliers = df_valid_groups_no_outliers.dropna(subset=['Class']).copy()

label_encoder = LabelEncoder()
df_valid_groups_no_outliers['Class'] = label_encoder.fit_transform(df_valid_groups_no_outliers['Class'])

# Create a synthetic minority class if there's only one class
# NOTE(review): this fallback relabels copies of real rows as class 1. Any
# model or metric produced after it fires says nothing about the real data.
# It is kept for backward compatibility but now announces itself; consider
# replacing it with a hard error.
unique_classes = df_valid_groups_no_outliers['Class'].unique()
if len(unique_classes) == 1:
    print("WARNING: only one class present after preprocessing; "
          "fabricating a synthetic minority class. Results will not be meaningful.")
    # Duplicate a small portion of the existing class instances (at least one
    # row, otherwise the data would still contain a single class)
    minority_class_size = max(1, int(0.1 * len(df_valid_groups_no_outliers)))
    synthetic_minority = df_valid_groups_no_outliers.sample(n=minority_class_size, random_state=42)
    synthetic_minority['Class'] = 1  # Assign the synthetic minority class label
    df_valid_groups_no_outliers = pd.concat([df_valid_groups_no_outliers, synthetic_minority], ignore_index=True)
    # The duplicated rows keep their Info_cluster, so they land in the same group as their source rows
    df_valid_groups_no_outliers['group_ids'] = df_valid_groups_no_outliers.groupby('Info_cluster').ngroup().values

# Data Preprocessing
# Handling missing values and ensuring numeric columns
# Drop the Info_* identifier/bookkeeping columns
non_numeric_columns = ['Info_PepID', 'Info_organism_id', 'Info_protein_id', 'Info_pos', 'Info_AA', 'Info_epitope_id', 'Info_nPos', 'Info_nNeg']

# The label and the grouping keys are not predictors. Keeping 'Class' in the
# imputer input would let the target shape the imputed feature values, and
# the cluster ids would enter PCA as if they were measurements.
non_feature_columns = ['Class', 'group_ids', 'Info_cluster']
df_numeric = (
    df_valid_groups_no_outliers
    .drop(columns=non_numeric_columns)
    .drop(columns=non_feature_columns, errors='ignore')
)

# Convert non-numeric columns to numeric (if possible); unparseable cells become NaN
df_numeric = df_numeric.apply(pd.to_numeric, errors='coerce')

# A column with no usable value at all cannot be imputed; the imputer would
# drop it from its output and the column names below would no longer line up.
df_numeric = df_numeric.dropna(axis=1, how='all')

# Impute missing values (using IterativeImputer)
# NOTE(review): the imputer and PCA are fitted on all rows before the
# cross-validation split, so test folds influence the transformation.
imputer = IterativeImputer(random_state=42)
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

# Define X_imputed after imputation (df_imputed already holds features only)
X_imputed = df_imputed

# Dimensionality reduction using PCA: keep as many components as needed to
# explain 95% of the variance
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_imputed)

# Check for missing values in the 'group_ids' column
has_group_id = df_valid_groups_no_outliers['group_ids'].notna().to_numpy()
missing_group_ids = int((~has_group_id).sum())

# Drop rows with missing values in the 'group_ids' column
if missing_group_ids > 0:
    print("Dropping rows with missing 'group_ids'...")
    # Apply the SAME row mask to the feature matrix and to the frame, so that
    # row i of X_reduced keeps describing row i of the frame. Truncating
    # X_reduced from the end would only be correct if every dropped row
    # happened to sit at the bottom.
    X_reduced = X_reduced[has_group_id]
    df_valid_groups_no_outliers = df_valid_groups_no_outliers.loc[has_group_id]
    print("Rows with missing 'group_ids' dropped.")
else:
    print("No missing values found in 'group_ids' column.")

# Ensure consistent lengths for GroupKFold.split(): a mismatch means features
# and labels are out of sync, which trimming would hide rather than repair
if len(X_reduced) != len(df_valid_groups_no_outliers):
    raise ValueError(
        f"X_reduced has {len(X_reduced)} rows but the labelled frame has "
        f"{len(df_valid_groups_no_outliers)}; recompute X_reduced from the current frame."
    )

# Labels and fold-group ids, looked up once instead of on every fold
y_all = df_valid_groups_no_outliers['Class']
groups_all = df_valid_groups_no_outliers['group_ids']
class_labels = np.sort(y_all.unique())

# Initialize GroupKFold with the correct number of groups: it cannot build
# more folds than there are distinct groups, so the fold count is capped
n_groups = groups_all.nunique()
if n_groups < 2:
    raise ValueError(f"GroupKFold needs at least 2 distinct groups, found {n_groups}.")
group_kfold = GroupKFold(n_splits=min(5, n_groups))

# Define classifiers (seeded like the imputers so that reruns give the same scores)
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Report accuracy for each classifier
for name, clf in classifiers.items():
    balanced_accuracies = []
    for train_index, test_index in group_kfold.split(X_reduced, y_all, groups=groups_all):
        X_train, X_test = X_reduced[train_index], X_reduced[test_index]
        y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

        # Apply SMOTE within each fold, on the training part only, so that no
        # synthetic sample is derived from test rows
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        # Train the model (fit() retrains from scratch on every call)
        clf.fit(X_train_resampled, y_train_resampled)

        # Predictions
        y_pred = clf.predict(X_test)

        # Evaluate the model
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_accuracies.append(balanced_accuracy)

        # Visualize confusion matrix; fixing `labels` keeps the matrix the same
        # shape in every fold, even when a fold's test part lacks a class
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
        plt.title(f'{name} Classifier Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    # Report performance
    print(f'Classifier: {name}')
    print(f'Mean Balanced Accuracy: {np.mean(balanced_accuracies)}')

# Load the holdout dataset
holdout_file_path = "C:\\Users\\mjbab\\OneDrive\\Desktop\\df_reduced_holdout.csv"
holdout_df = pd.read_csv(holdout_file_path, delimiter=";", low_memory=False)

# Create 'group_ids' column for holdout dataset based on the same logic as training dataset
holdout_df['group_ids'] = holdout_df.groupby('Info_cluster').ngroup()
holdout_group_ids = holdout_df['group_ids'].unique()

# The classifier was trained on features produced by the imputer and the PCA
# fitted on the training data. The holdout rows must pass through those SAME
# fitted objects: re-fitting them (per group, or with a different number of
# components) yields features in a different space than the model expects.

# Columns the training imputer was fitted on, in the same order. A column
# absent from the holdout file comes out as all-NaN and is filled by the imputer.
imputer_columns = list(df_numeric.columns)
holdout_numeric = holdout_df.reindex(columns=imputer_columns).apply(pd.to_numeric, errors='coerce')

# Impute missing values with the imputer fitted on the training data
holdout_imputed = pd.DataFrame(
    imputer.transform(holdout_numeric),
    columns=imputer_columns,
    index=holdout_df.index,
)

# Project onto the principal components learned from the training data
holdout_X_reduced = pca.transform(holdout_imputed[list(X_imputed.columns)])
print("Shape of holdout_X_reduced:", holdout_X_reduced.shape)  # Debugging output

# Make predictions using the trained classifier (clf)
# NOTE(review): `clf` is whatever the cross-validation loop left behind, i.e.
# the last classifier as fitted on the last fold's training part. Predictions
# are the encoded class ids produced by `label_encoder`.
holdout_predictions = clf.predict(holdout_X_reduced)

# Store each prediction along with the index label of its holdout row
holdout_group_predictions = list(zip(holdout_df.index, holdout_predictions))

# Turn the (row label, prediction) pairs into a two-column table
holdout_predictions_df = pd.DataFrame(holdout_group_predictions, columns=['Index', 'Predicted_Class'])

# Left-join the predictions onto the holdout rows via the row label, then
# discard the helper key column that was only needed for the join
holdout_predictions_merged = pd.merge(
    holdout_df,
    holdout_predictions_df,
    how='left',
    left_index=True,
    right_on='Index',
).drop(columns='Index')

# Write the holdout rows plus their predicted class to a CSV file on the desktop
desktop_path = "C:\\Users\\mjbab\\OneDrive\\Desktop\\holdout_predictions_supervised.csv"
holdout_predictions_merged.to_csv(desktop_path, index=False)