In [1]:

import numpy as np
import pandas as pd
import missingno as msno 
from matplotlib.pyplot import subplots
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import OLSInfluence
from patsy import dmatrices
from sklearn.model_selection import GridSearchCV
from scipy import stats
import statsmodels.api as sm
from ISLP import confusion_table
from statsmodels.stats.outliers_influence \
     import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
import seaborn as sns
from statsmodels.stats.stattools import durbin_watson
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)

# 🛠️ 1-Data Preparation 

In [2]:

train_data = pd.read_csv('training_data/training_data.csv' , sep=";")
test_data = pd.read_csv('test_data_no_target/test_data_no_target.csv', sep=";")
train_data.info

# Filter columns that exceed the threshold
missing_values_count=train_data.isnull().sum()
columns_exceeding_threshold = missing_values_count[missing_values_count > 100]


columns_to_drop = missing_values_count[missing_values_count > 100].index
train_data_cleaned = train_data.drop(columns=columns_to_drop)
test_data_cleaned = test_data.drop(columns=columns_to_drop)
train_data_cleaned.drop('Group', axis=1, inplace=True)
test_data_cleaned.drop('Group', axis=1, inplace=True)


def convert_decimal_separator(df):
    # Iterate through each column in the DataFrame
    for column in df.columns:
        # Replace commas with periods in the data
        df[column] = df[column].astype(str).str.replace(',', '.')
        
        # Convert the column to numeric type
        df[column] = pd.to_numeric(df[column], errors='coerce')

# Call the function to convert decimal separator in X
convert_decimal_separator(train_data_cleaned)

# Call the function to convert decimal separator in X
convert_decimal_separator(test_data_cleaned)


#KNNImputer for train data

# Display missing values by column
missing_values_train = train_data_cleaned.isnull().sum().sort_values(ascending=False)


# Initialize the KNNImputer with the 'nan_euclidean' metric
knn_imputer = KNNImputer(n_neighbors=5, metric='nan_euclidean')

# Fit and transform the data using KNN imputation
# Apply KNN imputation to the numeric columns (float64 or int64)
numeric_columns = train_data_cleaned.select_dtypes(include=['float64'])
imputed_data = knn_imputer.fit_transform(numeric_columns)

# Replace the original numeric columns in the DataFrame with the imputed data
train_data_cleaned[numeric_columns.columns] = imputed_data

# Check which columns still have missing values (this should be zero)
missing_values_after_fill = train_data_cleaned.isnull().sum().sum()



#KNNImputer for test data

# Display missing values by column
missing_values_test = test_data_cleaned.isnull().sum().sort_values(ascending=False)


# Initialize the KNNImputer with the 'nan_euclidean' metric
knn_imputer = KNNImputer(n_neighbors=5, metric='nan_euclidean')

# Fit and transform the data using KNN imputation
# Apply KNN imputation to the numeric columns (float64 or int64)
numeric_columns = test_data_cleaned.select_dtypes(include=['float64'])
imputed_data = knn_imputer.fit_transform(numeric_columns)

# Replace the original numeric columns in the DataFrame with the imputed data
test_data_cleaned[numeric_columns.columns] = imputed_data

# Check which columns still have missing values (this should be zero)
missing_values_after_fill_test = test_data_cleaned.isnull().sum().sum()


missing_train_values_later = train_data_cleaned.isnull().sum()

missing_test_values_later = test_data_cleaned.isnull().sum()

# Data Preparation - Train Data without Group, Perform and Class Columns
target= train_data_cleaned['Perform']
classes= train_data_cleaned['Class']
train_data_cleaned.drop(['Class','Perform'], axis=1, inplace=True)


# 🧹 2- Feature Selection- LASSO

In [3]:
# Store column names before splitting and scaling
original_columns = train_data_cleaned.columns

# Split the data
X_train, X_test, y_train, y_test = train_test_split(train_data_cleaned, target, test_size=0.2, random_state=42)

# Scale your features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create DataFrames with scaled features
X_train_df = pd.DataFrame(X_train_scaled, columns=original_columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=original_columns)

# Now you can access column names from X_train_df, X_test_df, and y_train
print("Column names of X_train_df:")
print(X_train_df.columns)

# Instantiate LassoCV model
lasso_cv = LassoCV(cv=5)  # Use 5-fold cross-validation to find the optimal alpha

# Fit the model
lasso_cv.fit(X_train_scaled, y_train)

# Extract important features
important_features_indices = np.where(lasso_cv.coef_ != 0)[0]
important_features = train_data_cleaned.columns[important_features_indices]

print("Important features selected by Lasso:", important_features)


Column names of X_train_df:
Index(['I1', 'I2', 'I3', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I19',
       'I20', 'I25', 'I28', 'I29', 'I30', 'I31', 'I33', 'I34', 'I35', 'I36',
       'I37', 'I38', 'I39', 'I40', 'I41', 'I42', 'I43', 'I47', 'I53', 'I54',
       'I56', 'dI1', 'dI2', 'dI3', 'dI5', 'dI6', 'dI7', 'dI8', 'dI9', 'dI10',
       'dI11', 'dI19', 'dI20', 'dI25', 'dI29', 'dI30', 'dI31', 'dI33', 'dI34',
       'dI35', 'dI36', 'dI37', 'dI39', 'dI40', 'dI41', 'dI42', 'dI47', 'dI53',
       'dI54', 'dI56'],
      dtype='object')


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Important features selected by Lasso: Index(['I6', 'I9', 'I28', 'I37', 'I41', 'I43', 'I47', 'dI6', 'dI9', 'dI25',
       'dI29', 'dI47'],
      dtype='object')


In [4]:
# Get the optimal alpha value
optimal_alpha = lasso_cv.alpha_
print("Optimal alpha value selected by LassoCV:", optimal_alpha)


Optimal alpha value selected by LassoCV: 0.0028975295824779455


In [5]:
from sklearn.linear_model import ElasticNetCV
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Assuming train_data_cleaned and target are already defined

# Store column names before splitting and scaling
original_columns = train_data_cleaned.columns

# Split the data
X_train, X_test, y_train, y_test = train_test_split(train_data_cleaned, target, test_size=0.2, random_state=42)

# Scale your features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create DataFrames with scaled features
X_train_df = pd.DataFrame(X_train_scaled, columns=original_columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=original_columns)

# Now you can access column names from X_train_df, X_test_df, and y_train
print("Column names of X_train_df:")
print(X_train_df.columns)

# Instantiate ElasticNetCV model
elastic_net_cv = ElasticNetCV(cv=5, l1_ratio=0.5, random_state=42)  # Use 5-fold cross-validation to find the optimal alpha and l1_ratio

# Fit the model
elastic_net_cv.fit(X_train_scaled, y_train)

# Extract important features
important_features_indices = np.where(elastic_net_cv.coef_ != 0)[0]
important_features = train_data_cleaned.columns[important_features_indices]

print("Important features selected by Elastic Net:", important_features)

# Get the optimal alpha and l1_ratio values
optimal_alpha = elastic_net_cv.alpha_
optimal_l1_ratio = elastic_net_cv.l1_ratio_

print("Optimal alpha value selected by ElasticNetCV:", optimal_alpha)
print("Optimal l1_ratio value selected by ElasticNetCV:", optimal_l1_ratio)


Column names of X_train_df:
Index(['I1', 'I2', 'I3', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I19',
       'I20', 'I25', 'I28', 'I29', 'I30', 'I31', 'I33', 'I34', 'I35', 'I36',
       'I37', 'I38', 'I39', 'I40', 'I41', 'I42', 'I43', 'I47', 'I53', 'I54',
       'I56', 'dI1', 'dI2', 'dI3', 'dI5', 'dI6', 'dI7', 'dI8', 'dI9', 'dI10',
       'dI11', 'dI19', 'dI20', 'dI25', 'dI29', 'dI30', 'dI31', 'dI33', 'dI34',
       'dI35', 'dI36', 'dI37', 'dI39', 'dI40', 'dI41', 'dI42', 'dI47', 'dI53',
       'dI54', 'dI56'],
      dtype='object')


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Important features selected by Elastic Net: Index(['I6', 'I9', 'I28', 'I37', 'I41', 'I43', 'I47', 'dI6', 'dI9', 'dI25',
       'dI29', 'dI47'],
      dtype='object')
Optimal alpha value selected by ElasticNetCV: 0.005795059164955892
Optimal l1_ratio value selected by ElasticNetCV: 0.5


elasticnet results are same with lasso, so lasso results were taken

In [6]:
columns_to_select=['I6', 'I9', 'I28', 'I37', 'I41', 'I43', 'I47', 'dI6', 'dI9', 'dI25',
        'dI29', 'dI47']


# Select the columns from train_data_cleaned
new_data = train_data_cleaned[columns_to_select]
new_data_test = test_data_cleaned[columns_to_select]
# Set column names for new_data
new_data.columns = columns_to_select

# Set column names for new_data_test
new_data_test.columns = columns_to_select

# 📐 3- Feature Engineering – Polynomial Features

In [7]:
poly = PolynomialFeatures(degree=2, include_bias=False)  # Change degree as needed
new_data = poly.fit_transform(new_data)
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import pandas as pd

# Assuming `test_cleaned_data` is the cleaned test dataset and `train_data_cleaned` was the cleaned training dataset.

# Step 1: Transform the test data using PolynomialFeatures (same instance used for training data)
X_test_interactions = poly.transform(new_data_test)

# Step 2: Convert the interaction features to a DataFrame using the column names from the training data
new_data_test = pd.DataFrame(X_test_interactions, columns=poly.get_feature_names_out(new_data_test.columns))


# 🌲 4- Classification - Random Forest Model with Bagging (Bootstrap Aggregating)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import numpy as np

# Assuming new_data and classes are your data and target labels

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(new_data, classes, test_size=0.2, random_state=42)

# Initialize base Random Forest classifier
base_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize Bagging classifier with Random Forest as base estimator
bagging_rf = BaggingClassifier(base_rf, n_estimators=10, random_state=42)

# Train base Random Forest classifier
base_rf.fit(X_train, y_train)

# Train Bagging classifier
bagging_rf.fit(X_train, y_train)

# Evaluate base Random Forest classifier
base_rf_score = base_rf.score(X_test, y_test)
print("Base Random Forest Accuracy:", base_rf_score)

# Evaluate Bagging classifier
bagging_rf_score = bagging_rf.score(X_test, y_test)
print("Bagging Random Forest Accuracy:", bagging_rf_score)


Base Random Forest Accuracy: 0.46125
Bagging Random Forest Accuracy: 0.466875


In [9]:
# Assuming new_data_test is a NumPy array containing your new test data

# Make predictions on new test data using the base Random Forest classifier
base_rf_predictions = base_rf.predict(new_data_test)

# Make predictions on new test data using the Bagging classifier
bagging_rf_predictions = bagging_rf.predict(new_data_test)

# Print predictions
print("Base Random Forest predictions:", base_rf_predictions)
print("Bagging Random Forest predictions:", bagging_rf_predictions)




Base Random Forest predictions: [ 1  1 -1 ...  1  1  1]
Bagging Random Forest predictions: [ 1  1 -1 ...  1  1 -1]


# 💾 5- Save Predicitons to CSV File

In [10]:

predictions_test_model = bagging_rf_predictions.flatten()


# Creating a DataFrame with predictions and the second column
data = {
        'Predictions_Decisiontree': predictions_test_model}

# Create DataFrame
predictions_df = pd.DataFrame(data)

# Export predictions to a CSV file without a header
predictions_df.to_csv('Result.csv', index=False, header=False)