In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import functions as F

from functions import cross_validation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, RandomForestRegressor,StackingRegressor, StackingClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error,mean_squared_error,confusion_matrix, make_scorer
from sklearn.tree import DecisionTreeRegressor, export_graphviz, DecisionTreeClassifier
from functions import manual_bagging, evaluate_regression_metrics, evaluate_classification_metrics, manual_Random_Forest
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import Isomap
from sklearn.datasets import make_regression
from sklearn.svm import SVR, SVC

import configparser
config = configparser.ConfigParser()
config.read('configuration.ini')

['configuration.ini']

In [4]:
data = pd.read_csv(config['created_files']['train'],sep = ',')
X_train_p = data.drop(columns=['T3','T1','T2']).reset_index(drop=True)
y_train_p = data['T3'].reset_index(drop=True)

X_train, X_test, y_train_unstd, y_test_unstd = train_test_split(X_train_p,y_train_p,test_size=0.2)

X_train, X_test, y_train,y_test,scaler_y = F.standarize_numerical_variables(X_train, X_test, y_train_unstd, y_test_unstd,2)

data = pd.read_csv(config['created_files']['test'],sep = ',')
X_test_final = data.drop(columns=['T3']).reset_index(drop=True)
y_test_final = data['T3'].reset_index(drop=True)

print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(628, 41)
(628,)
(157, 41)
(157,)


In [9]:
base_models = [
    ('lr', LinearRegression()),
    ('lasso', LassoCV(cv=5)),
    ('RF', RandomForestRegressor(random_state=42)),
    ('svm', SVR(C=1.0, epsilon=0.2)),
]

meta_model = LinearRegression()
stack = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5, passthrough=True)
stack.fit(X_train, y_train_unstd)
y_pred = stack.predict(X_test)
print(evaluate_regression_metrics(y_test_unstd,y_pred))

{'Mean Absolute Error:': 2.66545647351469, 'Mean Squared Error:': 13.918633016824216, 'R² Score:': 0.2684198046910381}


In [15]:
base_models = [
    ('lr', LogisticRegression(max_iter=1000)), 
    ('svc', SVC(probability=True)),
    ('RF', RandomForestClassifier(random_state=42))
]

meta_model = SVC(probability=True)
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5, passthrough=True)
stack.fit(X_train, y_train_unstd)
y_pred = stack.predict(X_test)
print(y_pred)
print(evaluate_regression_metrics(y_test_unstd,y_pred))



[11 11 10 14 15 10 10 10 12 11 11 10 10 11 10 10 14 11  9 11 10 11 10 11
 11 11 11 11 11 10 11 11 14 13 11 11 10 10 10 10 11 10 10 11 10 13 11 10
 11 12 11 10 12 13 11 11 11 10 11 11 10 11 11 10 10 10 10 11  9 10 11 14
 11 12 11 11 10 14 11 11  9 11 11  8 10 11 11 10 11 15 10 14 10 14 11 13
 10 10 10 14  8 11 11 10 13 14 14 11 13 11 11 11 10  0 10 11 11 10 14 11
 10 11 11 11 10 10 11 10 10 14 14 11 10 10  8 10 11 10 11 11 10 11 10 10
 10 10 14 10 11 11 11 10 11 11 11 10 10]
{'Mean Absolute Error:': 2.8917197452229297, 'Mean Squared Error:': 16.369426751592357, 'R² Score:': 0.1396031201088369}


In [16]:
X = X_train_p.copy()
y = y_train_p.copy()

param_grid = {
    'kpca__kernel': ['rbf', 'poly', 'sigmoid','cosine'],
    'kpca__gamma': [0.01, 0.1, 1],
    'kpca__degree': [2, 3, 4],
    'kpca__eigen_solver': ['auto', 'dense', 'arpack', 'randomized'],
    'kpca__n_components': [20, 30]
}

pipe = Pipeline([
    ('scaler', StandardScaler()),  # This step safely separates train and test and standarizes using only the training data
    ('kpca', KernelPCA(n_components=2, fit_inverse_transform=False)),
    ('model', LogisticRegression())  
])

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy' if y.nunique() < 10 else 'r2')
grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

X_kpca = grid.best_estimator_.named_steps['kpca'].transform(StandardScaler().fit_transform(X))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'kpca__degree': 2, 'kpca__eigen_solver': 'auto', 'kpca__gamma': 0.01, 'kpca__kernel': 'poly', 'kpca__n_components': 30}
Best score: 0.07552166532941831


45 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\luciaa_herraiz\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\luciaa_herraiz\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\luciaa_herraiz\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\