<a href="https://colab.research.google.com/github/Guillermo-rv/ML/blob/main/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Session 4

## Exercise 1

In [None]:
# Step 1: load the raw La Liga player statistics
df_liga = pd.read_csv("laliga_player_stats.csv")

# Step 2: find every column whose name contains "Goals" or "Penalties"
# (case-sensitive) and drop them all from the feature candidates.
goals_mask = df_liga.columns.str.contains("Goals", case=True)
penalties_mask = df_liga.columns.str.contains("Penalties", case=True)
lst_cols_remove_penalties = df_liga.columns[penalties_mask]
lst_cols_remove = df_liga.columns[goals_mask].append(lst_cols_remove_penalties)
X = df_liga.drop(columns=lst_cols_remove)

# Step 3: build X and y
# Strip the "%" signs, skip the first 4 columns, then convert every remaining column to numeric.
X = (
    X.replace("%", "", regex=True)
     .iloc[:, 4:]
     .apply(pd.to_numeric)
)
y = df_liga['Goals scored'] # The variable we want to predict

# Step 4: hold out a test set (fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Step 5: standardise the features; the scaler is fitted on the training rows only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def adjusted_r2_score(y_true, y_pred, n_features):
  """Return the adjusted R2 of a set of predictions.

  Adjusted R2 = 1 - (1 - R2) * (N - 1) / (N - p - 1), where N is the number
  of samples and p the number of features the model was trained on.

  Parameters
  ----------
  y_true : sized array-like (list, ndarray, Series, ...)
      Observed target values.
  y_pred : array-like
      Predicted target values, same length as ``y_true``.
  n_features : int
      Number of columns of X (X_train.shape[1] // X_test.shape[1]).

  Returns
  -------
  float
      The adjusted R2 (the best result is 1.0).

  Raises
  ------
  ValueError
      If N - p - 1 <= 0: the correction factor is then undefined (division
      by zero) or changes sign, so no meaningful value can be returned.
  """
  n_samples = len(y_true)  # len() also works for plain lists, unlike .shape[0]
  dof = n_samples - n_features - 1
  if dof <= 0:
    raise ValueError(
        "adjusted R2 needs more samples than features + 1 "
        "(got %d samples and %d features)" % (n_samples, n_features))
  r2 = r2_score(y_true, y_pred)
  return 1 - (((1 - r2) * (n_samples - 1)) / dof)

def print_errors(y_true, y_pred, n_features):
  """Print R2, adjusted R2, MSE and MAE for a set of predictions.

  n_features is the number of columns of X (X_train.shape[1]); it is only
  needed for the adjusted R2. The best value is 1.0 for both R2 scores and
  0.0 for both error measures.
  """
  # Each metric is wrapped in a lambda so it is computed right before its own
  # line is printed, one after the other.
  report = (
      ("R2", lambda: r2_score(y_true, y_pred)),
      ("Adjusted R2", lambda: adjusted_r2_score(y_true, y_pred, n_features)),
      ("MSE", lambda: mean_squared_error(y_true, y_pred)),
      ("MAE", lambda: mean_absolute_error(y_true, y_pred)),
  )
  for label, compute in report:
    print(label + ": " + str(compute()))

## Exercise 2

In [None]:
from sklearn.linear_model import Lasso

# Lasso works as a feature selector here: it keeps only some of the columns of X
# by driving the coefficients of the others to zero.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)

# Predictions for both splits
y_train_pred = lasso.predict(X_train_scaled)
y_test_pred = lasso.predict(X_test_scaled)

# Metrics: training first, then test
print("Training metrics")
print_errors(y_train, y_train_pred, X_train_scaled.shape[1])
print("\nTest metrics")
print_errors(y_test, y_test_pred, X_test_scaled.shape[1]) # same column count as X_train_scaled

Training metrics
R2: 0.8954385736399293
Adjusted R2: 0.882439044957326
MSE: 1.270757252850962
MAE: 0.6481259414131052

Test metrics
R2: 0.8767823948678781
Adjusted R2: 0.8151735923018172
MSE: 1.5397449483923829
MAE: 0.7587273966613549


In [None]:
# The Lasso coefficients tell us how much each column matters and which columns
# can be removed (coefficient exactly 0.0).
# One row per feature: column 0 holds the feature name, 'coefs' its Lasso coefficient.
X_cols = pd.DataFrame({0: X.columns, 'coefs': lasso.coef_})
X_cols.loc[X_cols['coefs'].ne(0.0)] # Keep only the features Lasso did not zero out

In [None]:
# Another way of selecting the columns using Lasso
# Another way of selecting the columns using Lasso
from sklearn.feature_selection import SelectFromModel

# prefit=True because `lasso` has already been fitted; with prefit=False the
# selector itself would have to be fitted on X_train_scaled, y_train.
selectModel = SelectFromModel(lasso, prefit=True)
X_train_scaled_lasso, X_test_scaled_lasso = (
    selectModel.transform(split) for split in (X_train_scaled, X_test_scaled)
)

# Names of the columns the selector keeps
print(selectModel.get_feature_names_out(X.columns))

['Games where substituted' 'Unsuccessful aerial challenges' 'Offsides'
 'Shots on target' 'From inside the area' 'From outside the area'
 'Crosses']


## Exercise 3

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# --- Model 1: default KNN trained on every scaled column ---
print("WITH ALL COLUMNS")
knn_v1 = KNeighborsRegressor().fit(X_train_scaled, y_train)
n_cols_all = X_train_scaled.shape[1]
print("\nTraining metrics")
y_train_pred_knn = knn_v1.predict(X_train_scaled)
print_errors(y_train, y_train_pred_knn, n_cols_all)
print("\nTest metrics")
y_test_pred_knn = knn_v1.predict(X_test_scaled)
print_errors(y_test, y_test_pred_knn, n_cols_all)

# --- Model 2: default KNN trained only on the columns kept by Lasso ---
print("\n\nLASSO COLUMNS")
knn_v2 = KNeighborsRegressor().fit(X_train_scaled_lasso, y_train)
n_cols_lasso = X_train_scaled_lasso.shape[1] # fewer columns now, which matters for the adjusted R2
print("\nTraining metrics")
y_train_pred_knn2 = knn_v2.predict(X_train_scaled_lasso)
print_errors(y_train, y_train_pred_knn2, n_cols_lasso)
print("\nTest metrics")
y_test_pred_knn2 = knn_v2.predict(X_test_scaled_lasso)
print_errors(y_test, y_test_pred_knn2, n_cols_lasso)

WITH ALL COLUMNS

Training metrics
R2: 0.8164679862395957
Adjusted R2: 0.7936504926369509
MSE: 2.230503597122302
MAE: 0.6796163069544364

Test metrics
R2: 0.7968869854786736
Adjusted R2: 0.6953304782180105
MSE: 2.5381294964028775
MAE: 0.9151079136690649


LASSO COLUMNS

Training metrics
R2: 0.8190962991704955
Adjusted R2: 0.8160001478115554
MSE: 2.198561151079137
MAE: 0.6148681055155875

Test metrics
R2: 0.7924194203066626
Adjusted R2: 0.7813273282619804
MSE: 2.5939568345323747
MAE: 0.8201438848920864


In [None]:
# Summary statistics of the regression target (`y` is the 'Goals scored' column
# at this point), to put the size of the MSE/MAE values above into context.
y.describe()

count    556.000000
mean       1.696043
std        3.503828
min        0.000000
25%        0.000000
50%        0.000000
75%        2.000000
max       36.000000
Name: Goals scored, dtype: float64

# Session 5

## Exercise 1

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Values explored for the hyper-parameters that both kernels use.
shared_params = {
    'C': [0.001, 0.01, 0.1, 1],
    'epsilon': [0.01, 0.1, 1]
}

# `gamma` only affects the rbf kernel. Searching it together with the linear
# kernel would fit every linear candidate once per gamma value with identical
# results, so the grid is split in two: gamma is explored for rbf only.
my_params = [
    {'kernel': ['linear'], **shared_params},
    {'kernel': ['rbf'], 'gamma': [0.01, 0.1, 1], **shared_params},
]

svr = SVR()

# No `scoring` is given, so the estimator's default score is used (R2 for regressors).
clf = GridSearchCV(svr, my_params)
clf.fit(X_train_scaled, y_train)

In [None]:
# Report the winning hyper-parameters, then evaluate the refitted best model on both splits.
n_cols_svr = X_train_scaled.shape[1]
print("Best params:", clf.best_params_)

print("TRAINING METRICS")
y_train_pred = clf.predict(X_train_scaled)
print_errors(y_train, y_train_pred, n_cols_svr)

print("TEST METRICS")
y_test_pred = clf.predict(X_test_scaled)
print_errors(y_test, y_test_pred, n_cols_svr)

# Benchmark algorithms!!

Best params: {'C': 0.1, 'epsilon': 1, 'gamma': 0.01, 'kernel': 'linear'}
TRAINING METRICS
R2: 0.891015792163769
Adjusted R2: 0.8774664041625079
MSE: 1.3245082567752142
MAE: 0.6310345410157989
TEST METRICS
R2: 0.8529989213134939
Adjusted R2: 0.7794983819702408
MSE: 1.836946660934354
MAE: 0.7859985737269664


## Exercise 2

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.neural_network import MLPRegressor

# Candidate architectures; each entry lists the number of neurons per hidden layer.
my_grid_mlp = {
    'hidden_layer_sizes': [
        [50, 25],  # two hidden layers: 50 neurons, then 25 neurons
        [25, 10],  # two hidden layers: 25 neurons, then 10 neurons
        [50],      # a single hidden layer with 50 neurons
    ]
}

# Settings shared by every candidate network (seeded so runs are repeatable).
mlp_settings = dict(
    max_iter=500,
    learning_rate="adaptive",
    early_stopping=True,
    tol=0.00001,
    alpha=0.001,
    random_state=42,
)
mlp = MLPRegressor(**mlp_settings)

clf_mlp = GridSearchCV(mlp, my_grid_mlp)
clf_mlp.fit(X_train_scaled, y_train)

In [None]:
# Report the winning architecture, then evaluate the refitted best network on both splits.
n_cols_mlp = X_train_scaled.shape[1]
print("Best params:", clf_mlp.best_params_)

print("TRAINING METRICS")
y_train_pred = clf_mlp.predict(X_train_scaled)
print_errors(y_train, y_train_pred, n_cols_mlp)

print("TEST METRICS")
y_test_pred = clf_mlp.predict(X_test_scaled)
print_errors(y_test, y_test_pred, n_cols_mlp)

In [None]:
# Try different parameters -> set a random_state
# MLP: use more specific neural networks libraries (pytorch, tensorflow,...)

In [None]:
# SAVING A MODEL
import joblib

In [None]:
# Persist the whole fitted GridSearchCV object (not only its best estimator) to
# the working directory. joblib.dump returns the list of files written.
joblib.dump(clf_mlp, "my_mlp_model.pkl") # Save the model as .pkl

['my_mlp_model.pkl']

In [None]:
# Reload the object saved above. joblib files are pickle-based, so only load
# files you created yourself or otherwise trust.
my_saved_model = joblib.load("my_mlp_model.pkl") # Load the model

In [None]:
# Sanity check: evaluate the reloaded model on the test split, using the same
# metric helper as for the in-memory clf_mlp.
print("TEST METRICS")
y_test_pred = my_saved_model.predict(X_test_scaled)
print_errors(y_test, y_test_pred, X_train_scaled.shape[1])

# Session 6

In [None]:
# Load the insurance dataset; insurance.csv is expected in the working directory.
# Later cells use its 'charges' column as target and encode 'sex', 'smoker' and 'region'.
df_insurance = pd.read_csv("insurance.csv")

In [None]:
# In sklearn, tree-based models do not (for the moment) work with categorical columns directly.
# In theory, tree-based models can handle these types of columns perfectly well.
# In the H2O library (Python) you can use categorical columns in tree-based models. [Consider it if you have a lot of categories]

In [None]:
# You can convert categories to 0/1 columns using pd.get_dummies (pandas) or OneHotEncoder (sklearn)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Features: every column except the target. Target: the 'charges' column.
# NOTE: this rebinds X and y, which held the La Liga data in the previous sessions.
X = df_insurance.drop(columns="charges")
y = df_insurance['charges']

In [None]:
cols_toEncoder = ['sex', 'smoker', 'region'] # categorical columns to one-hot encode
cols_toScale = ['age']                       # only needed if the scaler step below is enabled

# A single step: OneHotEncoder applied only to cols_toEncoder.
# A ('my_scaler', StandardScaler(), cols_toScale) step could be appended to the list,
# but be careful: a scaler must be fitted with train and then transform train and test.
onehot_step = ('my_onehotenc', OneHotEncoder(), cols_toEncoder)
cols_transformer = ColumnTransformer(
    transformers=[onehot_step],
    remainder="passthrough",  # columns not named in a step are passed through as they are
)
X_transformed = cols_transformer.fit_transform(X)
#X_transformed.shape # 3 for regions, 1 new for sex, 1 new for smoker

In [None]:
# Add the columns names
# Add the column names back onto the transformed matrix.
# Look the fitted encoder up by the name it was registered under, instead of by
# its position in the transformer list.
onehot_encoder = cols_transformer.named_transformers_['my_onehotenc']
cols_withTransformer = list(onehot_encoder.get_feature_names_out(cols_toEncoder)) # Columns after the OneHotEncoder

# Columns in the remainder of the ColumnTransformer: every column of X that was
# not encoded. They are derived from X (keeping its column order) rather than
# hard-coded, so the names stay correct if the input columns change.
cols_withoutTransformer = [col for col in X.columns if col not in cols_toEncoder]

# The encoder output comes first and the passthrough columns last.
cols_newDataFrame = cols_withTransformer + cols_withoutTransformer
X_transformed_df = pd.DataFrame(X_transformed, columns=cols_newDataFrame) # Creating the dataframe with the column names

In [None]:
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, random_state=42)
# Recommended order of steps:
# Split X and y
# Split training and test: X_train, X_test, y_train, y_test
# Preprocessing (Scaler!!!!, OneHotEncoder - depends)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A deliberately small tree: at most 5 leaves.
dt = DecisionTreeRegressor(max_leaf_nodes=5).fit(X_train, y_train)
n_cols_tree = X_train.shape[1]

print("Metrics TRAINING")
y_train_pred = dt.predict(X_train)
print_errors(y_train, y_train_pred, n_cols_tree)

print("\nMetrics TEST")
y_test_pred = dt.predict(X_test)
print_errors(y_test, y_test_pred, n_cols_tree)

Metrics TRAINING
R2: 0.8350750496869388
Adjusted R2: 0.8332443993807394
MSE: 24046403.585564
MAE: 3208.576531636226

Metrics TEST
R2: 0.8358829489424062
Adjusted R2: 0.8302938233645935
MSE: 24413669.209773634
MAE: 3189.9851251496157


In [None]:
# Count how often each distinct predicted value occurs. The tree above was built
# with max_leaf_nodes=5, so it can produce at most five different predictions.
pd.DataFrame(y_test_pred).value_counts()

5345.116045     158
12359.521230    107
21412.018360     31
44384.138093     27
36068.060530     12
Name: count, dtype: int64

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
# Mean absolute percentage error of the test predictions, multiplied by 100 so
# it reads as a percentage.
mean_absolute_percentage_error(y_test, y_test_pred)*100

45.24718827697121

In [None]:
!pip install import-ipynb
import import_ipynb
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from drive.MyDrive.Colab_Notebooks.IntakeFebruary2024_ML_exercises.s00_useful_functions_supervised import *

In [None]:
# Helper that is not defined in this notebook; presumably it comes from the
# star import of s00_useful_functions_supervised and plots the tree's feature
# importances labelled with the DataFrame's column names (verify in that notebook).
plot_feature_importances_tree(dt, X_transformed_df)

In [None]:
# Helper that is not defined in this notebook; presumably it comes from the
# star import of s00_useful_functions_supervised (verify in that notebook).
plot_decision_tree_regression(dt, X_transformed_df.columns) # Getting the plot of the tree

# Session 7: Ensembles

In [None]:
# SCALING
X_train_df = pd.DataFrame(X_train, columns=cols_newDataFrame)
X_test_df = pd.DataFrame(X_test, columns=cols_newDataFrame)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_df[['age', 'bmi', 'children']])

In [None]:
# Transforming with the scaling
X_train_scaled_nums = scaler.transform(X_train_df[['age','bmi','children']])
X_test_scaled_nums = scaler.transform(X_test_df[['age','bmi','children']])

# Dropping the columns that we scaled
X_train_df = X_train_df.drop(columns=['age','bmi','children'])
X_test_df = X_test_df.drop(columns=['age','bmi','children'])

# Creating a new DataFrame with the columns that we scaled
X_train_scaled_nums = pd.DataFrame(X_train_scaled_nums, columns=['age','bmi','children'])
X_test_scaled_nums = pd.DataFrame(X_test_scaled_nums, columns=['age','bmi','children'])

# Adding the columns that we scaled to the DataFrame
X_train_df[['age', 'bmi', 'children']] = X_train_scaled_nums
X_test_df[['age', 'bmi', 'children']] = X_test_scaled_nums

In [None]:
# Import the classes (LR, SVM, KNN)
# Classes for the base models (LR, SVM, KNN) and for the ensemble
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor

# Base models with default parameters. The optimal way is to first find the best
# parameters of each estimator individually (GridSearchCV) and use those here.
reg1 = LinearRegression()
reg2 = SVR() # e.g. SVR(kernel="linear"); change the parameters if you want
reg3 = KNeighborsRegressor()

named_estimators = [('my_linearregressor', reg1),
                    ('my_svr', reg2),
                    ('my_knn', reg3)]
estimator_weights = [0.70, 0.10, 0.20] # The LR has the most impact on the ensemble's predictions

voting = VotingRegressor(estimators=named_estimators, weights=estimator_weights)
voting.fit(X_train_df, y_train) # Only the VotingRegressor is fitted (not the individual estimators)

n_cols_voting = X_train_df.shape[1] # number of columns of X, needed for the adjusted R2
y_train_voting = voting.predict(X_train_df)
print("Errors TRAINING")
print_errors(y_train, y_train_voting, n_cols_voting)
print("Errors TEST")
y_test_voting = voting.predict(X_test_df)
print_errors(y_test, y_test_voting, n_cols_voting)

Errors TRAINING
R2: 0.7724978854186896
Adjusted R2: 0.7699726349036599
MSE: 33170285.35347191
MAE: 3863.0254185885206
Errors TEST
R2: 0.7627796698955704
Adjusted R2: 0.7547009589632214
MSE: 35288342.26353762
MAE: 4035.0686721912184


In [None]:
# Stacking Regressor --> you need a final estimator (model)
# Stacking Regressor --> you need a final estimator (model) that learns how to
# combine the predictions of the base estimators.
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
# SAME AS WITH VOTINGREGRESSOR: obtain the best parameters for each individual
# model with GridSearchCV before creating the stacking regressor.

stacking = StackingRegressor(estimators = [('lr', reg1), ('svm', reg2), ('knn', reg3)],
                  # Seeded so the random forest, and the metrics below, are reproducible.
                  final_estimator = RandomForestRegressor(random_state=42))

stacking.fit(X_train_df, y_train) # We fit only the StackingRegressor (not the individual estimators)
y_train_stacking = stacking.predict(X_train_df) # Prediction of the training
print("Errors TRAINING")
print_errors(y_train, y_train_stacking, X_train_df.shape[1])
print("Errors TEST")
y_test_stacking = stacking.predict(X_test_df)
print_errors(y_test, y_test_stacking, X_train_df.shape[1]) # last parameter is the # of columns of our X (we need them for adjusted R2)

Errors TRAINING
R2: 0.794486083546255
Adjusted R2: 0.7922048998116524
MSE: 29964359.959581345
MAE: 3462.657253785224
Errors TEST
R2: 0.7815582086697382
Adjusted R2: 0.7741190145377479
MSE: 32494890.693934686
MAE: 3791.9240784011936


In [None]:
# NOTE(review): scratch values that are not referenced anywhere else in the
# visible notebook; presumably example predictions of the three base regressors
# used to illustrate how an ensemble combines them. Confirm, or remove the cell.
y_pred_reg1 = 10
y_pred_reg2 = 13
y_pred_reg3 = 14



# Session 8

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [None]:
# Step 1: Read data
df_liga = pd.read_csv("laliga_player_stats.csv")

# Step 2: Create the label (value to predict/classify)
df_liga['categories_goals'] = np.where(df_liga['Goals scored']>=10, 'cat1', 'cat0')
#print("Unbalance in the target:" + str(df_liga['categories_goals'].value_counts()))
print("\nPercentage of cat1 in the dataset: " + str((df_liga['categories_goals'].value_counts()['cat1']*100)/len(df_liga)))

# Step 3: Divide X and y
X = df_liga.drop(columns=['categories_goals'])
X = X[['Minutes played', 'Percentage of games played', 'Percentage of games where substituted', 'Passes', 'Assists', 'Shots', 'Crosses']].replace("%", "", regex=True)
X[['Percentage of games played', 'Percentage of games where substituted']] = X[['Percentage of games played', 'Percentage of games where substituted']].apply(pd.to_numeric, errors='coerce')

y = df_liga[['categories_goals']]

#  Step 4: Split in training/test considering the categories (stratifying)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25, stratify=y)
print("\nPercentage of cat1 in training set: " + str((y_train.value_counts()['cat1']*100)/len(y_train)))
print("\nPercentage cat1 in test set: " + str((y_test.value_counts()['cat1']*100)/len(y_test)))


Percentage of cat1 in the dataset: 3.776978417266187

Percentage of cat1 in training set: 3.8560411311053984

Percentage cat1 in test set: 3.592814371257485


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def print_resultados(y_actual, y_pred):
  """Print the accuracy and the balanced accuracy of a set of class predictions.

  Each line is indented with a tab so it nests under the heading printed by the caller.
  """
  print("\tAccuracy score:", accuracy_score(y_actual, y_pred))
  print("\tBalanced Accuracy score:", balanced_accuracy_score(y_actual, y_pred))

In [None]:
# PIPELINE with LogReg with default parameters
print("**\tLOGREG with DEFAULT parameters\t**")
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

pipeline_logreg_base = Pipeline([#('scaler', StandardScaler()),  # 1º
                                #("lg", LogisticRegression(class_weight="balanced"))]) # 2º
                                # ('svc', SVC(class_weight="balanced"))])
                                ('my_rf', RandomForestClassifier(class_weight="balanced"))])

pipeline_logreg_base.fit(X_train, y_train['categories_goals'])

print("\nTraining results:")
print_resultados(y_train, pipeline_logreg_base.predict(X_train))

print("\nTest results:")
print_resultados(y_test, pipeline_logreg_base.predict(X_test))

**	LOGREG with DEFAULT parameters	**

Training results:
	Accuracy score: 1.0
	Balanced Accuracy score: 1.0

Test results:
	Accuracy score: 0.9580838323353293
	Balanced Accuracy score: 0.5771221532091098


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the training set: rows are the true classes, columns the
# predicted classes.
confusion_matrix(y_train, pipeline_logreg_base.predict(X_train))

array([[358,  16],
       [  0,  15]])

In [None]:
# Same confusion matrix for the test set (rows: true classes, columns: predicted classes).
confusion_matrix(y_test, pipeline_logreg_base.predict(X_test))

array([[156,   5],
       [  0,   6]])

## EXTRA

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

In [None]:
# imblearn pipeline: under-sample first, then train the forest on the resampled
# data. Both steps are seeded so the sampled rows, the forest and therefore the
# reported scores are reproducible.
my_pipeline = make_pipeline(
                RandomUnderSampler(random_state=42),  # Under-sampler strategy (you can use also an over-sampler strategy)
                RandomForestClassifier(random_state=42)
            )

In [None]:
# Fit the under-sampling pipeline on the training set and predict both splits.
# NOTE: later cells add columns to X_test in place; re-running this cell after
# them passes X_test with columns the model was not trained on.
my_pipeline.fit(X_train, y_train['categories_goals'])
y_train_preds = my_pipeline.predict(X_train)
y_test_preds = my_pipeline.predict(X_test)

In [None]:
# Attach the predictions to the test features for side-by-side inspection.
# NOTE: this mutates X_test in place, so from here on X_test has a column that
# X_train lacks; any model that is later given X_test must select the original
# feature columns first.
X_test['predictions'] =  y_test_preds

In [None]:
# Attach the true labels as well (y_test is a one-column DataFrame).
# NOTE: this also mutates X_test in place, adding a non-feature column.
X_test['real'] = y_test

In [None]:
# Accuracy and balanced accuracy of the under-sampling pipeline on both splits.
print("\nTRAINING")
print_resultados(y_train, y_train_preds)
print("\nTEST")
print_resultados(y_test, y_test_preds)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Earlier cells add 'predictions' and 'real' columns to X_test in place, so
# X_test no longer matches the columns the model is trained on. Select exactly
# the training feature columns before fitting and predicting.
model_features = list(X_train.columns)

balanced_rf = BalancedRandomForestClassifier(random_state=42) # seeded for reproducible results
balanced_rf.fit(X_train[model_features], y_train['categories_goals'])

y_train_preds = balanced_rf.predict(X_train[model_features])
y_test_preds = balanced_rf.predict(X_test[model_features])

print("\nTRAINING")
print_resultados(y_train, y_train_preds)
print("\nTEST")
print_resultados(y_test, y_test_preds)