In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import linear_model
import statsmodels.formula.api as smf
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
data = pd.read_csv('Hitters.csv')
data = pd.get_dummies(data, columns=['League','Division','NewLeague'])
target = 'Salary'
data = data[data['Salary'].notnull()]
data.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,...,PutOuts,Assists,Errors,Salary,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,315,81,7,24,38,39,14,3449,835,69,...,632,43,10,475.0,False,True,False,True,False,True
2,479,130,18,66,72,76,3,1624,457,63,...,880,82,14,480.0,True,False,False,True,True,False
3,496,141,20,65,78,37,11,5628,1575,225,...,200,11,3,500.0,False,True,True,False,False,True
4,321,87,10,39,42,30,2,396,101,12,...,805,40,4,91.5,False,True,True,False,False,True
5,594,169,4,74,51,35,11,4408,1133,19,...,282,421,25,750.0,True,False,False,True,True,False


In [3]:
class Models:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)

    def calibracion(self, predicciones, y, bins, plot = True):
        df_calibracion = pd.DataFrame({
            'predicciones': predicciones,
            'y': y
        })
        df_calibracion['bins'] = pd.qcut(df_calibracion['predicciones'], q = bins, labels=False) + 1
        grouped = df_calibracion.groupby('bins').mean()
        if plot:
            plt.plot(grouped.predicciones, grouped.y, marker='o', label = 'Modelo', color = 'blue')
            plt.plot(grouped.predicciones, grouped.predicciones, marker='o', label = 'Real', color = 'red')
            plt.xlabel('Predicción')
            plt.ylabel('Real')
            plt.legend()
            plt.grid()    

    def linear_regression(self, plot = True):
        model = LinearRegression().fit(self.X_train, self.y_train)
        predicciones = model.predict(self.X_test)
        r2 = r2_score(self.y_test, predicciones)
        # Llama a la función de calibración para la gráfica
        self.calibracion(predicciones, self.y_test, 10, plot)
        return r2
    
    def linear_ridge(self, plot = True):
        model = Ridge().fit(self.X_train, self.y_train)
        predicciones = model.predict(self.X_test)
        r2 = r2_score(self.y_test, predicciones)
        # Llama a la función de calibración para la gráfica
        self.calibracion(predicciones, self.y_test, 10, plot)
        return r2

    def linear_lasso(self, plot = True):
        model = Lasso().fit(self.X_train, self.y_train)
        predicciones = model.predict(self.X_test)
        r2 = r2_score(self.y_test, predicciones)
        # Llama a la función de calibración para la gráfica
        self.calibracion(predicciones, self.y_test, 10, plot)
        return r2
    
    def polinomial_regression(self, degree=2, plot = True):
        """
        Por defecto usa Lasso para regularización
        """
        X_scaled = StandardScaler().fit_transform(self.X)
        poly_features = PolynomialFeatures(degree=degree)
        X_poly = poly_features.fit_transform(X_scaled)
        X_train, X_test, y_train, y_test = train_test_split(X_poly, self.y, test_size=0.2, random_state=42)
        model = Lasso().fit(X_train, y_train)
        predicciones = model.predict(X_test)
        # Llama a la función de calibración para la gráfica
        self.calibracion(predicciones, self.y_test, 10, plot)
        r2 = r2_score(y_test, predicciones)
        return r2
    
    def knn(self, plot = True):
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(self.X_train)
        X_test_scaled = scaler.transform(self.X_test)

        r2s = [r2_score(self.y_test, KNeighborsRegressor(n_neighbors=i).fit(X_train_scaled, self.y_train).predict(X_test_scaled)) for i in range(1, 50)]

        r2s_df = pd.DataFrame({
            'nn': range(1, 50),
            'r2': r2s
        })

        opt_nn = r2s_df[r2s_df['r2'] == r2s_df['r2'].max()]['nn'].values[0]
        model = KNeighborsRegressor(n_neighbors=opt_nn).fit(X_train_scaled, self.y_train)
        predicciones = model.predict(X_test_scaled)
        # Llama a la función de calibración para la gráfica
        self.calibracion(predicciones, self.y_test, 10, plot)
        return r2s_df[r2s_df['r2'] == r2s_df['r2'].max()]
    
    def _run_models(self):
        modelos = [
            'Regresión Lineal',
            'Regresión Ridge',
            'Regresión Lasso',
            'Regresión Polinomial (grado 2)',
            'Regresión Polinomial (grado 3)',
            'KNN'
        ]
        r2s = [
            self.linear_regression(plot=False),
            self.linear_ridge(plot=False),
            self.linear_lasso(plot=False),
            self.polinomial_regression(2,plot=False),
            self.polinomial_regression(2,plot=False),
            self.knn(plot=False)['r2'].values[0]
        ]
        return pd.DataFrame({
            'Modelo': modelos,
            'R2': r2s
        })

In [4]:
X = data.copy()
X = data.drop(columns=[target])
y = data[target]

In [5]:
Models(X, y)._run_models()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Modelo,R2
0,Regresión Lineal,0.290745
1,Regresión Ridge,0.291855
2,Regresión Lasso,0.294198
3,Regresión Polinomial (grado 2),0.21531
4,Regresión Polinomial (grado 3),0.21531
5,KNN,0.305726
