In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
import pickle

# para calcular las métricas

from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn import tree

from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
plt.rcParams["figure.figsize"] = (10,8)

In [2]:
df_tt = pd.read_csv("../data/preproc.csv", index_col = 0)
df_tt.head(2)

Unnamed: 0_level_0,carat,cut,color,clarity,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-1.128026,1,0,6,6.353
1,0.669489,0,1,2,9.183


Separación CSV de train en X y y

In [3]:
X = df_tt.drop('price', axis =1)
y = df_tt['price']

Usando la librería stats

In [4]:
results = smf.ols("price ~  carat +  cut + color +  clarity", data=df_tt).fit() 
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.77
Model:,OLS,Adj. R-squared:,0.77
Method:,Least Squares,F-statistic:,33700.0
Date:,"Wed, 25 May 2022",Prob (F-statistic):,0.0
Time:,19:42:32,Log-Likelihood:,-28323.0
No. Observations:,40370,AIC:,56660.0
Df Residuals:,40365,BIC:,56700.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,8.0338,0.008,995.469,0.000,8.018,8.050
carat,0.9219,0.003,349.226,0.000,0.917,0.927
cut,-0.0203,0.002,-9.150,0.000,-0.025,-0.016
color,-0.0278,0.001,-18.913,0.000,-0.031,-0.025
clarity,-0.0393,0.002,-24.919,0.000,-0.042,-0.036

0,1,2,3
Omnibus:,20547.727,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,160822.221
Skew:,2.338,Prob(JB):,0.0
Kurtosis:,11.587,Cond. No.,17.8


Usando la librería sklearn

In [5]:
lr = LinearRegression()

In [6]:
 lr.fit(X, y)

LinearRegression()

In [7]:
lr.intercept_

8.033795508182227

In [8]:
lr.coef_

array([ 0.92194702, -0.02033963, -0.0278076 , -0.03933323])

# Predicción

Nos traemos el CSV de test limpio

In [9]:
X_test = pd.read_csv("../data/test_limpio_.csv", index_col = 0)
X_test.head(2)

Unnamed: 0,carat,cut,color,clarity
0,-1.023184,0,5,5
1,0.94188,1,5,5


In [10]:
y_pred_test = lr.predict(X_test)
y_pred_train = lr.predict(X)

In [11]:
y_pred_test

array([6.75477007, 8.54611546, 9.51222707, ..., 6.83814165, 8.58614732,
       7.78277244])

# Validación del Modelo

In [12]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_pred_train))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_pred_train))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_train)))
print("R2:",  metrics.r2_score(y, y_pred_train))

Mean Absolute Error: 0.30343894839610236
Mean Squared Error: 0.23818518883554757
Root Mean Squared Error: 0.48804219985114766
R2: 0.7695808067699201


In [13]:
def metricas1(y, y_pred_test, y_pred_train, tipo_modelo):
    
    
    resultados = {'Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_train))}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [14]:
lr_results = metricas1(y, y_pred_test, y_pred_train,"Regresion lineal")

In [15]:
lr_results

Unnamed: 0,0,modelo
0,0.488042,Regresion lineal
1,Root Mean Squared Error:,Regresion lineal


# Cross Validation

In [16]:
# Validación cruzada repetida con múltiples métricas
# ==============================================================================

cv_scores = cross_validate(
                estimator = LinearRegression(),
                X         = X,
                y         = y,
                scoring   = ('r2', 'neg_root_mean_squared_error'),
                cv        = 10)

# Se convierte el diccionario a dataframe para facilitar la visualización
cv_scores = pd.DataFrame(cv_scores)
cv_scores

Unnamed: 0,fit_time,score_time,test_r2,test_neg_root_mean_squared_error
0,0.014821,0.003599,0.769099,-0.486801
1,0.006524,0.002796,0.759542,-0.50449
2,0.006714,0.001926,0.762778,-0.492955
3,0.006315,0.001835,0.7769,-0.479458
4,0.005579,0.002152,0.772532,-0.483127
5,0.005489,0.00201,0.776939,-0.479231
6,0.00573,0.001975,0.765328,-0.493316
7,0.005967,0.001841,0.784191,-0.472654
8,0.006135,0.001864,0.768787,-0.494153
9,0.006018,0.001833,0.758495,-0.49418
