<a href="https://colab.research.google.com/github/MaxVieiraSantiago/Machine-Learning-Algorithms/blob/master/Linear%20Regression/Linear_Regression_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# statsmodels
# More interesting from a statistical point of view: how well the model explains the data.
# Use it to analyse the model's estimates on the data statistically:
# summary statistics, p-values of the coefficients, etc.

# scikit-learn
# Geared towards predictive (machine-learning) analysis: it cares less about the quality of the
# fit on the training set (how well the model fits the data it was trained on)
# and more about the quality of its predictions on data it has never seen.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Despite the name, `url` is a path relative to the notebook's working directory.
url = './sample_data/california_housing_test.csv'
dados = pd.read_csv( filepath_or_buffer = url )

In [3]:
# Preview the first rows to check which columns were loaded.
dados.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


## **Statsmodels**

In [4]:
import statsmodels.api as sm 

# OLS ( Ordinary Least Squares )

In [10]:
# Name of the target column.
y = 'median_house_value'

# Display the predictors only. `drop` returns a new DataFrame and leaves `dados` untouched;
# to remove the column for good, reassign: dados = dados.drop( columns = [ y ] )
dados.drop( columns = [ y ] )

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375
...,...,...,...,...,...,...,...,...
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790
2996,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906
2997,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895
2998,-117.12,34.10,40.0,96.0,14.0,46.0,14.0,3.2708


In [11]:
# modelo = Ordinary Least Squares( Variável que queremos prever (y), todas as demais variáveis preditoras (X) )
mod = sm.OLS( dados[ y ],  dados.drop( [ y ], axis = 1 ) )
res = mod.fit()

In [14]:
# Plain-text report of the fit: R-squared, F-statistic, AIC/BIC and the coefficient table.
print( res.summary() )

                                 OLS Regression Results                                
Dep. Variable:     median_house_value   R-squared (uncentered):                   0.898
Model:                            OLS   Adj. R-squared (uncentered):              0.897
Method:                 Least Squares   F-statistic:                              3281.
Date:                Wed, 15 Mar 2023   Prob (F-statistic):                        0.00
Time:                        10:38:12   Log-Likelihood:                         -37938.
No. Observations:                3000   AIC:                                  7.589e+04
Df Residuals:                    2992   BIC:                                  7.594e+04
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

## **Dados Normalizados**

In [28]:
# StandardScaler subtracts each column's mean and divides by each column's standard deviation.
from sklearn.preprocessing import StandardScaler

X = dados.drop( [ y ], axis = 1 )
scaler = StandardScaler()

# fit_transform returns a NumPy array by default, so the DataFrame is rebuilt with X's column
# names AND X's index. Keeping the index guarantees the rows stay aligned with dados[ y ] even
# when `dados` does not carry the default 0..n-1 index (statsmodels rejects endog/exog whose
# indices differ).
X_normalizados = scaler.fit_transform( X )
X_normalizados = pd.DataFrame( X_normalizados, columns = X.columns, index = X.index )

# Constant column of ones so that OLS estimates an intercept (statsmodels does not add one itself).
X_normalizados[ 'intercept' ] = 1

mod = sm.OLS( dados[ y ], X_normalizados )
res = mod.fit()

print( res.summary() )

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.620
Model:                            OLS   Adj. R-squared:                  0.619
Method:                 Least Squares   F-statistic:                     610.9
Date:                Wed, 15 Mar 2023   Prob (F-statistic):               0.00
Time:                        11:06:59   Log-Likelihood:                -37712.
No. Observations:                3000   AIC:                         7.544e+04
Df Residuals:                    2991   BIC:                         7.550e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
longitude          -8.578e+04   3728

## **Scikit-learn**

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Target column name and the predictor matrix (every other column).
y = 'median_house_value'
X = dados.drop( columns = [ y ] )

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_teste, y_train, y_teste = train_test_split(
    X,
    dados[ y ],
    test_size = 0.3,
    random_state = 0,
)

# Sanity check on the sizes of the four pieces.
print( X_train.shape, X_teste.shape, y_train.shape, y_teste.shape )

(2100, 8) (900, 8) (2100,) (900,)


In [25]:
# LinearRegression also accepts the n_jobs hyper-parameter (to parallelise when the machine
# has several cores). `fit` returns the estimator itself, so it can be chained.
modelo = LinearRegression().fit( X = X_train, y = y_train )

# Predictions for the rows held out from training.
predicao = modelo.predict( X = X_teste )

In [26]:
from sklearn.metrics import mean_squared_error

# Root mean squared error on the held-out rows (square root of the MSE).
erro = np.sqrt( mean_squared_error( y_true = y_teste, y_pred = predicao ) )
print( erro )

70139.32959119469


In [27]:
# Fitted coefficients, one per predictor column of X_train.
print( modelo.coef_ )

[-4.32312450e+04 -4.26883367e+04  1.08647924e+03 -9.03406929e+00
  1.07011846e+02 -3.63104120e+01  5.10231421e+01  3.84568729e+04]


In [None]:
# (cell intentionally left empty)

In [None]:
# Intercept fitted by LinearRegression (fit_intercept defaults to True); together with
# modelo.coef_ it fully describes the fitted linear model.
print( modelo.intercept_ )