# Regressão Linear Univariada

In [2]:
import pandas as pd # para leitura de base de dados e manipulação de data frame
import numpy as np # para álgebra linear
import altair as alt # para plotar gráficos

In [24]:
class LinearRegression():
    
    def __init__(self,alpha,iterations):
        self.alpha = alpha
        self.iterations = iterations
    
    def cost_function(self,X,y,theta):
        m = len(y) #número de amostras
        J = 0 # custo
        
        for i in range(m):
            J = J + np.power((theta[0] + theta[1] * X[1][i] - y[i]),2)
        
        return J*(1/(2*m))
    
    def gradient_descent(self,X,y,theta):
        #import pdb; pdb.set_trace() #Modo Debug (usar para fazer debug - usando n e as variáveis)
        m = len(y)
        J_history = np.zeros(self.iterations)
        temp0 = 0
        temp1 = 0
        
        for k in range(self.iterations):
        
            for i in range(m):
                temp0 = temp0 - self.alpha * (1/m * (theta[0] + theta[1] * X[1][i] - y[i]))
                temp1 = temp1 - self.alpha * (1/m * (theta[0] + theta[1] * X[1][i] - y[i]))*X[1][i]
                theta[0] = temp0
                theta[1] = temp1
                
            J_history[k] = self.cost_function(X,y,theta)
    
        return theta,J_history

In [4]:
# se estiver usando google colab
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# data = pd.read_csv('/content/drive/My Drive/datasets/profit.txt',header=None,names=['size','profit'])
# data.head()

In [7]:
data = pd.read_csv('profit.txt',header=None,names=['size','profit']) # faz a leitura dos dados
data.head() #mostra as 5 primeiras linhas

Unnamed: 0,size,profit
0,6.1101,17.592
1,5.5277,9.1302
2,8.5186,13.662
3,7.0032,11.854
4,5.8598,6.8233


In [8]:
data_plot = alt.Chart(data).mark_circle(size=60).encode(
    x='size:Q',
    y='profit:Q'
)
data_plot

In [9]:
#Preparando os dados
x = data.iloc[0:,0:1] #dados de treinamento
ones = np.ones(len(data)) 
x.insert(0,len(x.columns) + 1,ones)
y = data.iloc[:,1]
theta = np.zeros(len(x.columns))
x.columns = range(x.shape[1])

In [10]:
x

Unnamed: 0,0,1
0,1.0,6.1101
1,1.0,5.5277
2,1.0,8.5186
3,1.0,7.0032
4,1.0,5.8598
...,...,...
92,1.0,5.8707
93,1.0,5.3054
94,1.0,8.2934
95,1.0,13.3940


In [12]:
iterations = 1500
alpha = 0.01
lr = LinearRegression(alpha,iterations)
lr.gradient_descent(x,y,theta)

> <ipython-input-11-a7f9c7965354>(18)gradient_descent()
-> m = len(y)


(Pdb)  
(Pdb)  n


> <ipython-input-11-a7f9c7965354>(19)gradient_descent()
-> J_history = np.zeros(self.iterations)


(Pdb)  m


97


(Pdb)  n


> <ipython-input-11-a7f9c7965354>(20)gradient_descent()
-> temp0 = 0


(Pdb)  J_history


array([0., 0., 0., ..., 0., 0., 0.])


(Pdb)  


array([0., 0., 0., ..., 0., 0., 0.])


(Pdb)  1500


1500


(Pdb)  n


> <ipython-input-11-a7f9c7965354>(21)gradient_descent()
-> temp1 = 0


(Pdb)  n


> <ipython-input-11-a7f9c7965354>(23)gradient_descent()
-> for k in range(self.iterations):


(Pdb)  n


> <ipython-input-11-a7f9c7965354>(25)gradient_descent()
-> for i in range(m):


(Pdb)  n


> <ipython-input-11-a7f9c7965354>(26)gradient_descent()
-> temp0 = temp0 - self.alpha * (1/m * (theta[0] + theta[1] * X[1][i] - y[i]))


(Pdb)  self.alpha


0.01


(Pdb)  x[1][i]


6.1101


(Pdb)  y[i]


17.592


(Pdb)  n


> <ipython-input-11-a7f9c7965354>(27)gradient_descent()
-> temp1 = temp1 - self.alpha * (1/m * (theta[0] + theta[1] * X[1][i] - y[i]))*X[1][i]


(Pdb)  temp0


0.0018136082474226804


(Pdb)  temp1


0


(Pdb)  theta[0]=temp0
(Pdb)  n


> <ipython-input-11-a7f9c7965354>(28)gradient_descent()
-> theta[0] = temp0


(Pdb)  temp1


0.011080185347654372


(Pdb)  q


BdbQuit: 

In [14]:
#treinando o modelo
iterations = 1500
alpha = 0.01
lr = LinearRegression(alpha,iterations)
custo = lr.cost_function(x,y,theta)
print('Custo inicial: ',custo, '\n')
theta_lr, custo_h = lr.gradient_descent(x,y,theta)
print('Valores de theta_0 e theta_1: ',theta_lr)
print('Custo final: ',custo_h[-1])

Custo inicial:  32.0621456185558 

Valores de theta_0 e theta_1:  [-3.58838901  1.12366721]
Custo final:  4.546073237210185


In [15]:
#predizendo os dados de treinamento
#para ver qual a reta que a Regressão gerou
y_hat = np.matmul(x.values,theta_lr) #retorna o profit predito para cada tamanho de população de x
fit_plot = pd.DataFrame({'X':x[1],'Y':y_hat}) #dados para plotar nossa reta
plot_reta = alt.Chart(fit_plot).mark_line(color='red').encode(
    x=alt.X('X',axis=alt.Axis(title='population')),
    y=alt.Y('Y',axis=alt.Axis(title='profit'))
)
data_plot + plot_reta

In [16]:
#predizendo novas amostras
pred_1 = x[0][0] * theta_lr[0] + x[1][0] * theta_lr[1] #como funciona a predição
print(pred_1,'\n')

pred_2 = 1 * theta_lr[0] + 14 * theta_lr[1]
print('predição: ',pred_2)

3.277330003547353 

predição:  12.142951921000838


In [17]:
fit_plot

Unnamed: 0,X,Y
0,6.1101,3.277330
1,5.5277,2.622906
2,8.5186,5.983682
3,7.0032,4.280877
4,5.8598,2.996076
...,...,...
92,5.8707,3.008324
93,5.3054,2.373115
94,8.2934,5.730633
95,13.3940,11.462010


In [18]:
it = list(range(1,1501))
df_custo = pd.DataFrame({'Iteracoes':it,'Custo':custo_h})

In [19]:
alt.Chart(df_custo).mark_line(color='red').encode(
    x=alt.X('Iteracoes',axis=alt.Axis(title='Iterações')),
    y=alt.Y('Custo',axis=alt.Axis(title='Custo'))
)

# Dataset para Regressão Linear Multivariada

In [None]:
# se estiver usando google colab
# df_beer = pd.read_csv('/content/drive/My Drive/datasets/Consumo_cerveja.csv',nrows=365,delimiter=';')
# df_beer.head()

In [20]:
df_beer = pd.read_csv('consumo_cerveja.csv',nrows=365,delimiter=';')
df_beer.head()

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,01/01/2015,27.3,23.9,32.5,0.0,0,25.461
1,02/01/2015,27.02,24.5,33.5,0.0,0,28.972
2,03/01/2015,24.82,22.4,29.9,0.0,1,30.814
3,04/01/2015,23.98,21.5,28.6,1.2,1,29.799
4,05/01/2015,23.82,21.0,28.3,0.0,0,28.9


In [21]:
from sklearn.model_selection import train_test_split #para criar conjunto de treino e teste
#alterando nome das colunas para facilitar manipulação
df_beer = df_beer.rename(columns={'Temperatura Media (C)':'meanTemp','Temperatura Minima (C)':'minTemp','Temperatura Maxima (C)':'maxTemp',
                       'Precipitacao (mm)':'precip','Final de Semana':'weekend','Consumo de cerveja (litros)':'consumo'})

#selecionando as colunas serem usadas como features para treinar o modelo
#selecionando a coluna meta, ou seja, aquela que eu quero predizer
feature_col = ['maxTemp','precip','weekend']
meta_col = ['consumo']

X = df_beer[feature_col].values
y = df_beer[meta_col].values.ravel()

#adicionando nova coluna para multiplicação
ones = np.ones([X.shape[0],1])
X = np.concatenate((ones, X), axis=1)
m, n = np.shape(X) #quantidade de amostras x quantidade de características

split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42)

In [23]:
X

array([[ 1. , 32.5,  0. ,  0. ],
       [ 1. , 33.5,  0. ,  0. ],
       [ 1. , 29.9,  0. ,  1. ],
       ...,
       [ 1. , 24.1, 10.3,  0. ],
       [ 1. , 22.4,  6.3,  0. ],
       [ 1. , 29. ,  0. ,  0. ]])

# Regressão Linear Multivariada

In [25]:
class LinearRegressionMultivariate():
    
    def __init__(self,alpha,iterations):
        self.alpha = alpha
        self.iterations = iterations
        
    
    def cost_function(self,X,y,theta):
        m = len(y) #número de amostras
        J = 0 # custo
        
        J = np.sum(np.square(np.dot(X, theta) - y))
        return J*(1/(2*m))
    
    def gradient_descent(self, X, y, theta):
        m = len(y)
        J_history = np.zeros(self.iterations)
        
        for k in range(self.iterations):   
            hypothesis = np.dot(X, theta)
            loss = hypothesis - y
            gradient = np.dot(X.transpose(), loss) / m
            theta = theta - self.alpha * gradient
            J_history[k] = self.cost_function(X, y, theta)
            
        return theta, J_history

In [26]:
#treinando
alpha = 0.001
iterations = 100000
lr_geral = LinearRegressionMultivariate(alpha,iterations)
theta = np.zeros(n)
theta_lr, custo_h = lr_geral.gradient_descent(X_train, y_train, theta)
theta_lr #valores finais de theta

array([ 5.85891726,  0.68667881, -0.05194517,  5.38578351])

In [27]:
X_train.shape

(255, 4)

In [28]:
theta.shape

(4,)

In [36]:
#testando com novas amostras
#2o = Temperatura (varie de 0 a 35, para ver a mudança de consumo)
#3o = Chuva
#4o = Final de semana (1 ou 0)

pred_value = [1,22,5.6,1]
np.dot(pred_value,theta_lr)

26.060741727896563

# Regressão Linear usando Scikit-Learn

In [37]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [38]:
#separando os dados em conjunto de treino e teste
#treinando o modelo
mlr_skl = linear_model.LinearRegression()
mlr_skl.fit(X_train, y_train.ravel()) # flatten arrays

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [39]:
y_pred = mlr_skl.predict(X_test) # predizendo
print(mean_squared_error(y_test,y_pred)) # MSE
mlr_skl.coef_  #valores finais de theta

5.480853459612662


array([ 0.        ,  0.66752499, -0.05305718,  5.33497184])

In [41]:
#testando com novas amostras
pred_value = [[1,22,5.6,1]]
mlr_skl.predict(pred_value)

array([26.11957568])

# Equação Normal

In [42]:
X = df_beer[feature_col].values
y = df_beer[meta_col].values
X = np.matrix(X)
aux1 = np.matmul(X.transpose(),X)
inv = np.linalg.inv(aux1)
aux2 = np.matmul(X.transpose(),y)
theta = np.matmul(inv,aux2)
theta

matrix([[ 0.89911459],
        [-0.04817406],
        [ 5.49500367]])

# Dump de Modelo

In [43]:
from sklearn.externals import joblib

joblib.dump(mlr_skl, r'C:\Users\dheny\Documents\FIAP\Python\codigos\lr_model.pkl') 
lr_model_loaded = joblib.load(r'C:\Users\dheny\Documents\FIAP\Python\codigos\lr_model.pkl') 



FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\dheny\\Documents\\FIAP\\Python\\codigos\\lr_model.pkl'

In [None]:
lr_model_loaded.predict([[1,35,0,1]])