# Regressão Linear Univariada

In [2]:
import pandas as pd # para leitura de base de dados e manipulação de data frame
import numpy as np # para álgebra linear
import altair as alt # para plotar gráficos

In [3]:
class LinearRegression():
    """Univariate linear regression fitted with batch gradient descent.

    The hypothesis is ``h(x) = theta[0] + theta[1] * x``. The feature values
    are read as ``X[1]`` (for example a DataFrame whose column labelled ``1``
    holds the feature), and ``y`` holds one target per sample.
    """

    def __init__(self,alpha,iterations):
        """Store the learning rate ``alpha`` and the number of iterations."""
        self.alpha = alpha
        self.iterations = iterations

    @staticmethod
    def _as_vectors(X, y):
        """Return the feature values ``X[1]`` and the targets ``y`` as flat float arrays."""
        feature = np.asarray(X[1], dtype=float).ravel()
        target = np.asarray(y, dtype=float).ravel()
        return feature, target

    def cost_function(self,X,y,theta):
        """Return the halved mean squared error of the hypothesis for ``theta``.

        Args:
            X: container whose item ``X[1]`` yields the feature values.
            y: target values, one per sample.
            theta: two parameters ``(theta_0, theta_1)``.

        Returns:
            ``sum((theta_0 + theta_1 * x - y) ** 2) / (2 * m)`` with ``m = len(y)``.
        """
        m = len(y)  # number of samples
        feature, target = self._as_vectors(X, y)
        errors = theta[0] + theta[1] * feature - target
        return np.sum(np.square(errors)) * (1 / (2 * m))

    def gradient_descent(self,X,y,theta):
        """Run ``self.iterations`` steps of batch gradient descent.

        Every step computes the gradient over all samples using the current
        parameters and then updates both parameters simultaneously.

        Args:
            X: container whose item ``X[1]`` yields the feature values.
            y: target values, one per sample.
            theta: initial parameters ``(theta_0, theta_1)``; not modified.

        Returns:
            A tuple ``(theta, J_history)``: the fitted parameters as a new
            array and the cost measured after each iteration.
        """
        m = len(y)
        J_history = np.zeros(self.iterations)
        feature, target = self._as_vectors(X, y)
        # Work on a copy so the caller's initial theta is both honoured as the
        # starting point and left untouched.
        theta = np.array(theta, dtype=float).ravel()

        for k in range(self.iterations):
            errors = theta[0] + theta[1] * feature - target
            gradient = np.array([np.sum(errors), np.sum(errors * feature)]) / m
            # Simultaneous update: both components use the same `errors`.
            theta = theta - self.alpha * gradient
            J_history[k] = self.cost_function(X, y, theta)

        return theta,J_history

In [4]:
# se estiver usando google colab
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# data = pd.read_csv('/content/drive/My Drive/datasets/profit.txt',header=None,names=['size','profit'])
# data.head()

In [7]:
# Read the dataset; the file has no header row, so column names are supplied.
data = pd.read_csv(
    'profit.txt',
    header=None,
    names=['size', 'profit'],
)
data.head()  # show the first 5 rows

Unnamed: 0,size,profit
0,6.1101,17.592
1,5.5277,9.1302
2,8.5186,13.662
3,7.0032,11.854
4,5.8598,6.8233


In [8]:
# Scatter plot of the raw samples: size on the x axis, profit on the y axis.
data_plot = (
    alt.Chart(data)
    .mark_circle(size=60)
    .encode(alt.X('size:Q'), alt.Y('profit:Q'))
)
data_plot

In [9]:
# Preparing the data
y = data.iloc[:, 1]  # target: second column of `data`
ones = np.ones(len(data))  # bias column, one entry per sample
x = data.iloc[:, :1]  # training data: first column of `data`
# Put the bias column first under a temporary label; labels are reset below.
x.insert(0, x.shape[1] + 1, ones)
x.columns = range(x.shape[1])  # relabel the columns as 0, 1
theta = np.zeros(x.shape[1])  # one initial parameter per column of x

In [10]:
# Display the prepared matrix (bias column 0, feature column 1)
x

Unnamed: 0,0,1
0,1.0,6.1101
1,1.0,5.5277
2,1.0,8.5186
3,1.0,7.0032
4,1.0,5.8598
...,...,...
92,1.0,5.8707
93,1.0,5.3054
94,1.0,8.2934
95,1.0,13.3940


In [None]:
# Training the model
alpha = 0.01
iterations = 1500
lr = LinearRegression(alpha=alpha, iterations=iterations)

# Cost evaluated at the initial theta, before any training
custo = lr.cost_function(x, y, theta)
print('Custo inicial: ',custo, '\n')

# Fit the parameters and report the result
theta_lr, custo_h = lr.gradient_descent(x, y, theta)
print('Valores de theta_0 e theta_1: ',theta_lr)
print('Custo final: ',custo_h[-1])

In [None]:
# Predict on the training data
# to see which line the regression produced
y_hat = x.values @ theta_lr  # predicted profit for each row of x
fit_plot = pd.DataFrame({'X': x[1], 'Y': y_hat})  # data used to draw the fitted line
plot_reta = (
    alt.Chart(fit_plot)
    .mark_line(color='red')
    .encode(
        alt.X('X', axis=alt.Axis(title='population')),
        alt.Y('Y', axis=alt.Axis(title='profit')),
    )
)
data_plot + plot_reta

In [None]:
# Predicting new samples
# How a prediction works: each entry of the first row of x times its theta.
pred_1 = x.loc[0, 0] * theta_lr[0] + x.loc[0, 1] * theta_lr[1]
print(pred_1,'\n')

# Same formula for a new sample with bias 1 and feature value 14
pred_2 = theta_lr[0] + 14 * theta_lr[1]
print('predição: ',pred_2)

In [None]:
# Display the frame holding the feature values ('X') and predictions ('Y')
fit_plot

In [None]:
# Iteration numbers 1..N, derived from the cost history itself so the two
# columns always have the same length whatever iteration count was used.
it = list(range(1, len(custo_h) + 1))
df_custo = pd.DataFrame({'Iteracoes':it,'Custo':custo_h})

In [None]:
# Line chart of the cost recorded at each iteration
alt.Chart(df_custo).mark_line(color='red').encode(
    alt.X('Iteracoes', axis=alt.Axis(title='Iterações')),
    alt.Y('Custo', axis=alt.Axis(title='Custo')),
)

# Dataset para Regressão Linear Multivariada

In [None]:
# se estiver usando google colab
# df_beer = pd.read_csv('/content/drive/My Drive/datasets/Consumo_cerveja.csv',nrows=365,delimiter=';')
# df_beer.head()

In [None]:
# Read at most 365 rows of the semicolon-delimited beer consumption file.
df_beer = pd.read_csv(
    'bases/consumo_cerveja.csv',
    delimiter=';',
    nrows=365,
)
df_beer.head()

In [None]:
from sklearn.model_selection import train_test_split  # to build the train and test sets

# Rename the columns to short identifiers that are easier to manipulate.
df_beer = df_beer.rename(columns={
    'Temperatura Media (C)': 'meanTemp',
    'Temperatura Minima (C)': 'minTemp',
    'Temperatura Maxima (C)': 'maxTemp',
    'Precipitacao (mm)': 'precip',
    'Final de Semana': 'weekend',
    'Consumo de cerveja (litros)': 'consumo',
})

# Columns used as features to train the model, and the target column,
# i.e. the one to be predicted.
feature_col = ['maxTemp', 'precip', 'weekend']
meta_col = ['consumo']

y = df_beer[meta_col].values.ravel()
X = df_beer[feature_col].values

# Prepend a column of ones so the first parameter acts as the bias term.
ones = np.ones((X.shape[0], 1))
X = np.hstack((ones, X))
m, n = X.shape  # number of samples x number of columns (bias included)

split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split_test_size, random_state=42
)

# Regressão Linear Multivariada

In [None]:
class LinearRegressionMultivariate():
    """Linear regression over several features, trained with batch gradient descent."""

    def __init__(self,alpha,iterations):
        """Keep the learning rate ``alpha`` and the number of ``iterations``."""
        self.iterations = iterations
        self.alpha = alpha

    def cost_function(self,X,y,theta):
        """Return the sum of squared residuals of ``np.dot(X, theta)`` against ``y``, scaled by ``1 / (2 * len(y))``."""
        residuals = np.dot(X, theta) - y
        squared_error = np.sum(np.square(residuals))
        return squared_error*(1/(2*len(y)))

    def gradient_descent(self, X, y, theta):
        """Apply ``self.iterations`` gradient steps starting from ``theta``.

        Returns the final parameters together with the cost measured after
        each step.
        """
        n_samples = len(y)
        J_history = np.zeros(self.iterations)

        for step in range(self.iterations):
            residuals = np.dot(X, theta) - y
            slope = np.dot(X.transpose(), residuals) / n_samples
            theta = theta - self.alpha * slope
            J_history[step] = self.cost_function(X, y, theta)

        return theta, J_history

In [None]:
# Training
iterations = 100000
alpha = 0.001
theta = np.zeros(n)  # start from an all-zero parameter vector of length n
lr_geral = LinearRegressionMultivariate(alpha=alpha, iterations=iterations)
theta_lr, custo_h = lr_geral.gradient_descent(X_train, y_train, theta)
theta_lr  # final values of theta

In [None]:
# Show the dimensions of the training matrix
X_train.shape

In [None]:
# Show the dimensions of the parameter vector `theta`
theta.shape

In [None]:
# Testing with a new sample: dot product of the sample with the fitted theta.
# NOTE(review): the values presumably follow the training column order
# (bias, maxTemp, precip, weekend) — confirm.
pred_value = [1,35,0,1]
np.dot(pred_value,theta_lr)

# Regressão Linear usando Scikit-Learn

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [None]:
# No split happens in this cell: X_train / y_train must already exist.
# Train a scikit-learn LinearRegression model on the training data.
mlr_skl = linear_model.LinearRegression()
mlr_skl.fit(X_train, y_train.ravel()) # flatten arrays

In [None]:
y_pred = mlr_skl.predict(X_test) # predict on the test set
print(mean_squared_error(y_test,y_pred)) # MSE
# NOTE(review): presumably coef_ excludes the intercept, which scikit-learn
# keeps in intercept_ — confirm before comparing with theta_lr.
mlr_skl.coef_  # fitted coefficients

In [None]:
# Testing with a new sample: predict() takes a 2-D input, hence the nested list.
pred_value = [[1,35,0,1]]
mlr_skl.predict(pred_value)

# Equação Normal

In [None]:
# Normal equation: closed-form least-squares solution
# theta = pinv(X^T X) X^T y, computed on the whole dataset.
X = df_beer[feature_col].values
y = df_beer[meta_col].values
# Prepend the bias column of ones so theta includes an intercept and lines up
# with samples laid out as [1, maxTemp, precip, weekend].
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
aux1 = np.matmul(X.transpose(), X)
# The pseudo-inverse still gives a solution when X^T X is singular or
# ill-conditioned, where a plain inverse would fail or be unstable.
inv = np.linalg.pinv(aux1)
aux2 = np.matmul(X.transpose(), y)
theta = np.matmul(inv, aux2)
theta

# Dump de Modelo

In [None]:
# `sklearn.externals.joblib` was removed from recent scikit-learn releases;
# import the standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_path = r'C:\Users\dheny\Documents\FIAP\Python\codigos\lr_model.pkl'

# Persist the trained scikit-learn model, then load it back from disk.
joblib.dump(mlr_skl, model_path)
lr_model_loaded = joblib.load(model_path)

In [None]:
# Check that the model reloaded from disk can predict for a new sample
lr_model_loaded.predict([[1,35,0,1]])