# Análise ROI - Regressão Linear

## Importar o arquivo

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/home/gbrlmoraes/git_reps/FIAP_PBLs__2024_2/PBL_fase_5/dados/Gastos_Publicidade_MelhoresCompras.csv', decimal = ',')
df.rename(columns = {
    'Ano' : 'ano',
    'Mes' : 'mes',
    'Tipo de Midia' : 'tipo_midia',
    'Gastos com Publicidade (em R$)' : 'gasto',
    'Previsao Inicial de Aumento de Vendas (em mil unidades)' : 'roi'
}, inplace = True)
df.head()

Unnamed: 0,ano,mes,tipo_midia,gasto,roi
0,2022,1,Paginas web,87000.0,1000
1,2022,1,Redes sociais,120000.0,800
2,2022,1,TV,250000.0,1500
3,2022,1,Jornal,110000.0,730
4,2022,1,Revista,40000.0,360


## Separação em treino e teste

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
x = df[['tipo_midia', 'gasto']]
x.head()

Unnamed: 0,tipo_midia,gasto
0,Paginas web,87000.0
1,Redes sociais,120000.0
2,TV,250000.0
3,Jornal,110000.0
4,Revista,40000.0


In [5]:
y = df['roi']
y.head()

0    1000
1     800
2    1500
3     730
4     360
Name: roi, dtype: int64

In [6]:
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size = 0.2, shuffle = True)

## Análise dos dados de treino

In [7]:
x_treino.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160 entries, 42 to 10
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tipo_midia  160 non-null    object 
 1   gasto       160 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.8+ KB


## Aplicando One-Hot-Encoding

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
encoder = OneHotEncoder(sparse_output = False)

In [10]:
encoded = encoder.fit_transform(x_treino[['tipo_midia']])

In [11]:
encoded_df = pd.DataFrame(encoded, columns = encoder.get_feature_names_out(['tipo_midia']))
encoded_df.head()

Unnamed: 0,tipo_midia_Google,tipo_midia_Instagram,tipo_midia_Jornal,tipo_midia_Paginas web,tipo_midia_Radio,tipo_midia_Redes sociais,tipo_midia_Revista,tipo_midia_TV
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Ordenando linhas do dataset de treino para juntar com os dados one-hot
x_treino.reset_index(drop = True, inplace = True)

In [13]:
# Juntando os datasets
x_treino_encoded = pd.concat([x_treino, encoded_df], axis = 1)

# Removendo coluna de tipo de mídia
x_treino_encoded.drop(columns = ['tipo_midia'], inplace = True)
x_treino_encoded.head()

Unnamed: 0,gasto,tipo_midia_Google,tipo_midia_Instagram,tipo_midia_Jornal,tipo_midia_Paginas web,tipo_midia_Radio,tipo_midia_Redes sociais,tipo_midia_Revista,tipo_midia_TV
0,82000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,88200.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,150000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,78400.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,146900.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Treinando o modelo

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
fiapinho = LinearRegression(fit_intercept = False)

In [16]:
fiapinho.fit(x_treino_encoded, y_treino)

## Avaliando modelo

In [17]:
from sklearn.metrics import r2_score, root_mean_squared_error

In [18]:
y_pred = fiapinho.predict(x_treino_encoded)

### Dados de treino

In [19]:
r2_score_treino = r2_score(
    y_pred, y_treino
)
print(f'Valor do R2 no treino: {r2_score_treino:.3f}')

Valor do R2 no treino: 0.968


In [20]:
rmse_treino = root_mean_squared_error(
    y_pred, y_treino
)
print(f'Valor do erro médio no treino: R${rmse_treino:.2f}')

Valor do erro médio no treino: R$198.30


### Dados de teste

In [21]:
### Tratando dados de teste

# Aplicando one-hot
encoded = encoder.transform(x_teste[['tipo_midia']])
encoded_df = pd.DataFrame(encoded, columns = encoder.get_feature_names_out(['tipo_midia']))

# Ordenando linhas do dataset de teste para juntar com os dados one-hot
x_teste.reset_index(drop = True, inplace = True)

# Juntando os datasets
x_teste_encoded = pd.concat([x_teste, encoded_df], axis = 1)

# Removendo coluna de tipo de mídia
x_teste_encoded.drop(columns = ['tipo_midia'], inplace = True)
x_teste_encoded.head()

Unnamed: 0,gasto,tipo_midia_Google,tipo_midia_Instagram,tipo_midia_Jornal,tipo_midia_Paginas web,tipo_midia_Radio,tipo_midia_Redes sociais,tipo_midia_Revista,tipo_midia_TV
0,190400.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,201600.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,175616.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,120000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,123200.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [22]:
y_pred_teste = fiapinho.predict(x_teste_encoded)

In [23]:
r2_score_teste = r2_score(
    y_pred_teste, y_teste
)
print(f'Valor do R2 no teste: {r2_score_teste:.3f}')

Valor do R2 no teste: 0.938


In [24]:
rmse_teste = root_mean_squared_error(
    y_pred_teste, y_teste
)
print(f'Valor do erro médio no teste: R${rmse_teste:.2f}')

Valor do erro médio no teste: R$242.60
