# Import Libraries

In [1]:
import pandas as pd
import matplotlib as pt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
%matplotlib inline
pt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Import Dataset

In [2]:
# Load the training dataset from a CSV file in the current working directory.
prestamos30dias = pd.read_csv("BacPrestamos30Dias.csv") 

In [3]:
# Preview the first rows to check which columns were loaded.
prestamos30dias.head()

Unnamed: 0,RowId,Año,Mes,Actividad,1-30
0,1,2007,1,Inmuebles,1208.903812
1,2,2007,1,Consumo,5382.492834
2,3,2007,2,Inmuebles,1086.259146
3,4,2007,2,Consumo,7135.741881
4,5,2007,3,Inmuebles,1325.900358


# Data Cleansing

In [4]:
# Encode the loan category as a number: 1 = Inmuebles, 2 = Consumo.
# The result is assigned back to the column on purpose: calling
# replace(inplace=True) on a selected column is chained assignment, which
# pandas warns about and which leaves the frame untouched once Copy-on-Write
# is active.
prestamos30dias['Actividad'] = (
    prestamos30dias['Actividad']
    .replace({'Inmuebles': 1, 'Consumo': 2})
    .astype(int)  # fails loudly if an unexpected category was left unencoded
)
# Drop the RowId column so it is not used as a model input.
# errors='ignore' makes re-running this cell a no-op instead of a KeyError.
prestamos30dias = prestamos30dias.drop(columns=['RowId'], errors='ignore')

In [5]:
# List the column names left after the cleanup step.
# Note: this only displays the current order; nothing is reordered here.
prestamos30dias.columns.tolist()

['Año', 'Mes', 'Actividad', '1-30']

In [6]:
# Inspect the last rows to see where the training data ends.
prestamos30dias.tail()

Unnamed: 0,Año,Mes,Actividad,1-30
311,2019,12,2,45833.50156
312,2020,1,1,16535.872622
313,2020,1,2,39990.904749
314,2020,2,1,15072.15478
315,2020,2,2,42338.167099


# Definir Conjuntos de Datos 

In [7]:
# Feature matrix, selected by column name rather than by position so that a
# change in the CSV column layout raises a KeyError instead of silently
# training on the wrong columns.
x = prestamos30dias[['Año', 'Mes', 'Actividad']]
# Target vector: the '1-30' column.
y = prestamos30dias['1-30']

In [8]:
# Check that the feature matrix holds only Año, Mes and Actividad.
x.head()

Unnamed: 0,Año,Mes,Actividad
0,2007,1,1
1,2007,1,2
2,2007,2,1
3,2007,2,2
4,2007,3,1


In [9]:
# Check that the target is the '1-30' column.
y.head()

0    1208.903812
1    5382.492834
2    1086.259146
3    7135.741881
4    1325.900358
Name: 1-30, dtype: float64

In [10]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)
# Display the shape of each of the four pieces as a sanity check.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((221, 3), (95, 3), (221,), (95,))

# Modelos de Regresión

Usando Linear Regression

In [11]:
# Fit a linear regression on the training split and predict the held-out split.
lreg = LinearRegression()
lreg.fit(x_train, y_train)
lpred = lreg.predict(x_test)

# Compute every metric once and reuse the values in the report below.
lreg_mse = mean_squared_error(y_test, lpred)
lreg_r2 = r2_score(y_test, lpred)
lreg_mae = mean_absolute_error(y_test, lpred)

# One coefficient per feature column, in the column order of x.
print('Coefficients: \n', lreg.coef_)
print(f'Mean squared error: {lreg_mse:.2f}')
# Coefficient of determination: 1 is a perfect prediction.
print(f'Coefficient of determination: {lreg_r2:.2f}')
print('-------------------------------------------------------------------------')
print('Mean Absolute Error:', lreg_mae)
print('Mean Squared Error:', lreg_mse)
print('Root Mean Squared Error:', np.sqrt(lreg_mse))

Coefficients: 
 [ 2164.40105103    69.58538837 15541.47417933]
Mean squared error: 45259673.02
Coefficient of determination: 0.62
-------------------------------------------------------------------------
Mean Absolute Error: 5108.344445984429
Mean Squared Error: 45259673.02021536
Root Mean Squared Error: 6727.5309750468905


Usando Random Forest Regressor

In [12]:
# Fit a random forest on the same training split.
# random_state is pinned because the forest is built with randomness; without
# a seed the metrics below and the final predictions change on every run.
RFreg = RandomForestRegressor(random_state=0)
RFreg.fit(x_train, y_train)
RFpred = RFreg.predict(x_test)

# Compute the MSE once; it is reported directly and as its square root (RMSE).
rf_mse = mean_squared_error(y_test, RFpred)
print('-------------------------------------------------------------------------')
print('Mean Absolute Error:', mean_absolute_error(y_test, RFpred))
print('Mean Squared Error:', rf_mse)
print('Root Mean Squared Error:', np.sqrt(rf_mse))

-------------------------------------------------------------------------
Mean Absolute Error: 1517.953898695128
Mean Squared Error: 9045637.991390703
Root Mean Squared Error: 3007.596713555643


Importar dataset de Predicción

In [13]:
# Load the rows to predict for from a CSV file in the current working directory.
dspred = pd.read_csv("BacPrestamos30DiasPred.csv") 

In [14]:
# Encode the loan category the same way as in the training data:
# 1 = Inmuebles, 2 = Consumo. The result is assigned back to the column
# because replace(inplace=True) on a selected column is chained assignment,
# which pandas warns about and which has no effect under Copy-on-Write.
dspred['Actividad'] = (
    dspred['Actividad']
    .replace({'Inmuebles': 1, 'Consumo': 2})
    .astype(int)  # fails loudly if an unexpected category was left unencoded
)
# Keep only the model inputs: drop '1-30' (filled with predictions further
# down) and RowId. errors='ignore' is needed because this notebook later
# rewrites the same CSV without a RowId column, so on a second run the column
# is no longer there.
dspred = dspred.drop(columns=['1-30', 'RowId'], errors='ignore')

In [15]:
# Confirm that only the feature columns (Año, Mes, Actividad) remain.
dspred.head()

Unnamed: 0,Año,Mes,Actividad
0,2020,1,1
1,2020,1,2
2,2020,2,1
3,2020,2,2
4,2020,3,1


In [16]:
# Predict with the fitted random forest; dspred is expected to hold the same
# feature columns, in the same order, as the x used for training.
FinalPred=RFreg.predict(dspred)

In [17]:
# Display the predicted values, one per row of dspred.
FinalPred

array([15753.29559918, 41431.85273001, 15636.92647347, 44367.7177604 ,
       15826.68459974, 52135.96165939, 17043.84064914, 56300.58543079,
       17602.51236088, 51286.56885217, 17899.35934405, 45941.8789711 ,
       17867.81741619, 51722.79259615, 17363.00327616, 51487.2189498 ,
       16725.76334162, 55669.30016687, 16403.38036288, 53030.94331263,
       16067.32239443, 51936.72600597, 16067.32239443, 51936.72600597])

In [18]:
# Store the predictions in the '1-30' column, next to the inputs they were made from.
dspred['1-30']=FinalPred

In [19]:
# Save the predictions. index=False keeps the DataFrame's row index out of the
# file; otherwise it is written as an extra unnamed column that comes back as
# 'Unnamed: 0' the next time the file is loaded with read_csv.
# NOTE(review): this overwrites the same CSV that was read as the prediction
# input, so the original RowId column and category labels are lost — consider
# writing to a separate output file.
dspred.to_csv("BacPrestamos30DiasPred.csv", index=False)

In [20]:
# Preview the prediction frame with the new '1-30' column filled in.
dspred.head()

Unnamed: 0,Año,Mes,Actividad,1-30
0,2020,1,1,15753.295599
1,2020,1,2,41431.85273
2,2020,2,1,15636.926473
3,2020,2,2,44367.71776
4,2020,3,1,15826.6846
