# Import Libraries

In [1]:
import pandas as pd
import matplotlib as pt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
%matplotlib inline
pt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Import Dataset

In [2]:
# Read the source CSV into a DataFrame; the path is relative to the
# notebook's working directory.
prestamos90dias = pd.read_csv(filepath_or_buffer="BacPrestamos90Dias.csv")

In [3]:
# Preview the first five rows of the freshly loaded frame.
prestamos90dias.head(n=5)

Unnamed: 0,RowId,Año,Mes,Actividad,61-90
0,2148,2016,11,Consumo,7003796000.0
1,2158,2016,12,Inmuebles,1275202000.0
2,2166,2016,12,Consumo,6337330000.0
3,2176,2017,1,Inmuebles,1320492000.0
4,2184,2017,1,Consumo,7833737000.0


# Data Cleansing

In [4]:
# Encode the activity as a numeric category: 1 = Inmuebles, 2 = Consumo.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on the selected column: that chained in-place
# form acts on a temporary object and leaves the frame unchanged when pandas
# Copy-on-Write is active (it already emits a FutureWarning on pandas 2.x).
prestamos90dias['Actividad'] = prestamos90dias['Actividad'].replace(
    {'Inmuebles': 1, 'Consumo': 2}
)
# Remove the RowId column so it is not among the columns used as features.
prestamos90dias = prestamos90dias.drop(columns=['RowId'])

In [5]:
# List the remaining column names in their current order
# (this cell only inspects the columns; nothing is reordered).
list(prestamos90dias.columns)

['Año', 'Mes', 'Actividad', '61-90']

In [6]:
# Inspect the last five rows after cleansing.
prestamos90dias.tail(n=5)

Unnamed: 0,Año,Mes,Actividad,61-90
311,2016,9,1,845754200.0
312,2016,9,2,8500154000.0
313,2016,10,1,1258309000.0
314,2016,10,2,7457556000.0
315,2016,11,1,1193932000.0


# Definir Conjuntos de Datos 

In [7]:
# Positional split of the cleaned frame: the first three columns form the
# feature matrix and the fourth column is the regression target.
feature_count = 3
x = prestamos90dias.iloc[:, :feature_count]
y = prestamos90dias.iloc[:, feature_count]

In [8]:
# Preview the first five rows of the feature matrix.
x.head(n=5)

Unnamed: 0,Año,Mes,Actividad
0,2016,11,2
1,2016,12,1
2,2016,12,2
3,2017,1,1
4,2017,1,2


In [9]:
# Preview the first five values of the target series.
y.head(n=5)

0    7.003796e+09
1    1.275202e+09
2    6.337330e+09
3    1.320492e+09
4    7.833737e+09
Name: 61-90, dtype: float64

In [10]:
# Hold out 30% of the rows for evaluation; random_state=0 keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)
# Display the shape of each partition as a sanity check.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((221, 3), (95, 3), (221,), (95,))

# Modelos de Regresión

Usando Linear Regression

In [11]:
# Fit a linear regression on the training split and predict the hold-out split.
lreg = LinearRegression()
lreg.fit(x_train, y_train)
lpred = lreg.predict(x_test)

# Each metric is evaluated once on the hold-out predictions and reused below.
lin_mse = mean_squared_error(y_test, lpred)
lin_r2 = r2_score(y_test, lpred)
lin_mae = mean_absolute_error(y_test, lpred)

# Fitted coefficients, one per feature column.
print('Coefficients: \n', lreg.coef_)
# Mean squared error, fixed to two decimals.
print('Mean squared error: %.2f' % lin_mse)
# Coefficient of determination (R^2): 1 is a perfect prediction.
print('Coefficient of determination: %.2f' % lin_r2)
print('-------------------------------------------------------------------------')
print('Mean Absolute Error:', lin_mae)
print('Mean Squared Error:', lin_mse)
print('Root Mean Squared Error:', np.sqrt(lin_mse))

Coefficients: 
 [5.97198151e+08 7.70588696e+07 4.00037758e+09]
Mean squared error: 4388294177128680960.00
Coefficient of determination: 0.71
-------------------------------------------------------------------------
Mean Absolute Error: 1565868382.9599316
Mean Squared Error: 4.388294177128681e+18
Root Mean Squared Error: 2094825572.005622


Usando Random Forest Regressor

In [12]:
# Fit a random forest on the training split and predict the hold-out split.
# random_state is fixed (same seed as the train/test split) because the forest
# is built from random bootstrap samples: without a seed the metrics below and
# the final predictions change on every run, so the notebook is not reproducible.
RFreg = RandomForestRegressor(random_state=0)
RFreg = RFreg.fit(x_train, y_train)
RFpred = RFreg.predict(x_test)

# The squared error is computed once and reused for the RMSE.
rf_mse = mean_squared_error(y_test, RFpred)
print('-------------------------------------------------------------------------')
print('Mean Absolute Error:', mean_absolute_error(y_test, RFpred))
print('Mean Squared Error:', rf_mse)
print('Root Mean Squared Error:', np.sqrt(rf_mse))

-------------------------------------------------------------------------
Mean Absolute Error: 431443903.4911915
Mean Squared Error: 5.090138201619227e+17
Root Mean Squared Error: 713452044.75278


Importar dataset de Predicción

In [13]:
# Read the rows to be predicted into a DataFrame; the path is relative to the
# notebook's working directory.
dspred = pd.read_csv(filepath_or_buffer="BacPrestamos90DiasPred.csv")

In [14]:
# Encode the activity with the same numeric categories applied to the training
# frame: 1 = Inmuebles, 2 = Consumo.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on the selected column: that chained in-place
# form acts on a temporary object and leaves the frame unchanged when pandas
# Copy-on-Write is active (it already emits a FutureWarning on pandas 2.x).
dspred['Actividad'] = dspred['Actividad'].replace(
    {'Inmuebles': 1, 'Consumo': 2}
)
# Remove the '61-90' target column and the RowId column so only the
# columns passed to the model remain.
dspred = dspred.drop(columns=['61-90', 'RowId'])

In [15]:
# Preview the first five rows that will be fed to the model.
dspred.head(n=5)

Unnamed: 0,Año,Mes,Actividad
0,2020,1,1
1,2020,1,2
2,2020,2,1
3,2020,2,2
4,2020,3,1


In [16]:
# Predict with exactly the columns the model was trained on, in training order.
# Selecting them explicitly keeps this cell re-runnable after the '61-90'
# prediction column has been added to dspred, and guards against the CSV
# listing its columns in a different order than the training data.
FinalPred = RFreg.predict(dspred[x_train.columns])

In [17]:
# Display the array of predictions returned by RFreg.predict.
FinalPred

array([3.75478977e+09, 1.15975937e+10, 3.68074275e+09, 1.22554386e+10,
       3.67040061e+09, 1.22750034e+10, 3.66574420e+09, 1.29873517e+10,
       3.69556787e+09, 1.34119089e+10, 3.75530163e+09, 1.48278907e+10,
       3.75148453e+09, 1.50148523e+10, 3.69271647e+09, 1.50654792e+10,
       3.67991370e+09, 1.53076681e+10, 3.75501912e+09, 1.49790780e+10,
       3.75347101e+09, 1.53670975e+10, 3.75347101e+09, 1.53389780e+10])

In [18]:
# Attach the predictions to the frame as the '61-90' column.
dspred = dspred.assign(**{'61-90': FinalPred})

In [19]:
# Write the predictions to a separate results file instead of overwriting the
# input "BacPrestamos90DiasPred.csv": overwriting removed the RowId column
# from the input and added an unnamed index column, so a second run of the
# notebook failed when dropping 'RowId'. index=False omits the row index,
# which is only the default positional index from read_csv.
dspred.to_csv("BacPrestamos90DiasPredResultado.csv", index=False)

In [20]:
# Preview the first five rows, now including the predicted '61-90' column.
dspred.head(n=5)

Unnamed: 0,Año,Mes,Actividad,61-90
0,2020,1,1,3754790000.0
1,2020,1,2,11597590000.0
2,2020,2,1,3680743000.0
3,2020,2,2,12255440000.0
4,2020,3,1,3670401000.0
