In [57]:
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)#to display complete dataframe
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model

In [32]:
#Read the 'indice de cartera vencida' file, and set the col 0 ('Fecha') be the dataframe index.
indice_df = pd.read_excel('icv_mensual.xlsx', index_col=0)

In [20]:
indice_df.head()

Unnamed: 0_level_0,ICV_cartera_total
Fecha,Unnamed: 1_level_1
2002-01-01,0.106834
2002-02-01,0.106372
2002-03-01,0.105889
2002-04-01,0.104348
2002-05-01,0.10437


In [97]:
#read the variables dataset
variables_df = pd.read_excel('variables_macro_trimestral.xlsx', index_col=0)

In [98]:
variables_df.head()

Unnamed: 0_level_0,Desempleo,IPC,TRM,Exportaciones,Importaciones,PIB
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-03-01,0.200207,,2278.78,,,
2001-06-01,0.181627,,2305.66,,,
2001-09-01,0.177623,,2328.23,,,
2001-12-01,0.166253,,2306.9,,,
2002-03-01,0.190028,,2282.33,,,


In [99]:
#add an incremental column number
variables_df['count'] = range(0, len(variables_df.index))
variables_df.head()

Unnamed: 0_level_0,Desempleo,IPC,TRM,Exportaciones,Importaciones,PIB,count
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-03-01,0.200207,,2278.78,,,,0
2001-06-01,0.181627,,2305.66,,,,1
2001-09-01,0.177623,,2328.23,,,,2
2001-12-01,0.166253,,2306.9,,,,3
2002-03-01,0.190028,,2282.33,,,,4


In [100]:
#regression between count and IPC
#The following code generates a df with the neccesary data to make the regression
ipc_model_data = variables_df[['count', 'IPC']][variables_df.IPC.notnull()]
ipc_model_data.head()

Unnamed: 0_level_0,count,IPC
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2003-03-01,8,51.51
2003-06-01,9,52.33
2003-09-01,10,52.53
2003-12-01,11,53.07
2004-03-01,12,54.71


In [101]:
#create x and y arrays, we must reshape it because LinearRegression() object only accepts array like
#imputs
x = ipc_model_data['count'].values.reshape(-1, 1)
y = ipc_model_data['IPC'].values.reshape(-1, 1)

In [102]:
#Create the Linear Regression model to estimate the missing data
regression_model = linear_model.LinearRegression()
#fit the model
regression_model.fit(X = x, y = y)

LinearRegression()

In [103]:
#generate the x values to be predicted (the 'count' values)
x_to_predict = variables_df['count'][variables_df.IPC.isnull()].values.reshape(-1,1)
x_to_predict

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7]])

In [104]:
#Fill the nulls with the predictions
predictions = regression_model.predict(x_to_predict)
#We need to transform the predictions array into a numeric list.
predictions = [float(i) for i in predictions] 


In [110]:
df_with_predictions = variables_df.copy() #create a copy to save the predictions
df_with_predictions.IPC[variables_df.IPC.isnull()] = predictions
df_with_predictions

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Desempleo,IPC,TRM,Exportaciones,Importaciones,PIB,count
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-03-01,0.200207,44.307221,2278.78,,,,0
2001-06-01,0.181627,45.076797,2305.66,,,,1
2001-09-01,0.177623,45.846372,2328.23,,,,2
2001-12-01,0.166253,46.615947,2306.9,,,,3
2002-03-01,0.190028,47.385523,2282.33,,,,4
2002-06-01,0.177497,48.155098,2364.25,,,,5
2002-09-01,0.179465,48.924673,2751.23,,,,6
2002-12-01,0.157456,49.694249,2814.89,,,,7
2003-03-01,0.177929,51.51,2959.01,,,,8
2003-06-01,0.169987,52.33,2826.95,,,,9


In [107]:
x1 = [1, 2]
x2 = [3, 4]
yi=x1+x2
yi

[1, 2, 3, 4]

In [39]:
#the .isnull() or .notnull() commands generate a series of 'TRUES' and 'FALSES' where the criteria
#is met. 
variables_df.loc[variables_df['IPC'].isnull(), 'IPC']


Fecha
2001-03-01   NaN
2001-06-01   NaN
2001-09-01   NaN
2001-12-01   NaN
2002-03-01   NaN
2002-06-01   NaN
2002-09-01   NaN
2002-12-01   NaN
Name: IPC, dtype: float64

In [36]:
variables_df['IPC'].isnull()

Fecha
2001-03-01     True
2001-06-01     True
2001-09-01     True
2001-12-01     True
2002-03-01     True
              ...  
2019-12-01    False
2020-03-01    False
2020-06-01    False
2020-09-01    False
2020-12-01    False
Name: IPC, Length: 80, dtype: bool