# Reporte analítica de datos - Proyecto de Grado II

# Modelamiento de los datos

In [11]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
import datetime as datetime

In [None]:
data = pd.read_csv('../dengue_data/Data.csv')
df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df['Date'])
df['Age'] = df['Age'].astype(float)
df.set_index('Id', inplace=True)
df.head(10)

In [None]:
def assign_zeros(nb_df) :
    day_delta = datetime.timedelta(days=1)
    start_date = datetime.date(2010, 1, 1)
    end_date = datetime.date(2020, 1, 1)

    list = []

    for i in range((end_date - start_date).days):
        date = start_date + i*day_delta
        try: 
            list.append(nb_df.at[date,'Cases'])
        except:  
            list.append(0)
            
    final_model = pd.DataFrame(columns=('Date', 'Cases'))

    for i in range((end_date - start_date).days):
        date = start_date + i*day_delta
        final_model.loc[len(final_model)]=[date, list[i]]
        
    return final_model

### Técnica de modelado: Autoregresión (AR)

In [54]:
test = [0, 0, 0, 4, 1, 0, 0, 0, 1, 1]
pred = [0, 1, 1, 4, 1, 0, 0, 0, 1, 0]

print(accuracy(test, pred))

10
0.3


In [53]:
def accuracy(test, predictions) :
    accuracy = 0
    for i in range(0, len(predictions)) :
        x = test[i]
        y = predictions[i]
        if(x != 0) :
            if(x == y) :
                accuracy += 1
    return round(accuracy/aux, 3)

In [None]:
def AR(series, nb_name, max_prediciton_size, optime_lags, flag, plot) :
    
    if(flag):
        fecha_1 = datetime.date(2016, 12, 31)
        fecha_2 = datetime.date(2017, 1, 1)
    else:
        fecha_1 = datetime.date(2015, 12, 31)
        fecha_2 = datetime.date(2016, 1, 1)

    df_train = series.loc[: fecha_1]
    df_test = series.loc[fecha_2: ]

    y1 = df_train.Cases.to_numpy()
    y2 = df_test.Cases.to_numpy()

    # Entrenamiento del modelo
    train, test = y1, y2

    # train autoregression
    window = optime_lags
    model = AutoReg(train, lags=optime_lags)
    model_fit = model.fit()
    coef = model_fit.params
    
    # walk forward over time steps in test
    history = train[len(train)-window:]
    history = [history[i] for i in range(len(history))]
    predictions = list()
    for t in range(len(test)):
        length = len(history)
        lag = [history[i] for i in range(length-window,length)]
        yhat = coef[0]
        for d in range(window):
            yhat += coef[d+1] * lag[window-d-1]
        obs = test[t]
        predictions.append(abs(np.round(float(yhat))))
        history.append(obs)
    test = test[-max_prediciton_size:]
    predictions = predictions[0:max_prediciton_size]
    rmse = np.sqrt(mean_squared_error(test, predictions))
    real_cases = sum(i for i in test if i != 0) 
    number_of_predictions = sum(1 for i in predictions if i != 0) 
    accuracy = accuracy(test, predictions)
    
    if(plot) :
        print('Test RMSE: %.3f' % rmse, '- Accuracy: ', accuracy, '- Total predictions:', number_of_predictions,
          '- Real cases:',real_cases, '- LAGS:', optime_lags, '- Days to predict:', max_prediciton_size)
        pyplot.figure(figsize=(25,5))
        pyplot.plot(test)
        pyplot.plot(predictions, color='red')
        pyplot.title(nb_name)  
        pyplot.ylabel("Número de casos") 
        pyplot.xlabel("Días a predecir")
        pyplot.show()
    
    return real_cases, number_of_predictions, rmse, accuracy



In [None]:
def evaluate_model(neighborhoods_df, city) :
    flag = False
    array_lags = []
    array_days_to_predict = []
    array_rmse = []
    array_accuracy = []
    array_real_cases = []
    array_number_of_predictions = []
    array_neighborhoods = []
    array_cities = []
    values_for_lags = np.arange(600, 1100, 50)
    values_for_days = np.arange(100, 1100, 50)
    for neighborhood_name in neighborhoods_df.index :
        if city == 'Buga' :
            flag = True
            nb_df = dfBuga[dfBuga.Neighborhood == neighborhood_name]
        elif city == 'Giron' :
            nb_df = dfGiron[dfGiron.Neighborhood == neighborhood_name]
        else :
            nb_df = dfYopal[dfYopal.Neighborhood == neighborhood_name]
        nb_df = pd.DataFrame(nb_df['Date'].value_counts().sort_index())
        nb_df.columns = ['Cases']
        nb_df.index.name = 'Date'
        df_zeros = assign_zeros(nb_df)
        df_zeros.set_index('Date',inplace=True) 

        for lags in values_for_lags :
            for days_to_predict in values_for_days :
                real_cases, number_of_predictions, rmse, accuracy = AR(df_zeros, neighborhood_name, days_to_predict, lags, flag, False)
                array_lags.append(lags)
                array_days_to_predict.append(days_to_predict)
                array_rmse.append(rmse)
                array_real_cases.append(real_cases)
                array_number_of_predictions.append(number_of_predictions)
                array_neighborhoods.append(neighborhood_name)
                array_cities.append(city)
                array_accuracy.append(accuracy)
   
    data = {'City': array_cities, 'Neighborhood': array_neighborhoods,
            'Real Cases': array_real_cases, 'Number of predictions': array_number_of_predictions,
            'Lags': array_lags, 'Days to predict': array_days_to_predict, 'RMSE': array_rmse, 'Accuracy': array_accuracy}
    
    df_data = pd.DataFrame(data=data)
    return df_data

## ENTRENAMIENTO

In [8]:
test = [1, 1, 4, 4, 0, 0, 4, 0]
predictions = [1, 1, 3, 4, 0, 1, 0, 1]
print('accuracy: ',accuracy(test, predictions))

1
1
3
4
0
1
0
1
accuracy:  0.6


### BUGA

In [None]:
dfBuga = df[df.City == 'Buga']

In [None]:
neighborhoods = dfBuga['Neighborhood'].value_counts()
neighborhoods = neighborhoods.to_frame(name='Cases')
neighborhoods = neighborhoods[neighborhoods.Cases >= 100]

In [None]:
evaluate_model(neighborhoods, 'Buga').to_csv('buga_models.csv', index=False)

### GIRÓN

In [None]:
dfGiron = df[df.City == 'Girón']

In [None]:
neighborhoods = dfGiron['Neighborhood'].value_counts()
neighborhoods = neighborhoods.to_frame(name='Cases')
neighborhoods = neighborhoods[neighborhoods.Cases >= 204]

In [None]:
evaluate_model(neighborhoods, 'Giron').to_csv('giron_models.csv', index=False)

### YOPAL

In [None]:
dfYopal = df[df.City == 'Yopal']

In [None]:
neighborhoods = dfYopal['Neighborhood'].value_counts()
neighborhoods = neighborhoods.to_frame(name='Cases')
neighborhoods = neighborhoods[neighborhoods.Cases >= 204]

In [None]:
evaluate_model(neighborhoods, 'Yopal').to_csv('yopal_models.csv', index=False)

## SELECCIÓN DEL MEJOR MODELO

### BUGA

In [None]:
buga_models = pd.read_csv('models/buga_models.csv', index_col= 0)
df_buga_models = pd.DataFrame(buga_models)

In [None]:
first_four_neighborhoods = ['FUENMAYOR', 'SANTA BARBARA', 'BALBOA', 'LA HONDA']
df_buga_models = df_buga_models[df_buga_models.Neighborhood.isin(first_four_neighborhoods)]
df_buga_models = df_buga_models[df_buga_models.RMSE <= 0.20]

In [None]:
df_buga_models.sort_values(by=['Lags', 'Days to predict', 'RMSE'])

El mejor modelo para Buga es Lags=750, Days to predict=850

#### BARRIO FUENMAYOR

In [None]:
df_fuenmayor = dfBuga[dfBuga.Neighborhood == "FUENMAYOR"]
df_fuenmayor = pd.DataFrame(df_fuenmayor['Date'].value_counts().sort_index())
df_fuenmayor.columns = ['Cases']
df_fuenmayor.index.name = 'Date'
df_fuenmayor = assign_zeros(df_fuenmayor)
df_fuenmayor.set_index('Date',inplace=True) 

In [None]:
AR(df_fuenmayor, "FUENMAYOR", 850, 750, True)

#### BARRIO SANTA BÁRBARA

In [None]:
df_santabarbara = dfBuga[dfBuga.Neighborhood == "SANTA BARBARA"]
df_santabarbara = pd.DataFrame(df_santabarbara['Date'].value_counts().sort_index())
df_santabarbara.columns = ['Cases']
df_santabarbara.index.name = 'Date'
df_santabarbara = assign_zeros(df_santabarbara)
df_santabarbara.set_index('Date',inplace=True) 

In [None]:
AR(df_santabarbara, "SANTA BARBARA", 850, 750, True, True)

#### BARRIO LA HONDA

In [None]:
df_lahonda = dfBuga[dfBuga.Neighborhood == "LA HONDA"]
df_lahonda = pd.DataFrame(df_lahonda['Date'].value_counts().sort_index())
df_lahonda.columns = ['Cases']
df_lahonda.index.name = 'Date'
df_lahonda = assign_zeros(df_lahonda)
df_lahonda.set_index('Date',inplace=True) 

In [None]:
AR(df_lahonda, "LA HONDA", 850, 750, True, True)

#### BARRIO BALBOA

In [None]:
df_balboa = dfBuga[dfBuga.Neighborhood == "BALBOA"]
df_balboa = pd.DataFrame(df_balboa['Date'].value_counts().sort_index())
df_balboa.columns = ['Cases']
df_balboa.index.name = 'Date'
df_balboa = assign_zeros(df_balboa)
df_balboa.set_index('Date',inplace=True) 

In [None]:
AR(df_balboa, "BALBOA", 850, 750, True, True)