Usando base de dados do [DATASUS TABNET](https://datasus.saude.gov.br/informacoes-de-saude-tabnet/) sobre [casos de Tuberculose](http://tabnet.datasus.gov.br/cgi/tabcgi.exe?sinannet/cnv/tubercbr.def).

In [1]:
import pandas as pd
from datetime import date
import plotly.graph_objects as go
import numpy as np
from sklearn.linear_model import LinearRegression
import time

In [2]:
url_tuberculose = 'https://raw.githubusercontent.com/LucasGabrielB/Alura-Bootcamp-Data-Science-Aplicada/main/Modulo-04/datasets/TUBERCULOSE%20-%20CASOS%20CONFIRMADOS%20NOTIFICADOS%20NO%20SISTEMA%20DE%20INFORMA%C3%87%C3%83O%20DE%20AGRAVOS%20DE%20NOTIFICA%C3%87%C3%83O%20-%20BRASIL.csv'

df_tuberculose = pd.read_csv(url_tuberculose, skiprows=3, skipfooter=17, sep=';', encoding='ISO-8859-1', engine='python')

df_tuberculose.head()

Unnamed: 0,Ano Diagnóstico,Jan,Fev,Mar,Abr,Mai,Jun,Jul,Ago,Set,Out,Nov,Dez,Total
0,2001,8088,6542,8095,7305,7656,6805,6987,8065,6697,7524,6874,6627,87265
1,2002,8013,7346,7961,8771,7784,6713,7746,8330,7654,8200,7532,6809,92859
2,2003,8115,7985,7474,7975,8024,7082,7949,7585,8136,8521,7624,7303,93773
3,2004,7574,6743,8574,8089,7763,7314,7825,8232,7901,8016,7753,7196,92980
4,2005,7432,6748,8441,7958,8019,7781,7282,8391,7690,7186,7587,7541,92056


In [3]:
convert_months = {
    'Jan': 1,
    'Fev': 2,
    'Mar': 3,
    'Abr': 4,
    'Mai': 5,
    'Jun': 6,
    'Jul': 7,
    'Ago': 8,
    'Set': 9,
    'Out': 10,
    'Nov': 11,
    'Dez': 12,
}

In [4]:
# transformando o DataFrame em uma série temporal
df_tuberculose_ts = df_tuberculose.drop('Total', axis=1).melt(value_name='Casos', id_vars=['Ano Diagnóstico'], var_name='Mês')
df_tuberculose_ts['Data'] = df_tuberculose_ts.apply(lambda row: date(row['Ano Diagnóstico'], convert_months[row['Mês']], 1), axis=1)
df_tuberculose_ts.drop(['Ano Diagnóstico', 'Mês'], axis=1, inplace=True)
df_tuberculose_ts.sort_values('Data', inplace=True)

df_tuberculose_ts

Unnamed: 0,Casos,Data
0,8088,2001-01-01
19,6542,2001-02-01
38,8095,2001-03-01
57,7305,2001-04-01
76,7656,2001-05-01
...,...,...
151,8239,2019-08-01
170,8143,2019-09-01
189,8592,2019-10-01
208,7615,2019-11-01


In [5]:
# criando media movel
df_tuberculose_ts['Casos - Média Móvel'] = df_tuberculose_ts['Casos'].rolling(8, center=True).mean()

In [6]:
# adicionando coluna com o timestamp
df_tuberculose_ts['Data - timestamp'] = df_tuberculose_ts['Data'].map(lambda d: time.mktime(d.timetuple()))

In [7]:
# treinando modelo linear
df_temp = df_tuberculose_ts.dropna()
y = np.array(df_temp[-12:]['Casos']).reshape(-1, 1)
X = np.array(df_temp[-12:]['Data - timestamp']).reshape(-1, 1)

reg = LinearRegression().fit(X, y)

In [8]:
def predict_cases(year: int, month: int) -> int:
    return int(reg.predict(np.array([[time.mktime(date(year, month, 1).timetuple())]])))

In [9]:
# dataframe com as previsoes para o proximo ano, 2020
df_tuberculose_predict = pd.DataFrame(
    [{'Casos': predict_cases(2020, month), 
      'Data': date(2020, month, 1)}
        for month in range(1, 13)]
)

df_tuberculose_predict

Unnamed: 0,Casos,Data
0,8202,2020-01-01
1,8224,2020-02-01
2,8246,2020-03-01
3,8268,2020-04-01
4,8290,2020-05-01
5,8313,2020-06-01
6,8335,2020-07-01
7,8357,2020-08-01
8,8380,2020-09-01
9,8402,2020-10-01


In [10]:
fig = go.Figure()

fig.add_trace(go.Scatter(line={'color': '#ff3826'},
                         name='Casos',
                         y=df_tuberculose_ts['Casos'],
                         x=df_tuberculose_ts['Data']))

fig.add_trace(go.Scatter(line={'color': 'black'},
                         name='Média Móvel (8)',
                         y=df_tuberculose_ts['Casos - Média Móvel'],
                         x=df_tuberculose_ts['Data']))

fig.add_trace(go.Scatter(line={'color': 'blue'},
                         name='Previsão',
                         y=df_tuberculose_predict['Casos'],
                         x=df_tuberculose_predict['Data']))

fig.update_traces(mode='markers+lines', hovertemplate=None)
fig.update_layout(
    hovermode='x',
    title={
        'text': 'Casos de Tuberculose ao longo dos anos no Brasil',
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    yaxis_title='Número de casos',
    font={
        'family': 'Courier New, monospace',
        'size': 16
    }
)

fig.show()