In [69]:
import pandas as pd
import warnings
from feature_engine.encoding import OneHotEncoder

# Notebook-wide configuration.
# NOTE(review): the blanket "ignore" filter also hides pandas deprecation
# warnings raised by later cells.
warnings.simplefilter("ignore")
pd.options.display.max_columns = 39

# Load the daily weather/occurrence integration table.
caminho_arquivo = '../Black_Umbrella/dados/integracao_ocorr_diario.csv'
df = pd.read_csv(filepath_or_buffer=caminho_arquivo)

In [70]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,data,tavg,tmin,tmax,prcp,wdir,wspd,pres,distrito,latitude_distrito,longitude_distrito,ocorrencia,longitude_ocorrencia,latitude_ocorrencia
0,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Agua Rasa,-23.565372,-46.573697,,,
1,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Alto De Pinheiros,-23.549906,-46.707642,,,
2,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Anhanguera,-23.432908,-46.788534,,,
3,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Aricanduva,-23.578024,-46.511454,,,
4,2013-01-01,29.6,20.0,31.0,0.0,315.0,11.1,1017.511112,Artur Alvim,-23.540469,-46.489791,,,


In [71]:
# Count missing values per column before any imputation.
df.isnull().sum()

data                         0
tavg                         0
tmin                         0
tmax                         0
prcp                         0
wdir                         0
wspd                         0
pres                         0
distrito                    41
latitude_distrito         1074
longitude_distrito        1074
ocorrencia              369138
longitude_ocorrencia    369138
latitude_ocorrencia     369138
dtype: int64

In [72]:
# Parse 'data' as datetime so the .dt accessor can be used in later cells.
df['data'] = pd.to_datetime(df['data'])

# Rows without a recorded event get the explicit label 'sem_ocorrencia';
# labels are then normalised to lowercase snake_case.
# The result is assigned back instead of using fillna(inplace=True) on a
# column selection: that form is chained assignment, which pandas 2.x
# deprecates and which does not update the frame under Copy-on-Write.
df['ocorrencia'] = (
    df['ocorrencia']
    .fillna('sem_ocorrencia')
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

# Mean-impute the coordinate columns in one vectorised step.
# NOTE(review): the occurrence coordinates are missing for every
# 'sem_ocorrencia' row, so those rows all receive the same constant mean.
coord_cols = [
    'latitude_distrito',
    'longitude_distrito',
    'latitude_ocorrencia',
    'longitude_ocorrencia',
]
df[coord_cols] = df[coord_cols].fillna(df[coord_cols].mean())

df.head()

Unnamed: 0,data,tavg,tmin,tmax,prcp,wdir,wspd,pres,distrito,latitude_distrito,longitude_distrito,ocorrencia,longitude_ocorrencia,latitude_ocorrencia
0,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Agua Rasa,-23.565372,-46.573697,sem_ocorrencia,-46.623993,-23.570186
1,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Alto De Pinheiros,-23.549906,-46.707642,sem_ocorrencia,-46.623993,-23.570186
2,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Anhanguera,-23.432908,-46.788534,sem_ocorrencia,-46.623993,-23.570186
3,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Aricanduva,-23.578024,-46.511454,sem_ocorrencia,-46.623993,-23.570186
4,2013-01-01,29.6,20.0,31.0,0.0,315.0,11.1,1017.511112,Artur Alvim,-23.540469,-46.489791,sem_ocorrencia,-46.623993,-23.570186


In [73]:
# Binary target: 1 when any event was recorded, 0 for 'sem_ocorrencia'.
# Vectorised comparison replaces the row-wise apply(lambda ...).
df['ocorrencia_target'] = (df['ocorrencia'] != 'sem_ocorrencia').astype('int64')

# Show class balance for the raw label and for the binary target.
# The original f-string reused the same quote character inside the literal,
# which is a SyntaxError on Python versions before 3.12; print() with a
# separator produces the same text on every version.
print(
    df['ocorrencia'].value_counts(),
    df['ocorrencia_target'].value_counts(),
    sep='\n\n',
)

ocorrencia
sem_ocorrencia     369138
queda_de_arvore     44150
alagamento           6659
inundacao            4523
deslizamento         2940
Name: count, dtype: int64

ocorrencia_target
0    369138
1     58272
Name: count, dtype: int64


In [74]:
# Re-check missing values per column after the imputation step.
df.isnull().sum()

data                     0
tavg                     0
tmin                     0
tmax                     0
prcp                     0
wdir                     0
wspd                     0
pres                     0
distrito                41
latitude_distrito        0
longitude_distrito       0
ocorrencia               0
longitude_ocorrencia     0
latitude_ocorrencia      0
ocorrencia_target        0
dtype: int64

- Target Encoding substitui cada valor categórico pela **média da variável-alvo** para essa categoria. No nosso caso, vamos calcular a proporção de ocorrências em cada distrito.

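- Ajuda a capturar a relação entre a variável categórica (`distrito`) e o que estamos tentando prever (`ocorrencia_target`), sem criar muitas colunas extras. Atenção: como a média é calculada sobre todo o conjunto de dados, ela incorpora informação do próprio alvo (*target leakage*); em um fluxo de modelagem, o encoding deve ser ajustado apenas com os dados de treino.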

In [75]:
# Label missing districts as 'Unknown' and store the column as categorical.
# Assigned back rather than fillna(inplace=True) on a column selection, which
# is chained assignment and does not update the frame under Copy-on-Write.
df['distrito'] = df['distrito'].fillna('Unknown').astype('category')

# Target encoding: share of rows with an event in each district.
# observed=True restricts the groups to categories present in the data and
# avoids the pandas warning about the changing default for categoricals.
# NOTE(review): the means are computed on the whole frame; if a train/test
# split happens later, fit the encoding on the training rows only.
district_means = df.groupby('distrito', observed=True)['ocorrencia_target'].mean()

# Mapping a categorical column can yield a categorical result; cast to float
# so the encoded feature is numeric.
df['distrito_encoded'] = df['distrito'].map(district_means).astype(float)

In [76]:
# Inspect the frame with the new target and encoded-district columns.
df.head(n=5)

Unnamed: 0,data,tavg,tmin,tmax,prcp,wdir,wspd,pres,distrito,latitude_distrito,longitude_distrito,ocorrencia,longitude_ocorrencia,latitude_ocorrencia,ocorrencia_target,distrito_encoded
0,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Agua Rasa,-23.565372,-46.573697,sem_ocorrencia,-46.623993,-23.570186,0,0.098238
1,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Alto De Pinheiros,-23.549906,-46.707642,sem_ocorrencia,-46.623993,-23.570186,0,0.168444
2,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Anhanguera,-23.432908,-46.788534,sem_ocorrencia,-46.623993,-23.570186,0,0.068531
3,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Aricanduva,-23.578024,-46.511454,sem_ocorrencia,-46.623993,-23.570186,0,0.071641
4,2013-01-01,29.6,20.0,31.0,0.0,315.0,11.1,1017.511112,Artur Alvim,-23.540469,-46.489791,sem_ocorrencia,-46.623993,-23.570186,0,0.080137


- Utilizar a coluna `data` para gerar mais features: dia, mês, ano e estação do ano.

In [77]:
 # Split the timestamp into day, month and year feature columns
df = df.assign(
    dia=df['data'].dt.day,
    mes=df['data'].dt.month,
    ano=df['data'].dt.year,
)

 # Função para determinar a estação do ano
def estacao_do_ano(data):
    """Return the season label for a date-like object.

    Reads ``data.month`` and ``data.day`` and returns one of 'verao',
    'outono', 'inverno' or 'primavera' using these boundaries:
    verao 21 Dec - 20 Mar, outono 21 Mar - 20 Jun,
    inverno 21 Jun - 22 Sep, primavera 23 Sep - 20 Dec.
    Returns None when no range matches.
    """
    # (month, day) tuples compare lexicographically, i.e. in calendar order.
    marcador = (data.month, data.day)

    # Summer wraps around the end of the year, so it is the union of two ranges.
    if marcador >= (12, 21) or marcador <= (3, 20):
        return 'verao'
    if (3, 21) <= marcador <= (6, 20):
        return 'outono'
    if (6, 21) <= marcador <= (9, 22):
        return 'inverno'
    if (9, 23) <= marcador <= (12, 20):
        return 'primavera'
    return None

# Derive the 'estacao' column by labelling every date with its season.
df = df.assign(estacao=df['data'].apply(estacao_do_ano))
df.head()

Unnamed: 0,data,tavg,tmin,tmax,prcp,wdir,wspd,pres,distrito,latitude_distrito,longitude_distrito,ocorrencia,longitude_ocorrencia,latitude_ocorrencia,ocorrencia_target,distrito_encoded,dia,mes,ano,estacao
0,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Agua Rasa,-23.565372,-46.573697,sem_ocorrencia,-46.623993,-23.570186,0,0.098238,1,1,2013,verao
1,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Alto De Pinheiros,-23.549906,-46.707642,sem_ocorrencia,-46.623993,-23.570186,0,0.168444,1,1,2013,verao
2,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Anhanguera,-23.432908,-46.788534,sem_ocorrencia,-46.623993,-23.570186,0,0.068531,1,1,2013,verao
3,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Aricanduva,-23.578024,-46.511454,sem_ocorrencia,-46.623993,-23.570186,0,0.071641,1,1,2013,verao
4,2013-01-01,29.6,20.0,31.0,0.0,315.0,11.1,1017.511112,Artur Alvim,-23.540469,-46.489791,sem_ocorrencia,-46.623993,-23.570186,0,0.080137,1,1,2013,verao


In [78]:
# Number of rows per season.
df.loc[:, 'estacao'].value_counts()

estacao
verao        118747
inverno      106129
outono       105569
primavera     96965
Name: count, dtype: int64

In [79]:
# Share of rows per season, expressed as a percentage.
df['estacao'].value_counts(normalize=True).mul(100)

estacao
verao        27.782925
inverno      24.830725
outono       24.699703
primavera    22.686647
Name: proportion, dtype: float64

In [80]:
# Columns to one-hot encode; cast them to the 'category' dtype first.
cat_features = ['estacao']
df = df.astype({coluna: 'category' for coluna in cat_features})

# Replace each categorical column with one indicator column per category.
onehot = OneHotEncoder(variables=cat_features)
df = onehot.fit_transform(df)

In [82]:
# Inspect the first ten rows after one-hot encoding the season.
df.head(n=10)

Unnamed: 0,data,tavg,tmin,tmax,prcp,wdir,wspd,pres,distrito,latitude_distrito,longitude_distrito,ocorrencia,longitude_ocorrencia,latitude_ocorrencia,ocorrencia_target,distrito_encoded,dia,mes,ano,estacao_verao,estacao_outono,estacao_inverno,estacao_primavera
0,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Agua Rasa,-23.565372,-46.573697,sem_ocorrencia,-46.623993,-23.570186,0,0.098238,1,1,2013,1,0,0,0
1,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Alto De Pinheiros,-23.549906,-46.707642,sem_ocorrencia,-46.623993,-23.570186,0,0.168444,1,1,2013,1,0,0,0
2,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Anhanguera,-23.432908,-46.788534,sem_ocorrencia,-46.623993,-23.570186,0,0.068531,1,1,2013,1,0,0,0
3,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Aricanduva,-23.578024,-46.511454,sem_ocorrencia,-46.623993,-23.570186,0,0.071641,1,1,2013,1,0,0,0
4,2013-01-01,29.6,20.0,31.0,0.0,315.0,11.1,1017.511112,Artur Alvim,-23.540469,-46.489791,sem_ocorrencia,-46.623993,-23.570186,0,0.080137,1,1,2013,1,0,0,0
5,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Barra Funda,-23.525462,-46.667513,sem_ocorrencia,-46.623993,-23.570186,0,0.098468,1,1,2013,1,0,0,0
6,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Bela Vista,-23.560122,-46.650034,sem_ocorrencia,-46.623993,-23.570186,0,0.076888,1,1,2013,1,0,0,0
7,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Belem,-23.534883,-46.594939,sem_ocorrencia,-46.623993,-23.570186,0,0.058062,1,1,2013,1,0,0,0
8,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Bom Retiro,-23.525163,-46.638164,sem_ocorrencia,-46.623993,-23.570186,0,0.08394,1,1,2013,1,0,0,0
9,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Bras,-23.545326,-46.616444,sem_ocorrencia,-46.623993,-23.570186,0,0.053263,1,1,2013,1,0,0,0
