### Modelo de Classificação Binária

In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier

# Suppress every warning for the rest of the session so library notices do not clutter cell output.
warnings.filterwarnings(action="ignore")

# Allow up to 39 columns to be rendered when a DataFrame is displayed.
pd.options.display.max_columns = 39

# Read the pre-processed CSV; the path is relative to the notebook's working directory.
caminho_arquivo = '../Black_Umbrella/dados/integracao_ocorr_diario_pre_processado.csv'
df = pd.read_csv(filepath_or_buffer=caminho_arquivo)

In [64]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,data,tavg,tmin,tmax,prcp,wdir,wspd,pres,distrito,latitude_distrito,longitude_distrito,ocorrencia,longitude_ocorrencia,latitude_ocorrencia,ocorrencia_target,distrito_encoded,dia,mes,ano,estacao_verao,estacao_outono,estacao_inverno,estacao_primavera
0,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Agua Rasa,-23.565372,-46.573697,sem_ocorrencia,-46.623993,-23.570186,0,0.098238,1,1,2013,1,0,0,0
1,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Alto De Pinheiros,-23.549906,-46.707642,sem_ocorrencia,-46.623993,-23.570186,0,0.168444,1,1,2013,1,0,0,0
2,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Anhanguera,-23.432908,-46.788534,sem_ocorrencia,-46.623993,-23.570186,0,0.068531,1,1,2013,1,0,0,0
3,2013-01-01,25.5,21.9,31.0,0.0,316.0,16.8,1017.511112,Aricanduva,-23.578024,-46.511454,sem_ocorrencia,-46.623993,-23.570186,0,0.071641,1,1,2013,1,0,0,0
4,2013-01-01,29.6,20.0,31.0,0.0,315.0,11.1,1017.511112,Artur Alvim,-23.540469,-46.489791,sem_ocorrencia,-46.623993,-23.570186,0,0.080137,1,1,2013,1,0,0,0


In [69]:
# Features for the binary model: remove the raw date and district name, the occurrence
# category and its coordinates, and the binary label itself.
X = df.drop(columns=['data', 'distrito', 'ocorrencia', 'longitude_ocorrencia', 'latitude_ocorrencia', 'ocorrencia_target'])
y = df['ocorrencia_target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The split frames are derived from X. Copy them explicitly so the column assignments
# below operate on independent frames and cannot trigger pandas' SettingWithCopyWarning
# (which would go unnoticed here because this notebook silences all warnings).
X_train = X_train.copy()
X_test = X_test.copy()

# Numeric columns to standardise (presumably the weather measurements); every other
# feature column is left as it is.
colunas_escalar = ['tavg', 'tmin', 'tmax', 'prcp', 'wdir', 'wspd', 'pres']

# Instantiate the scaler
scaler = StandardScaler()

# Fit on the training rows only, then apply the same statistics to the test rows,
# so nothing about the test set influences the scaling.
X_train_scaled = scaler.fit_transform(X_train[colunas_escalar])
X_test_scaled = scaler.transform(X_test[colunas_escalar])

# Replace the original columns with their scaled versions
X_train[colunas_escalar] = X_train_scaled
X_test[colunas_escalar] = X_test_scaled

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((341928, 17), (85482, 17), (341928,), (85482,))

- Validando os melhores parâmetros para o modelo `RandomForestClassifier`

In [70]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Candidate hyper-parameter values; the search evaluates every combination.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    class_weight=['balanced', None],
)

# Base estimator; the fixed seed keeps each fit reproducible.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search scored by weighted F1 with 3-fold cross-validation, using all cores.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print("Melhores parâmetros:", grid_search.best_params_)
print("Melhor F1-score:", grid_search.best_score_)


KeyboardInterrupt: 

In [None]:
# Weight class 1 (occurrence) three times as heavily as class 0 so that errors on it
# cost more during training; the aim is better recall and F1-score for that class.
rf_model = RandomForestClassifier(
    class_weight={0: 1, 1: 3},
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)

# Predict on the held-out rows and print the per-class metrics
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

- Validando o modelo com Matriz de Confusão

In [None]:
# Compute the confusion matrix once and print the stored result instead of
# recomputing it; `matriz` is also what the plotting cell consumes.
matriz = confusion_matrix(y_test, y_pred)
print(matriz)

In [None]:
# Render the confusion matrix with readable class labels (0 -> no occurrence, 1 -> occurrence).
visualizacao = ConfusionMatrixDisplay(confusion_matrix=matriz,
                                      display_labels=['sem ocorrencia', 'ocorrencia'])
visualizacao.plot()

# The predictions behind `matriz` come from the RandomForestClassifier fitted in this
# notebook, so the title names that model.
plt.title("Matriz de Confusão - RandomForestClassifier");

In [None]:
import joblib

# Salvar o modelo treinado
# joblib.dump(rf_model, 'modelo_ocorrencias2.pkl')

### Modelo de Classificação Multi-Classe

In [None]:
# Number of rows per occurrence category (shows how the classes are distributed).
df.loc[:, 'ocorrencia'].value_counts()

In [None]:
# Encode the occurrence category as integer class ids with LabelEncoder
label_encoder = LabelEncoder()
df['ocorrencia_encoded'] = label_encoder.fit_transform(df['ocorrencia'])

# Drop the raw date and district name, the occurrence coordinates, and EVERY
# representation of the label: 'ocorrencia', 'ocorrencia_target' and the
# 'ocorrencia_encoded' column created just above. Leaving the encoded label among
# the features would hand the answer to the model (target leakage).
X = df.drop(columns=['data', 'distrito', 'ocorrencia', 'ocorrencia_target', 'ocorrencia_encoded',
                     'longitude_ocorrencia', 'latitude_ocorrencia'])
y = df['ocorrencia_encoded']

# Split into train (80%) and test (20%); the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-vs-Rest wrapper around a Random Forest base estimator
ovr_classifier = OneVsRestClassifier(RandomForestClassifier(random_state=42))

# Train the model
ovr_classifier.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = ovr_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

- Definindo os melhores parâmetros com `GridSearchCV`

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter values to try; all combinations are evaluated.
param_grid = {
    'class_weight': ['balanced', None],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}

# Seeded Random Forest used as the estimator under search
rf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated search scored by weighted F1, parallelised over all cores
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)

# Fit on whatever X_train / y_train are currently defined in the notebook
grid_search.fit(X_train, y_train)

# Show the best combination and its cross-validated score
print("Melhores parâmetros:", grid_search.best_params_)
print("Melhor F1-score:", grid_search.best_score_)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Target is the occurrence category. The features must exclude the label and every
# column derived from it: besides 'ocorrencia' and 'ocorrencia_encoded', that includes
# 'ocorrencia_target', the binary label used by the binary model in this notebook.
# Keeping it in X would leak the answer into the features.
X = df.drop(columns=['ocorrencia', 'ocorrencia_target', 'ocorrencia_encoded', 'distrito', 'data',
                     'longitude_ocorrencia', 'latitude_ocorrencia'])
y = df['ocorrencia']

# Split the data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-vs-Rest classifier built on a Random Forest with the chosen hyper-parameters
ovr_classifier = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=5,
    class_weight=None,
    random_state=42
))

# Train the model
ovr_classifier.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = ovr_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Target is the occurrence category. The features must exclude the label and every
# column derived from it: besides 'ocorrencia' and 'ocorrencia_encoded', that includes
# 'ocorrencia_target', the binary label used by the binary model in this notebook.
# Keeping it in X would leak the answer into the features.
X = df.drop(columns=['ocorrencia', 'ocorrencia_target', 'ocorrencia_encoded', 'distrito', 'data',
                     'longitude_ocorrencia', 'latitude_ocorrencia'])
y = df['ocorrencia']

# Split the data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample with SMOTE on the training split only, so the test set keeps its
# original class distribution
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# One-vs-Rest classifier built on a Random Forest with the chosen hyper-parameters
ovr_classifier = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=5,
    class_weight=None,
    random_state=42
))

# Train on the resampled training data
ovr_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the untouched test rows
y_pred = ovr_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))