In [34]:
import re
from collections import Counter
import sklearn as skl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.metrics import classification_report,plot_confusion_matrix, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

In [35]:
incidentes = pd.read_csv("../input/train_data_cleaned.csv", index_col=0)
teste = pd.read_csv("../input/test_data_cleaned.csv", index_col=0)

# Cenário 3.1 - Outliers com método de transformação LOG
* A features que seguem uma normal-padrão, são tratadas pelo método IQR
* As features que não seguem uma normal-padrão são transformadas com log

In [36]:
# função que determina se uma coluna está desviada e que a transforma conforme o caso
def check_and_transform_skewness(column_list,df):
  for col in column_list:
    skewness = df[col].skew()
    print(f'Initial skewness: {skewness}')
    if -1 < skewness < 1:
      df[col] = np.log1p(df[col])
      print(f'High skewness: {df[col].skew()}')
      print("Done")
    elif -0.3 < skewness < -1 or 0.3 < skewness < 1:
      df[col] = np.log1p(df[col])
      print(f'Moderate skewness: {df[col].skew()}')
      print("Done")
    elif -0.3 < skewness < 0.3:
      print(f"column \'{col}\' is fairly skewed.")
    return df

In [37]:
# dados, lista de colunas, flg com método de deteção, method = string com a forma de os substituir.
def process_outliers(df, columns, flag, method):
    if flag == 1:
        # Use IQR distance to detect outliers
        for col in columns:
            median = df[col].median()
            lower_quartile = df[col].quantile(0.25)
            upper_quartile = df[col].quantile(0.75)
            iqr = upper_quartile - lower_quartile

            # Find the rows with outliers
            outliers = df[(df[col] < lower_quartile - 1.5 * iqr) | (df[col] > upper_quartile + 1.5 * iqr)]
            indices = outliers.index
            #outliers_per = len(outliers)/len(df[col] * 100)

            # Process the outliers according to the specified method
            if method == "remove":
                df = df.drop(indices)
            elif method == "median":
                df.loc[indices, col] = median
            elif method == "mode":
                df.loc[indices, col] = df[col].mode()[0]
            elif method == "mean":
                df.loc[indices, col] = df[col].mean()
    elif flag == 2:
        # Use MAD to detect outliers
        for col in columns:
            median = df[col].median()
            mad = np.mean(np.abs(df[col] - median))

            # Find the rows with outliers
            outliers = df[(df[col] - median).abs() > 3*mad]
            indices = outliers.index

            # Process the outliers according to the specified method
            if method == "remove":
                df = df.drop(indices)
            elif method == "median":
                df.loc[indices, col] = median
            elif method == "mode":
                df.loc[indices, col] = df[col].mode()[0]
            elif method == "mean":
                df.loc[indices, col] = df[col].mean()
    else:
        raise ValueError("Invalid flag. Must be 1 (IQR) or 2 (MAD).")

    return df

In [38]:
not_skewed_columns = ['avg_temperature']
skewed_columns = ['delay_in_seconds', 'avg_atm_pressure','avg_humidity']

# tratar as desviadas com log transform
print("Trabalho nos dados de treino ...")
incidentes_transformed = check_and_transform_skewness(skewed_columns, incidentes)
print("Trabalho nos dados de teste ...")
teste_transformed = check_and_transform_skewness(skewed_columns, teste)
print("Done.")

# tratar as não desviadas com IQR
incidentes_moda = incidentes_transformed.copy()
incidentes_moda = process_outliers(incidentes_moda,not_skewed_columns,1,"mode")
incidentes_mediana = incidentes_transformed.copy()
incidentes_mediana = process_outliers(incidentes_mediana,not_skewed_columns,1,"median")
incidentes_media = incidentes_transformed.copy()
incidentes_media = process_outliers(incidentes_media,not_skewed_columns,1,"mean")
incidentes_remocao = incidentes_transformed.copy()
incidentes_remocao = process_outliers(incidentes_remocao,not_skewed_columns,1,"remove")

Trabalho nos dados de treino ...
Initial skewness: 6.274560871857796
Trabalho nos dados de teste ...
Initial skewness: 4.866904832456009
Done.


### Modelos

In [39]:
#divisão do target
x_moda = incidentes_moda.drop(['incidents'], axis=1)
y_moda = incidentes_moda['incidents']
#divisão do target
x_media = incidentes_media.drop(['incidents'], axis=1)
y_media = incidentes_media['incidents']
#divisão do target
x_mediana = incidentes_mediana.drop(['incidents'], axis=1)
y_mediana = incidentes_mediana['incidents']
#divisão do target
x_remocao = incidentes_remocao.drop(['incidents'], axis=1)
y_remocao = incidentes_remocao['incidents']

In [40]:
# função que constrói os modelos, apresentando as métricas
def train_and_predict(X, y, teste):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    models = [RandomForestClassifier(n_estimators=100, max_features="auto", random_state=42),
             LGBMClassifier(boosting_type='gbdt', learning_rate=0.1, n_estimators=100, random_state=42),
             GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42),
             DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=20, random_state=42),
             ExtraTreesClassifier(criterion='gini', max_depth=20, random_state=42),
             KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')]
             
    reports=[]
    acc = []
    pred = []
    names = ['RF','LGB','GB','DT','ET','KN']
    
    for i, model in enumerate(models):
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions on the test set
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred)
        reports.append(report)
        accuracy = accuracy_score(y_test, y_pred)
        acc.append(accuracy)
        test_predictions = model.predict(teste)
        pred.append(test_predictions)

    i = 0   
    #Print the reports side by side
    for i, report in enumerate(reports):
        print("Model {} {}:\n{}".format(i + 1, names[i], report))
        print("Accuracy: {:.5f}".format(acc[i]))
        print("=" * 50)
    return pred

In [41]:
train_and_predict(x_moda,y_moda,teste_transformed)

  warn(
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3f5b78e160>
Traceback (most recent call last):
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling cty

Model 1 RF:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       517
           1       0.87      0.90      0.89       189
           2       0.84      0.82      0.83       144
           3       0.93      0.86      0.89       265
           4       0.88      0.95      0.91       135

    accuracy                           0.93      1250
   macro avg       0.90      0.91      0.90      1250
weighted avg       0.93      0.93      0.93      1250

Accuracy: 0.92720
Model 2 LGB:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       517
           1       0.90      0.93      0.91       189
           2       0.86      0.82      0.84       144
           3       0.91      0.91      0.91       265
           4       0.91      0.93      0.92       135

    accuracy                           0.94      1250
   macro avg       0.91      0.91      0.91      1250
weighted avg       0.94      0.94 

[array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 1, 1, 3]),
 array([4, 0, 0, ..., 2, 1, 3]),
 array([3, 0, 0, ..., 2, 3, 3])]

In [42]:
train_and_predict(x_media,y_media,teste_transformed)

  warn(
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3fb069c820>
Traceback (most recent call last):
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling cty

Model 1 RF:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       517
           1       0.86      0.90      0.88       189
           2       0.83      0.80      0.82       144
           3       0.92      0.86      0.89       265
           4       0.88      0.95      0.91       135

    accuracy                           0.92      1250
   macro avg       0.90      0.90      0.90      1250
weighted avg       0.92      0.92      0.92      1250

Accuracy: 0.92320
Model 2 LGB:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       517
           1       0.88      0.93      0.91       189
           2       0.86      0.81      0.84       144
           3       0.93      0.90      0.92       265
           4       0.91      0.96      0.93       135

    accuracy                           0.94      1250
   macro avg       0.92      0.92      0.92      1250
weighted avg       0.94      0.94 

[array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 1, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([3, 0, 0, ..., 2, 3, 3])]

In [43]:
train_and_predict(x_mediana,y_mediana,teste_transformed)

  warn(
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3f5b6580d0>
Traceback (most recent call last):
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling cty

Model 1 RF:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       517
           1       0.88      0.91      0.90       189
           2       0.83      0.81      0.82       144
           3       0.92      0.88      0.90       265
           4       0.90      0.95      0.92       135

    accuracy                           0.93      1250
   macro avg       0.90      0.91      0.90      1250
weighted avg       0.93      0.93      0.93      1250

Accuracy: 0.92880
Model 2 LGB:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       517
           1       0.90      0.93      0.92       189
           2       0.86      0.83      0.85       144
           3       0.92      0.89      0.90       265
           4       0.90      0.95      0.92       135

    accuracy                           0.94      1250
   macro avg       0.91      0.92      0.92      1250
weighted avg       0.94      0.94 

[array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 1, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([3, 0, 0, ..., 2, 3, 3])]

In [44]:
train_and_predict(x_remocao,y_remocao,teste_transformed)

  warn(
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3fb0edaca0>
Traceback (most recent call last):
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/purp/anaconda3/envs/dataset_COMPETICAO/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling cty

Model 1 RF:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       524
           1       0.93      0.90      0.91       179
           2       0.84      0.90      0.87       141
           3       0.91      0.91      0.91       255
           4       0.95      0.91      0.93       141

    accuracy                           0.94      1240
   macro avg       0.92      0.92      0.92      1240
weighted avg       0.94      0.94      0.94      1240

Accuracy: 0.93871
Model 2 LGB:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       524
           1       0.91      0.92      0.91       179
           2       0.87      0.89      0.88       141
           3       0.94      0.92      0.93       255
           4       0.94      0.94      0.94       141

    accuracy                           0.95      1240
   macro avg       0.93      0.93      0.93      1240
weighted avg       0.95      0.95 

[array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([4, 0, 0, ..., 1, 1, 3]),
 array([4, 0, 0, ..., 3, 1, 3]),
 array([3, 0, 0, ..., 2, 3, 3])]

Itálico - melhorou com a aplicação do Método 2 (este método)

Negrito - utilizado

| **Modelo**                          | **Antes** | **Depois (moda)** | **Depois (moda) M2** | **Depois (mediana)** | **Depois (mediana) M2** | **Depois (media)** | **Depois (media)M2** | **Depois (removido)** | **Depois (removido)M2** |
|-------------------------------------|-----------|-------------------|----------------------|----------------------|-------------------------|--------------------|----------------------|-----------------------|-------------------------|
| **K Neighbors Classifier**          | 0.83360   | 0.76400           | 0.83360              | 0.82320              | 0.83360                 | 0.76480            | 0.83360              | 0.89124               | 0.84194                 |
| **Extra Trees Classifier**          | 0.90640   | 0.90400           | _0.90960_            | 0.91040              | 0.90960                 | 0.89600            | 0.91040              | 0.94864               | 0.89274                 |
| **Gradient Boosting Classifier**    | 0.90960   | 0.89760           | _0.91200_            | 0.90400              | 0.90800                 | 0.90000            | 0.90800              | 0.95670               | 0.92661                 |
| **Decision Tree Classifier**        | 0.92240   | 0.89680           | _0.92240_            | 0.91280              | 0.92240                 | 0.89760            | 0.92240              | **_0.96274_**         | 0.93710                 |
| **Random Forest Classifier**        | 0.92960   | 0.91440           | 0.92720              | 0.91920              | 0.92880                 | 0.91520            | 0.92320              | **_0.96777_**         | 0.93871                 |
| **Light Gradient Boosting Machine** | 0.93760   | 0.91920           | 0.93760              | 0.92320              | 0.93840                 | 0.92080            | **0.94000**          | **_0.96979_**         | 0.94677                 |


Decidimos não remover os outliers, apesar de ser o método que dá uma melhor accuracy. Há features em que a maioria dos registos são outliers. Eliminá-los pode levar à perda de informação importante (e overfitting), logo mantivemos e decidimos substituir se isso melhorar as métricas:

* LGBM : M2 - Media;
* RF   : Manter;
* DT   : M2 - Manter;
* GB   : M2 - Moda;

Vamos, agora, fazer um processamento individual para os top-2 modelos.

Seguir para o notebook 4.