In [1]:
import pandas as pd

In [2]:
# Load the full training set. low_memory=False asks pandas to infer each column's
# dtype from the whole file instead of chunk by chunk.
train_data = pd.read_csv("data/train.csv", low_memory=False)

# Small random preview used for quick inspection below.
# - random_state pins the draw so the preview (and anything printed from it) is
#   identical after Restart & Run All.
# - min(...) keeps the cell working if the file has fewer than 1000 rows, where
#   sample(1000) would raise.
train_sample = train_data.sample(min(1000, len(train_data)), random_state=42)
train_sample.head()

Unnamed: 0,SEM_NOT,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,CS_SEXO,NU_IDADE_N,TP_IDADE,...,PERD_PALA,TOMO_RES,TOMO_OUT,VACINA_COV,DOSE_1_COV,DOSE_2_COV,DOSE_REF,FNT_IN_COV,DELTA_UTI,ID
1220349,51,48,SP,GVE XXXII ITAPEVA,1575.0,APIAI,350270,M,59,3,...,2.0,,,2.0,0,0,0,2.0,-1,20d0e2f2-fdcd-4038-910c-67811f7023ac
262142,46,44,PR,17RS LONDRINA,1371.0,LONDRINA,411370,F,78,3,...,2.0,4.0,,1.0,1,1,0,2.0,-1,7582753b-a1d4-41b5-9889-90c1c9dc10f7
814674,1,1,MG,BELO HORIZONTE,1449.0,BELO HORIZONTE,310620,M,70,3,...,2.0,6.0,,1.0,0,1,0,2.0,-1,22a5033e-6ce0-4cc5-b646-9ebb879d7681
808737,11,10,AL,1 MICRORREGIAO DE SAUDE,1533.0,MACEIO,270430,M,1,2,...,2.0,,,,0,0,0,,-1,47d72358-681a-42a7-baa7-a5284d952f6d
21084,31,30,SC,ITAJAI,1550.0,BALNEARIO CAMBORIU,420200,F,40,3,...,2.0,,,2.0,0,0,0,1.0,-1,f2447f7f-84ba-4ae1-822a-923eb94ece76


In [3]:
def glimpse(df: pd.DataFrame, n: int = 5) -> None:
    """Print a compact column-by-column overview of a DataFrame.

    The first two lines give the row and column counts. Each following line
    describes one column as ``$ <label> <dtype> <first n values>``.

    Args:
        df: The frame to summarise.
        n: How many leading values of each column to show (default 5, which
            matches the previous hard-coded ``head()``).

    Returns:
        None. The summary is written to stdout.
    """
    print(f"Rows: {df.shape[0]}")
    print(f"Columns: {df.shape[1]}")
    # Iterate over (label, Series) pairs instead of indexing with df[col]:
    # when column labels are duplicated, df[col] returns a DataFrame, which has
    # no .dtype attribute and made the original loop raise AttributeError.
    for col, series in df.items():
        print(f"$ {col} <{series.dtype}> {series.head(n).values}")


In [4]:
# Print row/column counts plus each column's dtype and first values for the sampled rows.
glimpse(train_sample)

Rows: 1000
Columns: 76
$ SEM_NOT <int64> [51 46  1 11 31]
$ SEM_PRI <int64> [48 44  1 10 30]
$ SG_UF_NOT <object> ['SP' 'PR' 'MG' 'AL' 'SC']
$ ID_REGIONA <object> ['GVE XXXII ITAPEVA' '17RS LONDRINA' 'BELO HORIZONTE'
 '1 MICRORREGIAO DE SAUDE' 'ITAJAI']
$ CO_REGIONA <float64> [1575. 1371. 1449. 1533. 1550.]
$ ID_MUNICIP <object> ['APIAI' 'LONDRINA' 'BELO HORIZONTE' 'MACEIO' 'BALNEARIO CAMBORIU']
$ CO_MUN_NOT <int64> [350270 411370 310620 270430 420200]
$ CS_SEXO <object> ['M' 'F' 'M' 'M' 'F']
$ NU_IDADE_N <int64> [59 78 70  1 40]
$ TP_IDADE <int64> [3 3 3 2 3]
$ COD_IDADE <object> ['3059' '3078' '3070' '2001' '3040']
$ CS_GESTANT <int64> [6 5 6 6 5]
$ CS_RACA <int64> [4 4 4 4 1]
$ CS_ESCOL_N <float64> [nan nan  9. nan  3.]
$ SG_UF <object> ['SP' 'PR' 'MG' 'AL' 'SC']
$ CS_ZONA <float64> [2. 1. 1. 2. 1.]
$ SURTO_SG <float64> [nan nan nan nan nan]
$ NOSOCOMIAL <float64> [2. 2. 2. 2. 2.]
$ AVE_SUINO <float64> [2. 2. 2. 2. 2.]
$ FEBRE <float64> [2. 2. 2. 1. 2.]
$ TOSSE <float64> [1. 1. 1. 2

In [44]:
# Frequency of each distinct FNT_IN_COV value across the full training frame.
# NOTE(review): value_counts() excludes NaN by default, so missing entries are not
# shown here; pass dropna=False if the missing count matters.
train_data["FNT_IN_COV"].value_counts()

1.0    640448
2.0    411556
Name: FNT_IN_COV, dtype: int64

In [77]:
# training with pipeline
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
import yaml

In [58]:
# Remove OBES_IMC from the training frame before modelling.
# errors="ignore" makes the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
train_data = train_data.drop(columns=["OBES_IMC"], errors="ignore")

In [80]:
# The feature lists are read from params.yaml rather than hard-coded here.
with open("params.yaml") as params_file:
    params = yaml.safe_load(params_file)

categorical_features = params["categorical_features"]
numerical_features = params["numerical_features"]

# Numeric columns: fill missing values with the column median, then standardise.
numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category, then
# map categories to integer codes. Categories not seen during fit become -1.
categorical_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", categorical_encoder),
    ]
)

# Send each feature group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# NOTE(review): class_weight is keyed 0-4, while the evaluation report in this
# notebook lists labels 1-5. Confirm the keys address the intended classes for
# the installed scikit-learn version.
clf = HistGradientBoostingClassifier(
    scoring="f1_macro",
    l2_regularization=1.5,
    max_depth=75,
    max_iter=2000,
    class_weight={0: 2, 1: 2, 2: 2, 3: 1, 4: 1},
    random_state=42,
)

# Full prediction pipeline: preprocessing followed by the classifier.
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", clf),
    ]
)


In [81]:
# train test split
# Hold out 20% of the rows for evaluation. The split is stratified on the target
# and seeded so it is repeatable.
target = train_data["CLASSI_FIN"]
features = train_data.drop(columns=["CLASSI_FIN", "ID"])

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [82]:
# Display the assembled pipeline to check its steps before fitting.
pipe

In [83]:
# Fit the preprocessing steps and the classifier on the training split.
pipe.fit(x_train, y_train)

In [84]:
def evaluate_model(clf, x_test, y_test):
    """Print a classification report for a fitted model.

    Args:
        clf: Object exposing ``predict``; it is called on ``x_test``.
        x_test: Inputs handed to ``clf.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        None. The report is written to stdout.
    """
    print(classification_report(y_test, clf.predict(x_test)))


In [85]:
# Print the per-class precision/recall/F1 report for the held-out split.
evaluate_model(pipe, x_test, y_test)

              precision    recall  f1-score   support

           1       0.49      0.34      0.40      3126
           2       0.48      0.54      0.51      6287
           3       0.51      0.32      0.39      1187
           4       0.71      0.56      0.63     76082
           5       0.84      0.92      0.88    173514

    accuracy                           0.80    260196
   macro avg       0.61      0.54      0.56    260196
weighted avg       0.79      0.80      0.79    260196



In [86]:
# import kaggle test data
test_data = pd.read_csv("data/test.csv", low_memory=False)
test_data.drop(["OBES_IMC"], axis=1, inplace=True)
test_data_id = test_data["ID"]
test_data.drop(["ID"], axis=1, inplace=True)

submission = pd.DataFrame({"ID": test_data_id, "CLASSI_FIN": pipe.predict(test_data)})

In [87]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv("data/submission_v4.csv", index=False)