In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under 
# the input directory

import os
# Walk the input directory tree and print the path of every file found in it.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as 
# output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be 
# saved outside of the current session

kaggle/input/.DS_Store
kaggle/input/tabular-playground-series-sep-2021/test.csv
kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv
kaggle/input/tabular-playground-series-sep-2021/train.csv


In [14]:
# Define funciones de preprocesamiento
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Define modelos de aprendizaje automatico a ocupar
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Neural network (PyTorch)
# TODO: import the PyTorch neural network here

# Define metricas de clasificacion
from sklearn.metrics import accuracy_score, classification_report

## Preprocesamiento de datos para ocuparlos en Machine Learning

In [3]:
# Read the train, test and sample-solution CSV files into DataFrames.
df_train, df_test, sample_solution = (
    pd.read_csv(csv_path)
    for csv_path in (
        "kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "kaggle/input/tabular-playground-series-sep-2021/test.csv",
        "kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv",
    )
)

In [4]:
# Split the training frame into the target (y) and the features (X).
# "id" is dropped together with the target, so it is not used as a feature.
y = df_train["claim"]
X = df_train.drop(["id", "claim"], axis=1)

In [5]:
# Random seed used for the train/validation split.
GENERADOR = 42

# Hold out 20% of the rows as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=GENERADOR,
)

# Both transformers are fitted on the training split only and then reused,
# already fitted, on the validation split.
# Scaling runs before imputation: StandardScaler disregards NaNs while fitting
# and keeps them in its output, so the imputer fills them afterwards using its
# default strategy.
scaler = StandardScaler().fit(X_train)
imputador = SimpleImputer()
X_train = imputador.fit_transform(scaler.transform(X_train))
X_valid = imputador.transform(scaler.transform(X_valid))

## Definición de modelos de Machine Learning - Clasificación

In [15]:
# Models to train in one batch. Each entry holds:
#   "nombre"     - display name of the model,
#   "modelo"     - the unfitted estimator,
#   "param_grid" - hyper-parameter grid to search (a dict, or a list of dicts
#                  when some parameters only apply to part of the grid; both
#                  forms are accepted by GridSearchCV).
# Estimators with a stochastic component are seeded with GENERADOR (the seed
# already used for the train/validation split) so repeated runs are reproducible.
modelos = [
    {
        "nombre": "Regresion Logistica",
        # Both solvers in the grid (liblinear, saga) shuffle the data, hence the seed.
        "modelo": LogisticRegression(random_state=GENERADOR),
        "param_grid": {
            "penalty": ["l1", "l2"],
            "C": [0.01, 0.1, 1.0, 10.0],
            "fit_intercept": [True, False],
            "solver": ["liblinear", "saga"],
            "max_iter": [100, 200, 300],
        },
    },
    {
        "nombre": "K-Means",
        # NOTE(review): KMeans is an unsupervised clusterer, not a classifier. Its
        # cluster ids are arbitrary and its default score is not accuracy, so its
        # results are not directly comparable with the other entries - confirm how
        # the code that consumes this list scores it.
        "modelo": KMeans(n_clusters=2, random_state=GENERADOR),
        "param_grid": {
            "init": ["k-means++", "random"],
            "max_iter": [100, 300, 500],
        },
    },
    {
        "nombre": "Decision Tree",
        "modelo": DecisionTreeClassifier(random_state=GENERADOR),
        "param_grid": {
            "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
            "max_leaf_nodes": [10, 50, 100],
            "criterion": ["gini", "entropy"],
            # A float is a fraction of the samples, an int is an absolute count.
            "min_samples_split": [0.05, 2],
        },
    },
    {
        "nombre": "Random Forest",
        "modelo": RandomForestClassifier(min_samples_split=2, random_state=GENERADOR),
        "param_grid": {
            "max_depth": [5, 6, 7, 8, 9, 10],
            "n_estimators": [500, 600, 700, 800, 900, 1000],
            "max_leaf_nodes": [10, 50, 100],
        },
    },
    {
        "nombre": "Gradient Boosting",
        "modelo": GradientBoostingClassifier(min_samples_split=2, random_state=GENERADOR),
        "param_grid": {
            "max_depth": [5, 6, 7, 8, 9, 10],
            "learning_rate": [0.001, 0.01, 0.1, 1],
            "max_leaf_nodes": [10, 50, 100],
        },
    },
    {
        "nombre": "SVM",
        "modelo": SVC(),
        # "degree" only affects the polynomial kernel, so it is searched only
        # there; crossing it with the linear kernel would refit identical
        # linear models once per degree value.
        "param_grid": [
            {"kernel": ["linear"], "C": [0.1, 1, 10, 100]},
            {"kernel": ["poly"], "C": [0.1, 1, 10, 100], "degree": [2, 3, 4]},
        ],
    },
]