In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under 
# the input directory

import os
# Walk the input directory tree and print the path of every file found in it.
for carpeta, _, archivos in os.walk('kaggle/input'):
    for archivo in archivos:
        ruta = os.path.join(carpeta, archivo)
        print(ruta)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as 
# output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be 
# saved outside of the current session

kaggle/input/.DS_Store
kaggle/input/tabular-playground-series-sep-2021/test.csv
kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv
kaggle/input/tabular-playground-series-sep-2021/train.csv


In [14]:
# Define funciones de preprocesamiento
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Define modelos de aprendizaje automatico a ocupar
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# TODO: import a neural-network implementation
# (planned: a PyTorch network) -- nothing is imported for it yet

# Define metricas de clasificacion
from sklearn.metrics import accuracy_score, classification_report

## Preprocesamiento de datos para ocuparlos en Machine Learning

In [3]:
# Load the train, test and sample-solution CSV files, in that order,
# into one DataFrame each.
df_train, df_test, sample_solution = map(
    pd.read_csv,
    (
        "kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "kaggle/input/tabular-playground-series-sep-2021/test.csv",
        "kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv",
    ),
)

In [4]:
# Split the training frame into the dependent variable (y, the "claim" column)
# and the independent variables (X, everything except "id" and "claim").
y = df_train["claim"]
X = df_train.drop(["id", "claim"], axis="columns")

In [5]:
# Seed passed as random_state to the train/validation split.
GENERADOR = 42

# Hold out 20% of the rows as a validation set; the remainder is used for training.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=GENERADOR, test_size=0.2
)

# Scale the features: the scaler is fit on the training split only and then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

# Impute missing values: the imputer (default settings) is likewise fit on the
# training split only and then applied to both splits.
imputador = SimpleImputer().fit(X_train)
X_train, X_valid = imputador.transform(X_train), imputador.transform(X_valid)

## Definición de modelos de Machine Learning - Clasificación

In [15]:
# Candidate models to train as a batch. Each entry holds:
#   "nombre"     - display name of the model,
#   "modelo"     - the unfitted estimator,
#   "param_grid" - hyper-parameter grid to search (a dict, or a list of dicts
#                  in the form accepted by GridSearchCV).
# Estimators with a random component are seeded with GENERADOR so that
# repeated runs of the notebook give comparable results.
modelos = [
    {
        "nombre": "Regresion Logistica",
        # random_state matters here because the liblinear / saga solvers shuffle the data
        "modelo": LogisticRegression(random_state=GENERADOR),
        "param_grid": {
            "penalty": ["l1", "l2"],
            "C": [0.01, 0.1, 1.0, 10.0],
            "fit_intercept": [True, False],
            "solver": ["liblinear", "saga"],
            "max_iter": [100, 200, 300],
        },
    },
    {
        "nombre": "K-Means",
        # NOTE(review): KMeans is an unsupervised clusterer, not a classifier;
        # its cluster ids are arbitrary w.r.t. the target labels and its default
        # score is not accuracy -- confirm it belongs in this comparison.
        "modelo": KMeans(n_clusters=2, random_state=GENERADOR),
        "param_grid": {
            "init": ["k-means++", "random"],
            "max_iter": [100, 300, 500],
        },
    },
    {
        "nombre": "Decision Tree",
        "modelo": DecisionTreeClassifier(random_state=GENERADOR),
        "param_grid": {
            "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
            "max_leaf_nodes": [10, 50, 100],
            "criterion": ["gini", "entropy"],
            # a float is a fraction of the samples, an int is an absolute count
            "min_samples_split": [0.05, 2],
        },
    },
    {
        "nombre": "Random Forest",
        "modelo": RandomForestClassifier(min_samples_split=2, random_state=GENERADOR),
        "param_grid": {
            "max_depth": [5, 6, 7, 8, 9, 10],
            "n_estimators": [500, 600, 700, 800, 900, 1000],
            "max_leaf_nodes": [10, 50, 100],
        },
    },
    {
        "nombre": "Gradient Boosting",
        "modelo": GradientBoostingClassifier(
            min_samples_split=2, random_state=GENERADOR
        ),
        "param_grid": {
            "max_depth": [5, 6, 7, 8, 9, 10],
            "learning_rate": [0.001, 0.01, 0.1, 1],
            "max_leaf_nodes": [10, 50, 100],
        },
    },
    {
        "nombre": "SVM",
        "modelo": SVC(),
        # `degree` is only used by the polynomial kernel and is ignored by the
        # default RBF kernel, so searching it on SVC() alone repeats identical
        # fits. The grid is split: RBF is tuned on C only, and the polynomial
        # kernel is tuned on C and degree.
        "param_grid": [
            {"kernel": ["rbf"], "C": [0.1, 1, 10, 100]},
            {"kernel": ["poly"], "C": [0.1, 1, 10, 100], "degree": [2, 3, 4]},
        ],
    },
]

In [7]:
# Display the test DataFrame as the cell output (the notebook renders a
# truncated preview: first/last rows and columns, with "..." in between).
df_test

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,957919,0.165850,0.487050,1295.00,0.023100,0.31900,0.901880,573.29,3743.7,2.705700e+12,...,0.162530,-22.1890,2.0655,0.430880,-10.7410,81606.00,1.1940,1.980400e+14,2017.10,0.46357
1,957920,0.129650,0.373480,1763.00,0.728840,0.33247,-1.263100,875.55,554370.0,5.955700e+14,...,0.815280,-1.6342,1.5736,-1.071200,11.8320,90114.00,1.1507,4.388000e+16,6638.90,0.28125
2,957921,0.120190,0.445210,736.26,0.046150,0.29605,0.316650,2659.50,317140.0,3.977800e+14,...,0.818310,-32.7800,2.1364,-1.931200,-3.2804,37739.00,1.1548,1.718100e+14,5844.00,0.13797
3,957922,0.054008,0.395960,996.14,0.859340,0.36678,-0.170600,386.56,325680.0,-3.432200e+13,...,0.865590,-2.4162,1.5199,-0.011633,1.3840,26849.00,1.1490,2.138800e+17,6173.30,0.32910
4,957923,0.079947,-0.006919,10574.00,0.348450,0.45008,-1.842000,3027.00,428150.0,9.291500e+11,...,0.251900,-18.6300,3.7387,0.757080,-4.9405,50336.00,1.2488,2.151300e+17,2250.10,0.33796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493469,1451388,-0.009112,0.308190,637.64,0.778200,0.41415,-1.068500,651.22,985000.0,6.079700e+15,...,0.249550,-17.5100,2.3325,0.112260,-5.9238,,1.1559,8.163900e+16,857.09,1.56330
493470,1451389,0.088922,0.482650,6924.10,0.025963,0.35540,-0.870200,2514.20,18004.0,6.073500e+14,...,0.017970,-7.8690,1.7005,97.813000,4.3793,-2432.00,1.0707,4.691800e+16,7497.10,0.67075
493471,1451390,0.140620,0.484750,1797.10,0.147020,0.28803,-1.407100,434.03,333050.0,2.351000e+15,...,0.013841,-5.3824,1.6827,,8.0633,2471.40,1.1725,7.900900e+16,2904.60,0.18005
493472,1451391,0.168000,0.351760,454.79,0.164580,0.16983,0.323850,2331.20,223980.0,-2.795300e+12,...,0.858730,-4.2416,1.3531,155.210000,13.9630,-11.44,1.1946,-1.770600e+14,6763.10,0.33223
