In [1]:
# Acá deberían cargar todas las librerías que vayan a usar. 
# Es preferible cargar las librerías al principio para que quede claro que requerimientos tiene la notebook. 
# Tengan cuidado de no cargar librerías que no vayan a utilizar ya que esto incurre en un gasto de memoria innecesario

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [2]:
#Podriamos cargar los datos con la API de Kaggle sin necesidad de bajarlos, o alternativamente subirlos y no correr esta celda

import json
from google.colab import drive

!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

drive.mount('/content/drive', force_remount=True)
with open("/content/drive/My Drive/kaggle.json", 'r') as f:
    api_token= json.load(f)

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c ap-2022q1


import zipfile
import os

os.listdir()

for file in os.listdir():
    if file.endswith('.zip'):
      zip_ref = zipfile.ZipFile(file, 'r')
      zip_ref.extractall()
      zip_ref.close()

Mounted at /content/drive
Downloading base_val.csv.zip to /content
 82% 11.0M/13.4M [00:00<00:00, 51.8MB/s]
100% 13.4M/13.4M [00:00<00:00, 53.0MB/s]
Downloading base_train.csv.zip to /content
 99% 58.0M/58.4M [00:00<00:00, 118MB/s]
100% 58.4M/58.4M [00:00<00:00, 148MB/s]


In [3]:
# Load the training and validation CSVs from the working directory
# (if the download cell was skipped, the files must be uploaded manually first).

df_train, df_test = (
    pd.read_csv(csv_name, sep=",")
    for csv_name in ("base_train.csv", "base_val.csv")
)

In [4]:
# Next, do some EDA to get to know the dataset.
# Start by listing the column names of the training frame.
df_train.columns

Index(['Unnamed: 0', 'symboling', 'normalized-losses', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [5]:
# (rows, columns) of the training frame.
df_train.shape

(800000, 17)

In [6]:
# Per-column dtype and non-null count of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         800000 non-null  int64  
 1   symboling          800000 non-null  int64  
 2   normalized-losses  800000 non-null  float64
 3   wheel-base         800000 non-null  float64
 4   length             800000 non-null  float64
 5   width              800000 non-null  float64
 6   height             800000 non-null  float64
 7   curb-weight        800000 non-null  float64
 8   engine-size        800000 non-null  float64
 9   bore               800000 non-null  float64
 10  stroke             800000 non-null  float64
 11  compression-ratio  800000 non-null  float64
 12  horsepower         800000 non-null  float64
 13  peak-rpm           800000 non-null  float64
 14  city-mpg           800000 non-null  float64
 15  highway-mpg        800000 non-null  float64
 16  pr

In [8]:
# Replace missing values in the continuous columns with 0
# (rows are kept; the NaNs are filled, not dropped).

contin_vars = ['normalized-losses', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price']

# A single vectorised fillna replaces the two per-column loops: isnull() and
# isna() are aliases, so the second loop could never find anything left to fill.
# NOTE(review): this also zero-fills 'price', the column used as the target
# further down; confirm that is intended rather than dropping those rows.
df_train[contin_vars] = df_train[contin_vars].fillna(0)

# Apply the same imputation to the validation frame so the features passed to
# predict() are preprocessed like the training features. Only columns that
# actually exist in df_test are touched.
test_vars = [v for v in contin_vars if v in df_test.columns]
df_test[test_vars] = df_test[test_vars].fillna(0)

In [9]:
# Choose a model and fit it.
# The column to predict is 'price'.
# random_state is fixed so that re-running the notebook reproduces the same
# forest (and therefore the same score and predictions).
reg = RandomForestRegressor(min_samples_split=55, n_estimators=50, random_state=42)

# Feature columns: every continuous variable except the target.
Xs = ['normalized-losses', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg']

X = np.array(df_train[Xs])

y = df_train['price']

# fit() returns the estimator itself, so `res` and `reg` are the same object.
res = reg.fit(X, y)

In [10]:
# R^2 computed on the same X, y the model was fitted on: an in-sample score,
# not an estimate of performance on unseen data.
res.score(X,y)

0.7535115579754372

In [11]:
# Luego podría corresponder ajustar hiperparámetros, hacer CV, etc...

In [36]:
# Feature matrix for the validation set, using the same columns as training.
# NOTE: this rebinds X, which previously held the training matrix; re-running
# the score cell after this one would pair validation features with training
# targets.
X=np.array(df_test[Xs])

In [37]:
# Predict with the fitted forest; X must currently hold the df_test features.
resultado=reg.predict(X)

In [41]:
# Finally, write the predictions for the validation CSV (which has no target
# column) to a submission file.

# Row ids are the df_test index shifted by 800000.
submission_ids = df_test.index + 800000
salida = pd.DataFrame({"price": resultado}, index=submission_ids).astype(str)
salida.to_csv("RF.csv", sep=',', index=True, index_label='id')