In [None]:
import warnings

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook display configuration: draw matplotlib figures inline
%matplotlib inline
# Use SVG as the inline figure format
%config InlineBackend.figure_format = 'svg'
# Silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset.
# pd.read_csv() requires a path (or buffer) as its first argument; calling it
# with no arguments raises TypeError.
# TODO: DATA_PATH is a placeholder -- point it at the real training CSV.
DATA_PATH = '../data/train.csv'

df = pd.read_csv(DATA_PATH)

EDA

In [None]:
# Column dtypes, non-null counts and deep memory usage
df.info(memory_usage='deep')

# Number of null values per column
null_cols = df.isnull().sum()

# Count and percentage of nulls, restricted to the columns that actually have
# some. The percentage is relative to the number of rows in `df`.
null_summary = pd.DataFrame({'n_null': null_cols,
                             'pct_null': null_cols / len(df) * 100})
null_summary[null_summary['n_null'] > 0]

In [None]:
# TODO: review the final part of this notebook; it was added in a rather haphazard way

In [None]:
# Sample plot -> redo with the real data.
# Each numeric column is compared against price (diamonds example).

columns_to_plot = ['carat', 'depth', 'x', 'y', 'z']

fig, axes = plt.subplots(ncols=len(columns_to_plot), nrows=1, figsize=(20, 5))
# Individual axis names kept for any later cell that refers to them
ax1, ax2, ax3, ax4, ax5 = axes

# `df` is the only frame loaded at this point of the notebook
for ax, column in zip(axes, columns_to_plot):
    ax.scatter(df[column], df['price'])
    ax.set_title(f'{column} vs price')
    ax.set_xlabel(column)
    ax.set_ylabel('Price')

# Adjust the spacing once, after every subplot has been drawn
plt.tight_layout()

Recordatorio: hacer `.reset_index()` después de cada modificación en el df.

In [None]:
# DataFrame transformation
# Outlier detection and handling (drop or keep)
# Numerical and categorical features

NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = [*NUM_FEATS, *CAT_FEATS]  # FEATS = every feature column fed to the model
TARGET = 'price'

# Numeric columns: fill nulls with the median, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill nulls with the constant 'missing', then one-hot
# encode (categories unseen during fit are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATS),
        ('cat', categorical_transformer, CAT_FEATS),
    ]
)

# Preview of the transformed feature matrix
transformed_preview = preprocessor.fit_transform(df)
pd.DataFrame(transformed_preview).head()

In [None]:
# Train/test split
# A fixed random_state puts the same rows in each partition on every run,
# so the metrics computed later are reproducible.

df_train, df_test = train_test_split(df, random_state=42)

In [None]:
# ML model: preprocessing + random forest in a single pipeline

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

# The pipeline has to be fitted before predict() can be called
model.fit(df_train[FEATS], df_train[TARGET])

# First check (train vs test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.

y_train_real = df_train[TARGET]
y_train_pred = model.predict(df_train[FEATS])
rmse_train = np.sqrt(mean_squared_error(y_true=y_train_real, y_pred=y_train_pred))
r2_train = r2_score(y_train_real, y_train_pred)

y_test_real = df_test[TARGET]
y_test_pred = model.predict(df_test[FEATS])
rmse_test = np.sqrt(mean_squared_error(y_true=y_test_real, y_pred=y_test_pred))
r2_test = r2_score(y_test_real, y_test_pred)

# Expressions in the middle of a cell are not displayed, so print the metrics
print(f'train -> RMSE: {rmse_train:.3f} | R2: {r2_train:.3f}')
print(f'test  -> RMSE: {rmse_test:.3f} | R2: {r2_test:.3f}')

# Cross-validation check: 5-fold scores of the pipeline on the whole dataset.
# The scorer is the negated RMSE, so the values are <= 0.

scores = cross_val_score(
    estimator=model,
    X=df[FEATS],
    y=df[TARGET],
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

scores.mean()

# Hyperparameter search

# Keys follow the pipeline step names: <step>__<sub-step>__<parameter>
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
    }

# Randomized search: samples n_iter=30 of the 48 possible combinations.
# random_state makes the sampled combinations reproducible.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=8,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=30,
                                 random_state=42
                                 )

# fit(X, y): the feature matrix goes first, the target second
grid_search.fit(df[FEATS], df[TARGET])

# Expressions in the middle of a cell are not displayed, so print the results
print('Best params:', grid_search.best_params_)
print('Best score (negated RMSE):', grid_search.best_score_)

# Cross-validation check on the test partition.
# NOTE(review): this re-fits the pipeline stored in `model`, not the result of
# the hyperparameter search -- confirm whether grid_search.best_estimator_ was
# intended here.
# Open question from the author: is 'r2' the right scorer here, or would
# 'neg_root_mean_squared_error' be better?

scores = cross_val_score(
    estimator=model,
    X=df_test[FEATS],
    y=df_test[TARGET],
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

scores.mean()

# Predict
# df_predict is the frame we are given to predict on.
# TODO: it is not loaded anywhere in the visible notebook; load it before this point.

y_pred = grid_search.predict(df_predict[FEATS])

# Combine the predictions with the id column

submission_df = pd.DataFrame({'id': df_predict['id'], 'price': y_pred})

# Expressions in the middle of a cell are not displayed, so show them explicitly.
# describe() is there to spot atypical predictions (outliers).
display(submission_df.head())
display(submission_df.describe())

# Limit predictions to the [0, 20000] range chosen from the describe() output.
# Assign the clipped column back: an in-place clip on `submission_df.price`
# acts on an intermediate Series and does not reliably update the frame.
submission_df['price'] = submission_df['price'].clip(lower=0, upper=20000)

# Export to .csv
submission_df.to_csv('../prueba.csv', index=False)

In [None]:
# Try LightGBM as the regressor

# Gradient-boosted trees, 100 estimators, bagging 90% of the rows on every iteration
lgbm_regressor = LGBMRegressor(
    boosting_type='gbdt',
    bagging_freq=1,
    bagging_fraction=0.9,
    n_estimators=100,
)

# Same preprocessing as before; note that this rebinds `model`
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgbm_regressor),
])

EDA rápido, sin estructurar -> Revisar el notebook "analisis_exploratorio.ipynb"

In [None]:
# One correlation heatmap per method, side by side
corr_methods = ['pearson', 'kendall', 'spearman']
fig, ax = plt.subplots(ncols=len(corr_methods), nrows=1, figsize=(30, 7))

# Correlation is only defined for numeric columns. Recent pandas versions
# raise on string columns instead of dropping them silently, so select the
# numeric columns explicitly (done once, outside the loop).
numeric_df = df.select_dtypes(include='number')

for axis, method in zip(ax, corr_methods):
    sns.heatmap(numeric_df.corr(method=method), annot=True, fmt='.2f', ax=axis)
    axis.set_title(f'{method.upper()} Correlation')

plt.show()

In [None]:
# Grid of pairwise plots built from the columns of df
pair_grid = sns.pairplot(data=df)
plt.show()