In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
#import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import joblib

In [2]:
# Boston housing data set, read straight from the course's GitHub repository
# (comma-separated file).
url = 'https://raw.githubusercontent.com/Maxibrionest/FMY-Fundamentos-Machine-Learning/main/Entrega1/boston_housing.csv'
df = pd.read_csv(url, sep=",")

In [3]:
def clean_custom(df, col, iqrFences):
    """Keep the rows of ``df`` whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - iqrFences * IQR`` and ``Q3 + iqrFences * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``df[col]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Column whose values are compared against the fences.
    iqrFences : float
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    df_out : pandas.DataFrame
        Rows of ``df`` with ``fence_low <= df[col] <= fence_high``. Rows whose
        ``col`` value is missing are dropped.
    pctg : float
        Fraction of the rows of ``df`` that were kept (``0.0`` when ``df`` has
        no rows).
    """
    q1 = df[col].quantile(.25)
    q3 = df[col].quantile(.75)
    iqr = q3 - q1
    fence_low = q1 - iqrFences * iqr
    fence_high = q3 + iqrFences * iqr

    # Inclusive bounds: a value sitting exactly on a fence is not an outlier.
    # With strict comparisons a column whose IQR is 0 (both fences equal Q1)
    # would lose every row.
    df_out = df.loc[df[col].between(fence_low, fence_high)]

    total = df.shape[0]
    # Guard the ratio so an empty input does not raise ZeroDivisionError.
    pctg = df_out.shape[0] / total if total else 0.0
    return df_out, pctg

def clean_outliers_hard(df, col):
    """Keep the rows of ``df`` whose ``col`` value is within 1.5 IQR of the quartiles.

    Rows with ``df[col]`` below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``
    are removed; these are the points a box plot draws as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Column whose values are compared against the fences.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``fence_low <= df[col] <= fence_high``. Rows whose
        ``col`` value is missing are dropped.
    """
    q1 = df[col].quantile(.25)
    q3 = df[col].quantile(.75)
    iqr = q3 - q1
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # Inclusive bounds: values exactly on a fence are kept, so a column with
    # IQR == 0 is not emptied.
    return df.loc[df[col].between(fence_low, fence_high)]

def clean_outliers_soft(df, col):
    """Keep the rows of ``df`` whose ``col`` value is within 3 IQR of the quartiles.

    Rows with ``df[col]`` below ``Q1 - 3 * IQR`` or above ``Q3 + 3 * IQR`` are
    removed. Because these limits are wider than the 1.5-IQR ones, fewer rows
    are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Column whose values are compared against the fences.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``fence_low <= df[col] <= fence_high``. Rows whose
        ``col`` value is missing are dropped.
    """
    q1 = df[col].quantile(.25)
    q3 = df[col].quantile(.75)
    iqr = q3 - q1
    fence_low = q1 - 3 * iqr
    fence_high = q3 + 3 * iqr
    # Inclusive bounds: values exactly on a fence are kept, so a column with
    # IQR == 0 is not emptied.
    return df.loc[df[col].between(fence_low, fence_high)]


In [4]:
# Per-feature value taken from the first row of `mode()`, for every column
# except the target `medv`. Computed on the full, unfiltered data set.
train_mode_a = dict(df.drop(columns=['medv']).mode().iloc[0])

In [5]:
# Display the per-feature modes computed from the full data set.
train_mode_a

{'crim': 0.0150099999999999,
 'zn': 0.0,
 'indus': 18.1,
 'chas': 0.0,
 'nox': 0.538,
 'rm': 5.713,
 'age': 100.0,
 'dis': 3.4952,
 'rad': 24.0,
 'tax': 666.0,
 'ptratio': 20.2,
 'black': 396.9,
 'lstat': 6.36}

In [6]:
# Remove outliers one column at a time using 1.5-IQR fences.
# The order matters: each step computes its quartiles on the frame already
# filtered by the previous steps. `chas` and the target `medv` are not filtered.
# After the loop `df_clean_p` holds the fraction kept by the LAST step only
# ("lstat"), relative to that step's input, not to the original `df`.
df_clean = df
for _col in ("crim", "zn", "indus", "nox", "rm", "age",
             "dis", "rad", "tax", "ptratio", "black", "lstat"):
    df_clean, df_clean_p = clean_custom(df_clean, _col, 1.5)

In [7]:
# Correlation matrix of the outlier-filtered data.
corrdat = df_clean.corr()
def getCorr(dat, lim):
    """Select the entries of a correlation Series whose magnitude exceeds ``lim``.

    Used to pick the features with the largest impact on the target.

    Parameters
    ----------
    dat : pandas.Series
        Correlation values indexed by feature name (for example one column of
        a correlation matrix).
    lim : float
        Threshold; an entry is kept when ``abs(value) > lim``. Missing values
        never satisfy the comparison and are left out.

    Returns
    -------
    pandas.DataFrame
        One row per selected entry, indexed like ``dat`` and in the same order,
        with the value stored in a single column named ``'corr value'``.
    """
    # Vectorized mask instead of a per-label lookup: this also works when the
    # index contains repeated labels, where `dat[label]` would return a Series.
    selected = dat[dat.abs() > lim]
    return selected.to_frame(name='corr value')

# Keep the columns whose absolute correlation with the target `medv` exceeds
# the threshold. `medv` itself is part of the Series being filtered.
lim = 0.4
corr_value = getCorr(corrdat["medv"], lim)

In [8]:
# Restrict the cleaned frame to the columns selected by the correlation filter.
df_op = df_clean[corr_value.index]

In [9]:
# Feature matrix: every selected column except the target.
X = df_op.drop(columns=['medv'])
# Target vector, copied so it is independent of `df_op`.
Y = df_op['medv'].copy()

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.2, random_state=16062021)

In [11]:
# Hyper-parameter grid explored by the grid search.
params = {
    "max_depth": np.arange(3, 10),           # depths 3 .. 9
    "min_samples_split": np.arange(2, 10),   # split sizes 2 .. 9
    "random_state": np.array([16062021]),    # single value: same seed for every candidate
}

# Base estimator; its hyper-parameters are supplied through `params`.
model_tree_grid = DecisionTreeRegressor()

In [13]:
# Per-feature value from the first row of `mode()`, computed on the training
# split of the selected features only.
train_mode_b = dict(Xtrain.mode().iloc[0])

In [14]:
# Display the per-feature modes of the training split.
train_mode_b

{'rm': 6.004, 'ptratio': 19.2, 'lstat': 5.33}

In [15]:
# Search over `params` with 10-fold cross-validation, scoring each candidate by R².
grid = GridSearchCV(
    estimator=model_tree_grid,
    param_grid=params,
    scoring='r2',
    cv=10,
)
grid.fit(Xtrain, Ytrain)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': array([3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]),
                         'random_state': array([16062021])},
             scoring='r2')

In [16]:
# Best cross-validated score (R², as configured in the search) and the
# parameter combination that achieved it.
print(grid.best_score_)
print(grid.best_params_)

0.6673883614425868
{'max_depth': 9, 'min_samples_split': 9, 'random_state': 16062021}


In [17]:
# Final model: take the hyper-parameters selected by the grid search directly
# from `grid.best_params_` (which includes `random_state`, since it is part of
# the grid) instead of hand-copying them, so they cannot go stale on a re-run.
# NOTE: the tree is fitted on ALL rows of X/Y, which includes the rows held out
# in Xtest/Ytest; scoring this model on Xtest would therefore be optimistic.
dtree = DecisionTreeRegressor(**grid.best_params_)
dtree.fit(X, Y)

DecisionTreeRegressor(max_depth=9, min_samples_split=9, random_state=16062021)

In [19]:
# Persist the fitted tree and the two mode dictionaries (compressed) in the
# current working directory.
joblib.dump(dtree, "./decision_tree.joblib", compress=True)
joblib.dump(train_mode_a, "./train_mode_a.joblib", compress=True)
joblib.dump(train_mode_b, "./train_mode_b.joblib", compress=True)

['./train_mode_b.joblib']

In [45]:
# One sample observation. The values must follow the column order of X, which
# the `train_mode_b` output shows as rm, ptratio, lstat.
input_data = [6.998,18.7,2.94]
# Shape the sample as a single row (1 x n_features).
np.array(input_data).reshape(1, -1)

array([[ 6.998, 18.7  ,  2.94 ]])

In [42]:
#Xtest

In [43]:
# Predict for the single sample. The row is wrapped in a DataFrame carrying the
# same column names the tree was fitted with: this makes the feature order
# explicit and avoids the feature-name warning newer scikit-learn versions emit
# when a model fitted on a DataFrame receives a bare array.
dtree.predict(pd.DataFrame([input_data], columns=X.columns))

array([34.5375])