In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
casas = pd.read_csv('data/casas.csv')
casas = casas.dropna(axis=0)
casas.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1068828.0,9.751097,3101.947708,2.902034,1.57634,1.573596,471.00694,141.568645,1964.081988,-37.807904,144.990201,7435.489509
std,0.971079,675156.4,5.612065,86.421604,0.970055,0.711362,0.929947,897.449881,90.834824,38.105673,0.07585,0.099165,4337.698917
min,1.0,131000.0,0.0,3000.0,0.0,1.0,0.0,0.0,0.0,1196.0,-38.16492,144.54237,389.0
25%,2.0,620000.0,5.9,3044.0,2.0,1.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198,4383.75
50%,3.0,880000.0,9.0,3081.0,3.0,1.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958,6567.0
75%,4.0,1325000.0,12.4,3147.0,3.0,2.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527,10175.0
max,8.0,9000000.0,47.4,3977.0,9.0,8.0,10.0,37000.0,3112.0,2018.0,-37.45709,145.52635,21650.0


In [3]:
target = casas['Price'] # y
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
data = casas[features] # X
data.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [4]:
model = DecisionTreeRegressor(random_state=1)
model.fit(data, target)
print(data.head())
predictions = model.predict(data)
print(predictions)

   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
[1035000. 1465000. 1600000. ...  385000.  560000. 2450000.]


In [5]:
print(mae(target, predictions))

1115.7467183128902


In [6]:
trainX, valX, trainY, valY = train_test_split(data, target, random_state=0)
model = DecisionTreeRegressor()
model.fit(trainX, trainY)
predictions = model.predict(valX)
print(mae(valY, predictions))

276201.61007101357


In [7]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    _mae = mae(val_y, preds_val)
    return(_mae)

In [8]:
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, trainX, valX, trainY, valY)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 50  		 Mean Absolute Error:  279794
Max leaf nodes: 500  		 Mean Absolute Error:  261718
Max leaf nodes: 5000  		 Mean Absolute Error:  271320


In [9]:
forest_gump = RandomForestRegressor(random_state=1)
forest_gump.fit(trainX, trainY)
predictions = forest_gump.predict(valX)
print(mae(valY, predictions))

207190.6873773146


In [12]:
cols_with_missing = [col for col in trainX.columns
                     if trainX[col].isnull().any()]

reduced_X_train = trainX.drop(cols_with_missing, axis=1) # nao temos colunas com valores nulos
reduced_X_valid = valX.drop(cols_with_missing, axis=1) # nao temos colunas com valores nulos

print(cols_with_missing)

[]


In [13]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(trainX))
imputed_X_valid = pd.DataFrame(my_imputer.transform(valX)) # type: ignore

# Imputation removed column names; put them back
imputed_X_train.columns = trainX.columns
imputed_X_valid.columns = valX.columns

In [14]:
X_train_plus = trainX.copy()
X_valid_plus = valX.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus)) # type: ignore

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

In [21]:
digimons = pd.read_csv('data/digimons/digimons.csv')
types = (digimons.dtypes == 'object')
object_cols = list(types[types].index)
print(object_cols)

['Digimon', 'Stage', 'Type', 'Attribute']


In [22]:
drop_X_train = digimons.select_dtypes(exclude=['object']) # use this to train the model
drop_X_train.head()

Unnamed: 0,Number,Memory,Equip Slots,Lv 50 HP,Lv50 SP,Lv50 Atk,Lv50 Def,Lv50 Int,Lv50 Spd
0,1,2,0,590,77,79,69,68,95
1,2,2,0,950,62,76,76,69,68
2,3,2,0,870,50,97,87,50,75
3,4,2,0,690,68,77,95,76,61
4,5,2,0,540,98,54,59,95,86


In [28]:
digimons.head()

Unnamed: 0,Number,Digimon,Stage,Type,Attribute,Memory,Equip Slots,Lv 50 HP,Lv50 SP,Lv50 Atk,Lv50 Def,Lv50 Int,Lv50 Spd
0,1,Kuramon,Baby,Free,Neutral,2,0,590,77,79,69,68,95
1,2,Pabumon,Baby,Free,Neutral,2,0,950,62,76,76,69,68
2,3,Punimon,Baby,Free,Neutral,2,0,870,50,97,87,50,75
3,4,Botamon,Baby,Free,Neutral,2,0,690,68,77,95,76,61
4,5,Poyomon,Baby,Free,Neutral,2,0,540,98,54,59,95,86


In [26]:
from sklearn.preprocessing import OrdinalEncoder

copy_digi = digimons.copy()
ordinal_encoder = OrdinalEncoder()
copy_digi[object_cols] = ordinal_encoder.fit_transform(copy_digi[object_cols]) # or use transform
copy_digi.head()

Unnamed: 0,Number,Digimon,Stage,Type,Attribute,Memory,Equip Slots,Lv 50 HP,Lv50 SP,Lv50 Atk,Lv50 Def,Lv50 Int,Lv50 Spd
0,1,111.0,1.0,1.0,5.0,2,0,590,77,79,69,68,95
1,2,167.0,1.0,1.0,5.0,2,0,950,62,76,76,69,68
2,3,182.0,1.0,1.0,5.0,2,0,870,50,97,87,50,75
3,4,26.0,1.0,1.0,5.0,2,0,690,68,77,95,76,61
4,5,179.0,1.0,1.0,5.0,2,0,540,98,54,59,95,86


In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['Memory', 'Equip Slots','Lv50 SP','Lv50 Atk','Lv50 Def','Lv50 Int','Lv50 Spd']), 
        ('cat', categorical_transformer, ['Type', 'Stage','Attribute'])
    ])

model = RandomForestRegressor(n_estimators=100, random_state=0)
pipes = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

X_train = digimons.drop(['Digimon','Lv 50 HP'], axis=1)
y_train = digimons['Lv 50 HP']
X_valid = digimons.drop(['Digimon','Lv 50 HP'], axis=1)
y_valid = digimons['Lv 50 HP']

pipes.fit(X_train, y_train)

preds = pipes.predict(X_valid)

print(mae(y_valid, preds))

46.822088353413655


In [35]:
from sklearn.model_selection import cross_val_score

scores = -1 * cross_val_score(pipes, X_train, y_train,cv=5, scoring='neg_mean_absolute_error')
print(scores.mean())

164.03011428571426


In [39]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

cols_to_use = ['Memory', 'Equip Slots','Lv50 SP','Lv50 Atk','Lv50 Def','Lv50 Int','Lv50 Spd'] 
X = digimons[cols_to_use]
y = digimons['Lv 50 HP']

trainX, valX, trainY, valY = train_test_split(X, y, random_state=0)

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
my_model.fit(trainX, trainY,
             early_stopping_rounds=5,
             eval_set=[(valX, valY)],
                verbose=False)
predictions = my_model.predict(valX)
print(mae(valY, predictions))


126.40532962859623


