# FULL FEATURES

In [24]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

In [25]:
train = pd.read_csv('train.csv', index_col='Id')
test_df = pd.read_csv('test.csv', index_col='Id')

In [26]:
train.head

<bound method NDFrame.head of       MSSubClass MSZoning  LotFrontage  ...  SaleType SaleCondition SalePrice
Id                                      ...                                  
1             60       RL         65.0  ...        WD        Normal    208500
2             20       RL         80.0  ...        WD        Normal    181500
3             60       RL         68.0  ...        WD        Normal    223500
4             70       RL         60.0  ...        WD       Abnorml    140000
5             60       RL         84.0  ...        WD        Normal    250000
...          ...      ...          ...  ...       ...           ...       ...
1456          60       RL         62.0  ...        WD        Normal    175000
1457          20       RL         85.0  ...        WD        Normal    210000
1458          70       RL         66.0  ...        WD        Normal    266500
1459          20       RL         68.0  ...        WD        Normal    142125
1460          20       RL         

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [28]:
y = train['SalePrice']

X = train.drop(['SalePrice'], axis=1)

In [29]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y, train_size=0.8, test_size=0.2, random_state=0)

In [30]:
# categorical columns

categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object' and 
                    X_train[col].nunique() < 9]

# numerical columns

numerical_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64','float64']]
    
# total number of columns to be learn from 

my_cols = categorical_cols + numerical_cols

train_X =X_train[my_cols].copy()
valid_X = X_valid[my_cols].copy()
test_X = test_df[my_cols].copy()

print("Total number of columns to preprocess: ", len(my_cols))

Total number of columns to preprocess:  74


In [32]:
# numerical columns transoformation
numerical_transformer = SimpleImputer(strategy='mean')

# categorical columns transformation

categorical_transformer = Pipeline(steps=[('impute', SimpleImputer(strategy='constant')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
                             ])

# preprocess both num and cat columns

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),
                                               ('cat', categorical_transformer,categorical_cols)
])

# define model

model_ = RandomForestRegressor(n_estimators=155, random_state=0)

# combine both preprocessor and model

final_model = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model',model_ )
    ])

# fitting model to data

final_model.fit(train_X, y_train)

# predcting outcomes

predicted_price = final_model.predict(valid_X)
print(final_model.score(X_valid, y_valid))

# calculating MAE

MAE = mean_absolute_error(y_valid,predicted_price)

print("MAE: ", MAE)

0.83891461861817
MAE:  17266.14752540875




# Modelo retirando las columnas categoricas y valores nulos. 

In [33]:
y = train['SalePrice']

train_df2 = train.drop(['SalePrice'], axis=1)
X = train_df2.select_dtypes(exclude=['object'])

X_train, X_valid, y_train, y_valid = train_test_split(X,y, train_size=0.8, test_size=0.2, random_state=0)

col_wiht_missing_value = [col for col in X_train.columns 
                          if X_train[col].isnull().any()]

print(col_wiht_missing_value)

X_train = X_train.drop(col_wiht_missing_value, axis=1)
X_valid = X_valid.drop(col_wiht_missing_value, axis=1)

# random forest model training

model = RandomForestRegressor(random_state=0)

model.fit(X_train,y_train)

# predicting validation values

pred_values = model.predict(X_valid)

print(model.score(X_valid, y_valid))
# Calculating MAE

MAE = mean_absolute_error(y_valid,pred_values)

print("mean absolute error of model:", MAE)

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
0.8412665387023727
mean absolute error of model: 17837.82570776256


In [34]:
from sklearn.linear_model import LinearRegression

regr = LinearRegression()
regr.fit(X_train, y_train)
pred=regr.predict(X_valid)
print(regr.score(X_valid, y_valid))

MAE = mean_absolute_error(y_valid,pred)

print("mean absolute error of model:", MAE)

0.6436463812634914
mean absolute error of model: 23831.423587890902


In [35]:
from sklearn.tree import DecisionTreeRegressor
regressorTree = DecisionTreeRegressor(random_state = 0)
regressorTree.fit(X_train, y_train)  
pred1=regressorTree.predict(X_valid)
print(regressorTree.score(X_valid, y_valid))

MAE = mean_absolute_error(y_valid,pred1)

print("mean absolute error of model:", MAE)

0.7810313452488958
mean absolute error of model: 26025.359589041094
