In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os

In [2]:
raw_path = '../data/raw/'
train = pd.read_csv(os.path.join(raw_path, 'train.csv'))
test = pd.read_csv(os.path.join(raw_path, 'test.csv'))

In [3]:
train['SalePrice'].isnull().count()

1460

In [4]:
import sys
sys.path.append('../src')
from config import TARGET_COL

In [5]:
def extract_target(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    df, target = df.drop(TARGET_COL, axis=1), df[TARGET_COL].copy()
    return df, target

In [6]:
train_data, train_target =  extract_target(train)

In [7]:
def fill_LotFrontage(df: pd.DataFrame) -> pd.DataFrame:
    most_freq = df['LotFrontage'].value_counts().index[0]
    df['LotFrontage'] = df['LotFrontage'].fillna(most_freq)
    return df

In [8]:
train_data = fill_LotFrontage(train_data)

In [9]:
def fill_Alley(df: pd.DataFrame) -> pd.DataFrame:
    df['Alley'] = df['Alley'].fillna('No alley access')
    return df

In [10]:
train_data = fill_Alley(train_data)

In [11]:
def fill_MasVnrType(df: pd.DataFrame) -> pd.DataFrame:
    df['MasVnrType'] = df['MasVnrType'].fillna('None')
    return df

In [12]:
train_data = fill_MasVnrType(train_data)

In [13]:
def fill_MasVnrArea(df: pd.DataFrame) -> pd.DataFrame:
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    return df

In [14]:
train_data = fill_MasVnrArea(train_data)

In [15]:
def fill_Bsmt(df: pd.DataFrame) -> pd.DataFrame:
    df['BsmtQual'] = df['BsmtQual'].fillna('No Basement')
    df['BsmtCond'] = df['BsmtCond'].fillna('No Basement')
    df['BsmtExposure'] = df['BsmtExposure'].fillna('No Basement')
    df['BsmtFinType1'] = df['BsmtFinType1'].fillna('No Basement')
    df['BsmtFinType2'] = df['BsmtFinType2'].fillna('No Basement')
    return df

In [16]:
train_data = fill_Bsmt(train_data)

In [17]:
def fill_FireplaceQu(df: pd.DataFrame) -> pd.DataFrame:
    df['FireplaceQu'] = df['FireplaceQu'].fillna('No Fireplace')
    return df

In [18]:
train_data = fill_FireplaceQu(train_data)

In [19]:
def fill_Garage(df: pd.DataFrame) -> pd.DataFrame:
    df['GarageType'] = df['GarageType'].fillna('No Garage')
    df['GarageFinish'] = df['GarageFinish'].fillna('No Garage')
    df['GarageQual'] = df['GarageQual'].fillna('No Garage')
    df['GarageCond'] = df['GarageCond'].fillna('No Garage')
    return df

In [20]:
train_data = fill_Garage(train_data)

In [21]:
def fill_Pool_Fence_Misc(df: pd.DataFrame) -> pd.DataFrame:
    df['PoolQC'] = df['PoolQC'].fillna('No Pool')
    df['Fence'] = df['Fence'].fillna('No Fence')
    df['MiscFeature'] = df['MiscFeature'].fillna('None')
    return df

In [22]:
train_data = fill_Pool_Fence_Misc(train_data)

In [23]:
from config import CAT_COLS, REAL_COLS

In [24]:
CAT_COLS

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [25]:
def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    df[CAT_COLS] = df[CAT_COLS].astype('category')

    df[REAL_COLS] = df[REAL_COLS].astype(np.float32)
    return df

In [26]:
train_data = cast_types(train_data)

In [27]:
train_data['LandContour']

0       Lvl
1       Lvl
2       Lvl
3       Lvl
4       Lvl
       ... 
1455    Lvl
1456    Lvl
1457    Lvl
1458    Lvl
1459    Lvl
Name: LandContour, Length: 1460, dtype: category
Categories (4, object): ['Bnk', 'HLS', 'Low', 'Lvl']

In [28]:
def Easy_Access(data: pd.DataFrame) -> None:
    data['easy_access'] = 0
    for i in range(len(data['Street'])):
        if data['Street'][i] == 'Pave' and data['Alley'][i] == 'Pave' and data['LandContour'][i] == 'Lvl':
            data['easy_access'][i] = 0
        else:
            data['easy_access'][i] = 1

In [29]:
Easy_Access(train_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['easy_access'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['easy_access'][i] = 0


In [30]:
def Extra_Amenities(data: pd.DataFrame) -> None:
    data['extra_amenities'] = 0
    for i in range(len(data['GarageType'])):
        if data['GarageType'][i] != 'No Garage' or data['PoolQC'][i] != 'No Pool':
            data['extra_amenities'][i] = 0
        else:
            data['extra_amenities'][i] = 1

In [31]:
Extra_Amenities(train_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['extra_amenities'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['extra_amenities'][i] = 1


In [32]:
def long_street(data: pd.DataFrame) -> None:
    data['long_street'] = 0
    for i in range(len(data['LotFrontage'])):
        if data['LotFrontage'][i] > 80:
            data['long_street'][i] = 0
        else:
            data['long_street'][i] = 1

In [33]:
long_street(train_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['long_street'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['long_street'][i] = 0


In [34]:
from sklearn.model_selection import train_test_split
train_data, X_test, train_target, y_test = train_test_split(train_data, train_target, test_size=0.4, random_state=7)

SKLEARN PIPELINE

In [35]:
import category_encoders as ce
import config as cfg
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.linear_model import LinearRegression, RidgeCV

real_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
    ]
)

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]
)

preprocess_pipe = ColumnTransformer(transformers=[
    ('real_cols', real_pipe, cfg.REAL_COLS),
    ('cat_cols', cat_pipe, cfg.CAT_COLS),
    ]
)

base_model = LinearRegression(positive=True)

model = Pipeline([
    ('preprocess', preprocess_pipe),
    ('model', base_model)
    ]
)

In [36]:
model.fit(train_data,train_target)

In [37]:
res_pred = model.predict(X_test)

In [38]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, test_score)

NameError: name 'X_train' is not defined

In [39]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [40]:
mean_absolute_error(y_test, res_pred)

1.1616360467623412e+16

In [41]:
mean_squared_error(y_test, res_pred)

1.475419709401162e+34

In [42]:
r2_score(y_test, res_pred)

-2.2010491852541612e+24

In [43]:
from catboost import CatBoostRegressor
from config import CAT_COLS
import category_encoders as ce
import config as cfg
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import GridSearchCV

real_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
    ]
)

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]
)

preprocess_pipe = ColumnTransformer(transformers=[
    ('real_cols', real_pipe, cfg.REAL_COLS),
    ('cat_cols', cat_pipe, cfg.CAT_COLS),
    ]
)

base_model = CatBoostRegressor(iterations=1000,
                          learning_rate=1,
                          depth=2)

rscv = GridSearchCV(
    estimator=base_model,
    param_grid={'learning_rate': [0.03, 0.1],
                'depth': [2, 4],
                'l2_leaf_reg': [0.2, 0.5],
                'model_size_reg': [0.5, 1]},
    scoring='explained_variance',
    cv=5,
    refit=True
)

model = Pipeline([
    ('preprocess', preprocess_pipe),
    ('model', rscv)
    ]
)

In [44]:
model.fit(train_data, train_target)
# Get predictions
preds = model.predict(X_test)

0:	learn: 77690.4978355	total: 112ms	remaining: 1m 51s
1:	learn: 76560.2716778	total: 115ms	remaining: 57.4s
2:	learn: 75264.3331777	total: 118ms	remaining: 39.2s
3:	learn: 74259.2334326	total: 121ms	remaining: 30s
4:	learn: 73056.7179074	total: 123ms	remaining: 24.5s
5:	learn: 72005.0509105	total: 126ms	remaining: 20.8s
6:	learn: 70965.5589069	total: 128ms	remaining: 18.2s
7:	learn: 69923.6004864	total: 131ms	remaining: 16.3s
8:	learn: 68880.1423512	total: 134ms	remaining: 14.7s
9:	learn: 67882.7056638	total: 136ms	remaining: 13.5s
10:	learn: 66844.5166177	total: 138ms	remaining: 12.4s
11:	learn: 65793.7056710	total: 140ms	remaining: 11.6s
12:	learn: 64965.9374629	total: 142ms	remaining: 10.8s
13:	learn: 63967.3881698	total: 144ms	remaining: 10.1s
14:	learn: 63112.0600383	total: 146ms	remaining: 9.56s
15:	learn: 62202.6842997	total: 147ms	remaining: 9.07s
16:	learn: 61398.0437255	total: 149ms	remaining: 8.62s
17:	learn: 60558.7915880	total: 150ms	remaining: 8.21s
18:	learn: 59741.3383

In [45]:
mean_absolute_error(y_test, preds)

15985.463471765317

In [46]:
mean_squared_error(y_test, preds)

943410288.6217557

In [47]:
r2_score(y_test, preds)

0.8592608981769596

In [48]:
from xgboost import XGBRegressor

In [49]:
from catboost import CatBoostRegressor
from config import CAT_COLS
import category_encoders as ce
import config as cfg
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import GridSearchCV

real_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
    ]
)

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]
)

preprocess_pipe = ColumnTransformer(transformers=[
    ('real_cols', real_pipe, cfg.REAL_COLS),
    ('cat_cols', cat_pipe, cfg.CAT_COLS),
    ]
)

model = Pipeline([
    ('preprocess', preprocess_pipe),
    ('model', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=1, n_estimators=100,
       n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1))
    ]
)

In [51]:
model.fit(train_data, train_target)
# Get predictions
preds = model.predict(X_test)



In [52]:
mean_absolute_error(y_test, preds)

18281.613308005137

In [53]:
mean_squared_error(y_test, preds)

957274210.8817519

In [54]:
r2_score(y_test, preds)

0.8571926612813594