In [None]:
import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import config_context
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor


from sklearn.linear_model import RidgeCV
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

In [None]:
# Input files are read relative to the notebook's working directory.
data = "train.csv"

# Load the training set, the Kaggle test set, and the sample submission in one pass.
HP, HP_kaggle, HP_sub = (
    pd.read_csv(csv_path)
    for csv_path in (data, "test.csv", "sample_submission.csv")
)

HP.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Target vector. Take an explicit copy: `y` is later modified in place by
# drop_outliers(), and it must not alias (or warn about aliasing) a column of HP.
y = HP['SalePrice'].copy()



In [None]:
# Submission skeleton holding only the Kaggle ids. The explicit copy makes it an
# independent frame, so adding a 'SalePrice' column later does not trigger
# pandas' SettingWithCopyWarning.
new_submission = HP_kaggle[["Id"]].copy()

In [None]:
def preproces_data(x):
  """Return the non-object columns of ``x`` with missing values set to 0.

  The 'Id' and 'SalePrice' columns are removed from the result when present.
  The input frame itself is left untouched.

  Note: a later cell in this notebook defines ``preproces_data`` again; once
  that cell has run, this version is shadowed.
  """
  numeric_only = x.select_dtypes(exclude=['object']).fillna(0)
  to_remove = [name for name in ('Id', 'SalePrice') if name in numeric_only.columns]
  return numeric_only.drop(columns=to_remove)

In [None]:
def fill_nulls_with_mean(dataframe):
    """Fill missing values in float64/int64 columns with the column mean.

    The frame is modified in place and also returned, so the call can be
    used either as a statement or as an expression.

    Args:
        dataframe: pandas DataFrame to clean.

    Returns:
        The same ``dataframe`` object, with NaNs in its float64/int64
        columns replaced by that column's mean. Other columns are untouched.
    """
    for column in dataframe.columns:
        if dataframe[column].dtype in ['float64', 'int64']:
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selected column: the chained
            # in-place form does not update the parent frame when pandas
            # copy-on-write is active.
            dataframe[column] = dataframe[column].fillna(dataframe[column].mean())
    return dataframe

In [None]:
def transform_categoricals_to_dummies(origin_df, columns, merge_with):
  """One-hot encode ``columns`` of ``origin_df`` and attach them to ``merge_with``.

  Args:
    origin_df: frame that contains the categorical columns.
    columns: list of column names in ``origin_df`` to expand with
      ``pd.get_dummies``.
    merge_with: frame the dummy columns are merged into, matched on index.

  Returns:
    A new frame: ``merge_with`` outer-merged (index to index) with the
    dummy columns.
  """
  return merge_with.merge(
      pd.get_dummies(origin_df[columns]),
      how='outer',
      left_index=True,
      right_index=True,
  )

In [None]:
def process_nulls(dataframe):
  """Replace NaNs in float64/int64 columns with the column mean.

  The frame is modified in place and also returned.

  Args:
    dataframe: pandas DataFrame to clean.

  Returns:
    The same ``dataframe`` object after filling.
  """
  for col in dataframe.columns:
    if dataframe[col].dtype in ['float64', 'int64']:
      # Write the filled column back explicitly; fillna(inplace=True) on the
      # selected column is a chained assignment that leaves the parent frame
      # unchanged when pandas copy-on-write is active.
      dataframe[col] = dataframe[col].fillna(dataframe[col].mean())
  return dataframe

In [None]:
def map_categoricals(origin_df, merge_with):
  """Encode selected categorical columns as integer codes and merge them into ``merge_with``.

  Each listed column of ``origin_df`` is passed through a fixed
  category -> integer dictionary. Values that have no entry in the
  dictionary (including missing values) end up as 0 via the final
  ``fillna(0)``.

  Args:
    origin_df: frame that contains the raw categorical columns read below.
    merge_with: frame the encoded columns are merged into, matched on index.

  Returns:
    A new frame: ``merge_with`` outer-merged (index to index) with the
    encoded columns.
  """

  # Start from a column-less frame that shares origin_df's index.
  output_df = pd.DataFrame(index=origin_df.index)
  output_df.loc[:, ['Utilities']] = origin_df['Utilities'].map({'ELO': 0, 'NoSeWa': 1, 'NoSewr': 2, 'AllPub': 3})
  output_df.loc[:, ['LandContour']] = origin_df['LandContour'].map({'Low': 0, 'HLS': 1, 'Bnk': 2, 'Lvl': 3})
  output_df.loc[:, ['LandSlope']] = origin_df['LandSlope'].map({'Sev': 0, 'Mod': 1, 'Gtl': 2})
  # NOTE(review): category spellings here are assumed to match the CSV exactly;
  # any value spelled differently in the data silently becomes 0 — verify
  # against origin_df['BldgType'].unique().
  output_df.loc[:, ['BldgType']] = origin_df['BldgType'].map({'TwnhsI': 0, 'TwnhsE': 1, 'Duplx': 2, '2FmCon': 3, '1Fam': 4})
  output_df.loc[:, ['HouseStyle']] = origin_df['HouseStyle'].map({'1Story': 0, '1.5Fin': 1, '1.5Unf': 2, '2Story': 3, '2.5Fin': 4,'2.5Unf': 5, 'SFoyer': 6, 'SLvl': 7})
  # NOTE(review): the output column is named 'aExterQual' (leading 'a') while
  # the source column is 'ExterQual' — possibly a typo; confirm before renaming,
  # since the name becomes a model feature name.
  output_df.loc[:, ['aExterQual']] = origin_df['ExterQual'].map({'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
  output_df.loc[:, ['ExterCond']] = origin_df['ExterCond'].map({'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
  # NOTE(review): the 'NA' keys below (and 'NaN' for PoolQC) only match those
  # literal strings. If the CSV loader has already converted them to missing
  # values (TODO confirm), these keys never match and the 0 code comes from
  # the fillna(0) at the end instead.
  output_df.loc[:, ['BsmtCond']] = origin_df['BsmtCond'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
  output_df.loc[:, ['HeatingQC']] = origin_df['HeatingQC'].map({'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
  output_df.loc[:, ['CentralAir']] = origin_df['CentralAir'].map({'N': 0, 'Y': 1})
  output_df.loc[:, ['KitchenQual']] = origin_df['KitchenQual'].map({'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
  output_df.loc[:, ['GarageFinish']] = origin_df['GarageFinish'].map({'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3})
  output_df.loc[:, ['GarageQual']] = origin_df['GarageQual'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
  output_df.loc[:, ['GarageCond']] = origin_df['GarageCond'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
  output_df.loc[:, ['PavedDrive']] = origin_df['PavedDrive'].map({'N': 0, 'P': 1, 'Y': 2})
  output_df.loc[:, ['PoolQC']] = origin_df['PoolQC'].map({'NaN': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
  output_df.loc[:, ['Fence']] = origin_df['Fence'].map({'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4})

  # Everything still NaN here (missing or unmapped values) is encoded as 0.
  # Note that 0 is also a real category code in several mappings above
  # (e.g. 'ELO', 'Low', 'Po'), so the two cases are indistinguishable afterwards.
  output_df.fillna(0, inplace=True)

  # Attach the encoded columns to the caller's frame, matching rows by index.
  output_df = merge_with.merge(output_df, left_index=True, right_index=True, how='outer')

  return output_df

In [None]:
def preproces_data(x):
  """Build the model feature frame from a raw housing frame.

  Steps:
    1. Drop the 'Id' and 'SalePrice' columns when present.
    2. Keep the non-object columns and fill their NaNs with column means
       (``process_nulls``).
    3. Append one-hot columns for 'Neighborhood', 'Condition1', 'RoofStyle'
       and 'MasVnrType' (``transform_categoricals_to_dummies``).
    4. Append the integer-coded categorical columns (``map_categoricals``).

  Args:
    x: raw DataFrame as loaded from the CSV. It is NOT modified; earlier
      versions dropped 'Id'/'SalePrice' from the caller's frame in place,
      which made the notebook cells non-re-runnable.

  Returns:
    A new DataFrame of features sharing ``x``'s index.
  """
  # errors='ignore' keeps this working for frames that lack either column
  # (e.g. a test set without 'SalePrice').
  features = x.drop(columns=['Id', 'SalePrice'], errors='ignore')

  # Work on an explicit copy so the in-place filling done by process_nulls
  # neither aliases `features` nor raises chained-assignment warnings.
  x_numericals = process_nulls(features.select_dtypes(exclude=['object']).copy())

  x_output = transform_categoricals_to_dummies(features, ['Neighborhood', 'Condition1', 'RoofStyle', 'MasVnrType'], x_numericals)
  x_output = map_categoricals(features, x_output)

  return x_output


In [None]:
def add_special_difference(dataframe, col_1, col_2, new_col_name=None):
    """Add a column holding ``dataframe[col_1] - dataframe[col_2]``.

    Args:
        dataframe: pandas DataFrame to extend (modified in place).
        col_1: name of the minuend column.
        col_2: name of the subtrahend column.
        new_col_name: name for the new column. When omitted, the name is
            requested interactively via ``input()`` (the original behaviour);
            pass it explicitly so the notebook can run unattended.

    Returns:
        The same ``dataframe`` object with the new column added.
    """
    if new_col_name is None:
        new_col_name = input("Name new column: ")

    # Write straight to the final name instead of going through a temporary
    # "new_col" column, which could clobber an existing column of that name.
    dataframe[new_col_name] = dataframe[col_1] - dataframe[col_2]

    return dataframe

In [None]:
def drop_outliers(df, outliers_ids):
  """Remove the rows labelled ``outliers_ids`` from ``df`` in place.

  Args:
    df: pandas DataFrame or Series to prune; it is mutated.
    outliers_ids: index labels of the rows to drop. These are matched
      against ``df.index``, not against any 'Id' column.

  Returns:
    The same ``df`` object, for convenience. (Previously the function
    returned None and bound the None result of the in-place drop to an
    unused local.)

  Raises:
    KeyError: if any label is not present in ``df.index``.
  """
  df.drop(index=outliers_ids, inplace=True)
  return df

In [None]:
# Row labels removed from the training data by drop_outliers().
# NOTE(review): these are matched against the DataFrame index, which starts at 0;
# if the values were picked from the 1-based 'Id' column they are off by one — confirm.
outliers_id = [
    186, 250, 314, 336, 379,
    458, 598, 691, 692, 707,
    739, 770, 899, 935, 955,
    1170, 1182, 1183, 1299,
]

In [None]:
# 1. Map categorical values, preprocess the data

X, X_kaggle = preproces_data(HP), preproces_data(HP_kaggle)

# Remove the same outlier rows from the features and the target so they stay aligned.
for training_part in (X, y):
    drop_outliers(training_part, outliers_id)

In [None]:
# Derive the same engineered columns on the training and the Kaggle feature frames.
for frame in (X, X_kaggle):
    # Years between construction and sale.
    frame["property_age"] = frame["YrSold"] - frame["YearBuilt"]
    # Above-ground floor area plus finished basement area.
    frame['totalsf'] = frame['1stFlrSF'] + frame['2ndFlrSF'] + frame['BsmtFinSF1'] + frame['BsmtFinSF2']
    frame['totalarea'] = frame['GrLivArea'] + frame['TotalBsmtSF']
    # Half baths count as 0.5 of a full bath.
    frame['totalbaths'] = frame['BsmtFullBath'] + frame['FullBath'] + 0.5 * (frame['BsmtHalfBath'] + frame['HalfBath'])

In [None]:
# Random Forest model: hyper-parameter search with 5-fold cross-validation.

# Create the train/test split in this cell: the search below needs X_train / y_train,
# which were previously only defined by a later cell, so a fresh
# "Restart & Run All" failed here with a NameError. The seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

RFR = RandomForestRegressor(random_state=13)

param_grid_RFR = {
    'max_depth': [5, 10, 15],
    'n_estimators': [100, 250, 500],
    'min_samples_split': [3, 5, 10]
}

# scoring is the negated MSE, so higher is better for GridSearchCV.
rfr_cv = GridSearchCV(RFR, param_grid_RFR, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

rfr_cv.fit(X_train, y_train)

# Flip the sign back and take the square root to report RMSE in price units.
best_rmse = np.sqrt(-1 * rfr_cv.best_score_)
print(f"Best RMSE: {best_rmse}")

print(f"Best Parameters: {rfr_cv.best_params_}")

In [None]:
# Seed both the split and the forest so the score below is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Hyper-parameters are hard-coded here — presumably copied from an earlier
# grid-search run; confirm against rfr_cv.best_params_.
model_rfr = RandomForestRegressor(max_depth=15, min_samples_split=5, n_estimators=250, random_state=13)
model_rfr.fit(X_train, y_train)

y_pred_rfr = model_rfr.predict(X_test)

# If a model can produce non-positive predictions, replace them before taking the log, e.g.
#   np.where(y_pred <= 0, np.mean(y_train), y_pred)

# RMSE of the log prices. np.sqrt(MSE) is used instead of
# mean_squared_error(..., squared=False), whose `squared` argument is deprecated
# and removed in recent scikit-learn releases.
np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_rfr)))

0.12863188004366016

In [None]:
y_pred_final = model_rfr.predict(X_kaggle)
# assign() returns a new frame instead of writing into new_submission (a slice of
# HP_kaggle), which is what raised pandas' SettingWithCopyWarning.
new_submission = new_submission.assign(SalePrice=y_pred_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_submission.loc[:, 'SalePrice'] = y_pred_final


In [None]:
# Write the random-forest submission file without the index column.
new_submission.to_csv(
    path_or_buf="random_forest_regressor_submission_new_feat.csv",
    index=False,
)

In [None]:
# Random forest combined with ridge regression via a stacking regressor

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# Seed the split and the forest so the reported score is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

model_rfr = RandomForestRegressor(max_depth=15, min_samples_split=5, n_estimators=250, random_state=13)
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0])

# Single base learner (the forest); the ridge meta-model is fitted on the
# forest's cross-validated predictions (cv=5).
stacking_regressor = StackingRegressor(estimators=[('rf', model_rfr)], final_estimator=ridge, cv=5)
stacking_regressor.fit(X_train, y_train)

y_train_pred = stacking_regressor.predict(X_train)
y_test_pred = stacking_regressor.predict(X_test)

# Replace non-positive values with the smallest positive float so np.log stays
# finite (the ridge meta-model is linear and is not constrained to positive output).
y_test_pred_log = np.log(np.where(y_test_pred <= 0, np.nextafter(0, 1), y_test_pred))
y_test_log = np.log(np.where(y_test <= 0, np.nextafter(0, 1), y_test))

In [None]:
# RMSLE = square root of the mean squared error between the log prices.
mse_of_logs = mean_squared_error(y_test_log, y_test_pred_log)
rmsle = np.sqrt(mse_of_logs)
print(f"Root Mean Squared Log Error on test set: {rmsle}")

Root Mean Squared Log Error on test set: 0.12278332454223094


In [None]:
y_pred_final = stacking_regressor.predict(X_kaggle)
# assign() returns a new frame instead of writing into new_submission (a slice of
# HP_kaggle), which is what raised pandas' SettingWithCopyWarning.
new_submission = new_submission.assign(SalePrice=y_pred_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_submission.loc[:, 'SalePrice'] = y_pred_final


In [None]:
# Write the stacked-model submission file without the index column.
# The file name previously contained a ':' ("..._and:ridge.csv"), which is not a
# valid file-name character on Windows; it is replaced with '_'.
new_submission.to_csv("random_forest_regressor_submission_new_feat_and_ridge.csv", index = False)