# Residency Valuation using Advanced Regression Modeling

#### Importing Required libraries

In [367]:
import numpy as np
import csv
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_squared_error

In [368]:
## reading csv 
train = pd.read_csv('train.csv')                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
test = pd.read_csv('test.csv')


 ### performing effectrive feature Engineering

In [369]:
train.fillna(0,inplace=True)      
test.fillna(0,inplace=True)

y_train = train['SalePrice']

# ONE-HOT encoding categorical features

traincol = train.select_dtypes(include = ['object']).columns.tolist()
testcol= test.select_dtypes(include = ['object']).columns.tolist()

train_int =train.select_dtypes(include = ['int']).columns.tolist()
train = pd.get_dummies(train,traincol,).astype(int)
X_test = pd.get_dummies(test,testcol,).astype(int)

# Droping the target variable from the features
X_train = train.drop('SalePrice', axis = 1)




In [370]:
def FIXfeat(df):
    df['TotalSqFt'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']


    df['HasGarage'] = ((df['GarageArea'] > 0) | (df['GarageCars'] > 0)).astype(int)

    df['HasPool'] = (df['PoolArea'] > 0).astype(int)

    df['TotalBathrooms'] = df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath'])

    df['TotalPorchArea'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']

    df['HasFireplace'] = (df['Fireplaces'] > 0).astype(int)
    
    return df

X_train,X_test = X_train.align(X_test,fill_value=0)

X_train = FIXfeat(X_train)
X_test = FIXfeat(X_test)


Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley_0,Alley_Grvl,Alley_Pave,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,...,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,TotalSqFt,HasGarage,HasPool,TotalBathrooms,TotalPorchArea,HasFireplace
0,856,854,0,1,0,0,3,1,0,0,...,0,2003,2003,2008,2566,1,0,3.5,61,0
1,1262,0,0,1,0,0,3,1,0,0,...,298,1976,1976,2007,2524,1,0,2.5,0,1
2,920,866,0,1,0,0,3,1,0,0,...,0,2001,2002,2008,2706,1,0,3.5,42,1
3,961,756,0,1,0,0,3,1,0,0,...,0,1915,1970,2006,2473,1,0,2.0,307,1
4,1145,1053,0,1,0,0,4,1,0,0,...,192,2000,2000,2008,3343,1,0,3.5,84,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,953,694,0,1,0,0,3,1,0,0,...,0,1999,2000,2007,2600,1,0,2.5,40,1
1456,2073,0,0,1,0,0,3,1,0,0,...,349,1978,1988,2010,3615,1,0,3.0,0,1
1457,1188,1152,0,1,0,0,4,1,0,0,...,0,1941,2006,2010,3492,1,0,2.0,60,1
1458,1078,0,0,1,0,0,2,1,0,0,...,366,1950,1996,2010,2156,1,0,2.0,112,0


### Creating a evaluation Metric Function

In [371]:
def rmse_log(y_true, y_pred):
    log_y_true = np.log1p(y_true)
    log_y_pred = np.log(y_pred)
    squared_diff = (log_y_true - log_y_pred) ** 2
    
    mean_squared_error = np.mean(squared_diff)
    
    rmse = np.sqrt(mean_squared_error)
    return rmse


In [372]:
## spliting the data set to train and test
X_train,X_,y_train,y_=train_test_split(X_train,y_train,test_size=0.2,random_state=42)


254     145000
1066    178000
638      85000
799     175000
380     127000
         ...  
1095    176432
1130    135000
1294    115000
860     189950
1126    174000
Name: SalePrice, Length: 1168, dtype: int64

In [374]:
## found the best_params by implementing GridSearchCV and fetching best_params_

best_params = {'subsample': 0.8,
                 'n_estimators': 170,
                 'min_child_weight': 1, 
                 'max_depth': 3, 
                 'learning_rate': 0.05, 
                 'gamma': 0.1,
                 'colsample_bytree': 0.8}

xgb = XGBRegressor(**best_params).fit(X_train,y_train)
yhat = xgb.predict(X_test)
yt= xgb.predict(X_train)
yv= xgb.predict(X_)



print(rmse_log(np.array(y_train).tolist(),yt.tolist()),rmse_log(np.array(y_).tolist(),yv.tolist()))

param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50, 25)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [100, 200, 300],
}

xgb = MLPRegressor()
model= GridSearchCV(xgb,param_grid=param_dist,scoring='neg_root_mean_squared_log_error',verbose=1,n_jobs=-1,cv=5).fit(X_train,y_train)
best_p = model.best_params_
xgb = MLPRegressor(**best_p).fit(X_train,y_train)
yhat = xgb.predict(X_test)
yt= xgb.predict(X_train)
yv= xgb.predict(X_)



print(rmse_log(np.array(y_train).tolist(),yt.tolist()),rmse_log(np.array(y_).tolist(),yv.tolist()))

0.09228961465859144 0.13902396144465085
Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


KeyboardInterrupt: 

### saving the Results

In [331]:
import csv
with open('submission.csv', 'w', newline='') as csvfile:
   
    submit = csv.writer(csvfile, delimiter = ',')
    
    submit.writerow(['Id','SalePrice'])
    
    for ids,predict in zip(test['Id'],yhat):
        submit.writerow([ids,predict])
        