In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
warnings.filterwarnings('ignore')

In [2]:
df1 = pd.read_csv('train.csv')

In [3]:
df1 = df1.drop(['Id'], axis = 1)

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [5]:
y = df1.iloc[ : , -1].values

In [6]:
print(len(y))

1460


In [7]:
df1 = df1.drop(['SalePrice'], axis = 1)

In [8]:
# Adding the Test file
dd = pd.read_csv('test.csv')
df1 = pd.concat([df1, dd], ignore_index = True)

In [9]:
df1 = df1.drop(['Id'], axis = 1)

#### Finding and deleting the Columns having NaN values greater than half

In [10]:
null_column = df1.columns[df1.isnull().sum() > (len(df1) // 2)]
print(null_column)

Index(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')


In [11]:
df3 = df1.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis = 1)
df3.shape

(2919, 75)

#### Changing NaN values of the Categorical data with mode of the particular Column

In [12]:
categorical_column_with_null = df3.columns[(df3.isnull().any()) & (df3.dtypes == 'object')]
print(categorical_column_with_null)

Index(['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType'],
      dtype='object')


In [13]:
def replacing_val_mod(df3):
    for col in categorical_column_with_null:
        df3[col] = df3[col].fillna(df3[col].mode()[0])
    return df3
df4 = replacing_val_mod(df3)
df4.shape

(2919, 75)

#### Changing NaN values of the Int and Float columns to their mean

In [14]:
numerical_column_with_null2 = df4.columns[(df4.isnull().any()) & ((df4.dtypes == 'float64') | (df4.dtypes == 'int64'))]
print(numerical_column_with_null2)

Index(['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea'],
      dtype='object')


In [15]:
def replacing_val_mean(df3):
    for col in numerical_column_with_null2:
        df3[col] = df3[col].fillna(df3[col].mean())
    return df3
df5 = replacing_val_mean(df4)
df5.shape

(2919, 75)

In [16]:
df5.isnull().values.any()

False

### Applying One Hot Encoding to all the Categorical Data

In [17]:
categorical_column = df5.columns[(df5.dtypes == 'object')]
categorical_column

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [18]:
# Not using One hot encoder again as it is frustating I have just working to convert numpy array back to dataframe for hours and it is not working
# So instead just using the Pandas dummies

def one_hot_encoding(categorical_columns, df):
    df_final = df
    i = 0
    for col in categorical_columns:
        dff = pd.get_dummies(df[col], drop_first=True)
        df = df.drop([col], axis = 1)
        if i == 0:
            i = i + 1
            df_final = dff.copy()
        else:
            df_final = pd.concat([df_final, dff], axis = 1)
    
    df_final = pd.concat([df_final, df], axis = 1)
    return df_final

df6 = one_hot_encoding(categorical_column, df5)

In [19]:
df6.head()

Unnamed: 0,FV,RH,RL,RM,Pave,IR2,IR3,Reg,HLS,Low,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0,0,1,0,1,0,0,1,0,0,...,548.0,0,61,0,0,0,0,0,2,2008
1,0,0,1,0,1,0,0,1,0,0,...,460.0,298,0,0,0,0,0,0,5,2007
2,0,0,1,0,1,0,0,0,0,0,...,608.0,0,42,0,0,0,0,0,9,2008
3,0,0,1,0,1,0,0,0,0,0,...,642.0,0,35,272,0,0,0,0,2,2006
4,0,0,1,0,1,0,0,0,0,0,...,836.0,192,84,0,0,0,0,0,12,2008


In [20]:
final_df = df6.loc[ : , ~df6.columns.duplicated()]
final_df.shape

(2919, 176)

### Making two variables X for the training data and X_prediction as the test data

In [21]:
X = final_df.iloc[ :1460 , :].values
X_prediction = final_df.iloc[1459 : -1, : ]
print(X.shape)

(1460, 176)


# Applying Various Algorithms

## 1. XgBoost

In [22]:
from xgboost import XGBRegressor
classifier = XGBRegressor()

In [23]:
classifier.fit(X, y)

In [24]:
param_grid = {
    'n_estimators' : [200, 600, 900, 1200, 1600],
    'max_depth' : [2, 4, 7, 10, 16],
    'learning_rate' : [0.05, 0.1, 0.16, 0.2],
    'min_child_weight' : [1, 2, 3, 4],
    'booster' : ['gbtree', 'gblinear'],
    'base_score' : [0.25, 0.5, 0.75, 1]
}

In [25]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
model = RandomizedSearchCV(estimator = classifier,
                           param_distributions = param_grid, 
                           cv = 5, 
                           n_iter=50,
                           scoring = 'neg_mean_squared_error',
                           return_train_score=True,
                           random_state = 25
                          )

In [26]:
model.fit(X_train, y_train)

In [27]:
model.best_estimator_

In [28]:
xg_regressor = XGBRegressor(base_score=0.25, booster='gbtree', callbacks=None, colsample_bylevel=1,
                            colsample_bytree=1, feature_types=None, gamma=0,
                            reg_alpha = 1, reg_lambda = 1, silent = True, subsample = 1,
                            learning_rate=0.1, scale_pos_weight = 1, max_cat_threshold=None,
                            max_delta_step=0, max_depth=2, min_child_weight = 1,
                            multi_strategy=None, n_estimators=900, n_jobs=1,
                            random_state=0, objective='reg:linear'
                           )

In [29]:
xg_regressor.fit(X, y)

In [30]:
import pickle
filename = 'XGBoost_file.pkl'
pickle.dump(xg_regressor, open(filename, 'wb'))

In [31]:
y_pred_xg = xg_regressor.predict(X_prediction)
len(y_pred_xg)

1459

In [32]:
pred = pd.DataFrame(y_pred_xg)
sub_df = pd.read_csv('sample_submission.csv')
datasets = pd.concat([sub_df['Id'], pred], axis = 1)
datasets.columns = ['Id', 'SalePrice']
datasets.to_csv('sample_submission.csv', index = False)

## 2. Artificial Neural Network

In [33]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LeakyReLU, ReLU, ELU
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [34]:
from sklearn.metrics import mean_squared_error

In [35]:
ann = Sequential()

In [36]:
ann.add(Dense(units=176, activation='relu'))

In [37]:
ann.add(Dense(units=25, activation='relu'))

In [38]:
ann.add(Dense(units = 50, activation='relu'))

In [39]:
ann.add(Dense(units=1, activation = 'linear'))

In [40]:
ann.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics=['mean_absolute_error'])

In [None]:
ann.fit(X, y, validation_split=0.2 ,batch_size = 10, epochs = 1000)

In [43]:
ann_pred = ann.predict(X_prediction)

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [44]:
pred = pd.DataFrame(ann_pred)
sub_df = pd.read_csv('sample_submission.csv')
datasets = pd.concat([sub_df['Id'], pred], axis = 1)
datasets.columns = ['Id', 'SalePrice']
datasets.to_csv('sample_submission.csv', index = False)