In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers

In [None]:
# Load the competition data; the 'Id' column becomes the row index of each frame.
X = pd.read_csv('../input/home-data-for-ml-course/train.csv', index_col='Id')      # training rows
X_test = pd.read_csv('../input/home-data-for-ml-course/test.csv', index_col='Id')  # rows to predict

# Quick look at both frames: shapes first, then column labels, then a preview
# of the training rows.
for frame in (X, X_test):
    print(frame.shape)
for frame in (X, X_test):
    print(frame.columns)
print(X.head())

In [None]:
# Rows whose target is missing cannot be used for training, so remove them
# from X in place before splitting the target off.
target_column = 'SalePrice'
X.dropna(subset=[target_column], axis=0, inplace=True)

# pop() removes the target column from X and returns it as a Series.
y = X.pop(target_column)

for summary in (X.shape, y.head()):
    print(summary)

In [None]:
# Report the cardinality (number of distinct values) of every categorical
# (object-dtype) column.
for column in X.columns:
    if X[column].dtype == 'object':
        print(column, X[column].nunique(), sep='-')

# Drop categorical columns with more than MAX_CARDINALITY distinct values from
# both X and X_test (presumably to keep the later one-hot encoding small).
# NOTE: the collected names are stored in `high_cardinality_columns`; the
# previous name `list` shadowed the builtin and broke later `list(...)` calls.
MAX_CARDINALITY = 15
high_cardinality_columns = [
    column for column in X.columns
    if X[column].dtype == 'object' and X[column].nunique() > MAX_CARDINALITY
]

print(len(high_cardinality_columns))
X.drop(high_cardinality_columns, axis=1, inplace=True)
X_test.drop(high_cardinality_columns, axis=1, inplace=True)
print(X.shape, X_test.shape)

# Drop columns with more than MAX_MISSING_VALUES null entries in the training
# data. The decision is made on X only and then mirrored on X_test so both
# frames keep the same columns.
MAX_MISSING_VALUES = 650
missing_counts = X.isnull().sum()
missing_columns = missing_counts[missing_counts > MAX_MISSING_VALUES].index.tolist()
print(missing_columns)
X.drop(missing_columns, axis=1, inplace=True)
X_test.drop(missing_columns, axis=1, inplace=True)
print(X.shape, X_test.shape)

In [None]:
# Partition the remaining columns by dtype: 'object' columns are treated as
# categorical and int64/float64 columns as numerical. A column of any other
# dtype ends up in neither list.
categorical_columns = [name for name in X.columns if X[name].dtype == 'object']
numerical_columns = [
    name for name in X.columns
    if X[name].dtype in ['int64', 'float64']
]

for group in (numerical_columns, categorical_columns):
    print(len(group))

In [None]:
def _impute_with_train_statistics(train_df, test_df, columns, strategy):
    """Fill missing values in `columns` of both frames, in place.

    For each column a SimpleImputer with the given `strategy` is fitted on
    `train_df` only and then applied to `test_df`, so both frames are filled
    with the same value and no test-set statistics enter the preprocessing.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to modify; both must contain every name in `columns`.
    columns : iterable of str
        Column labels to impute.
    strategy : str
        SimpleImputer strategy, e.g. 'mean' or 'most_frequent'.
    """
    for column in columns:
        imputer = SimpleImputer(strategy=strategy)
        # SimpleImputer works on 2-D input, hence the [[column]] selection;
        # [:, 0] turns the single-column result back into a 1-D array.
        train_df[column] = imputer.fit_transform(train_df[[column]])[:, 0]
        test_df[column] = imputer.transform(test_df[[column]])[:, 0]


# Numerical columns: fill gaps with the training mean (SimpleImputer's default).
_impute_with_train_statistics(X, X_test, numerical_columns, strategy='mean')

# Categorical columns: fill gaps with the most frequent training category.
_impute_with_train_statistics(X, X_test, categorical_columns, strategy='most_frequent')


In [None]:
# Box-plot every numerical column to spot outliers visually.
# The grid size is derived from the number of columns instead of assuming
# exactly 36, and `numerical_columns` is left untouched (it stays a flat list)
# so this cell can be re-run and later cells can still use it.
n_plot_cols = 3
n_plot_rows = max(1, int(np.ceil(len(numerical_columns) / n_plot_cols)))
row_height = 50 / 12  # reproduces the original 30x50 figure for a 12x3 grid
fig, axes = plt.subplots(n_plot_rows, n_plot_cols,
                         figsize=(30, row_height * n_plot_rows), squeeze=False)
flat_axes = axes.ravel()
for axis, column in zip(flat_axes, numerical_columns):
    sns.boxplot(y=X[column], ax=axis)
# Hide any unused panels when the column count is not a multiple of the grid width.
for axis in flat_axes[len(numerical_columns):]:
    axis.set_visible(False)

# Remove outliers: a row is dropped when it exceeds the limit in any of these
# columns. One mask is applied to X and y so they stay aligned.
outlier_limits = {
    'LotFrontage': 250,
    'BsmtFinSF1': 4000,
    'LotArea': 100000,
    'TotalBsmtSF': 4000,
    'GrLivArea': 4000,
}
is_outlier = pd.Series(False, index=X.index)
for column, limit in outlier_limits.items():
    is_outlier |= X[column] > limit

# .copy() gives independent objects so later in-place edits do not act on a view.
keep_rows = (~is_outlier).to_numpy()
X = X.loc[keep_rows].copy()
y = y.loc[keep_rows].copy()
print(X.shape, X_test.shape, y.shape)

In [None]:
# One-hot encode the categorical columns.
# The encoder is fitted on the training data only; categories that appear
# only in the test data are encoded as all zeros (handle_unknown='ignore').
#
# Compatibility: the `sparse=False` argument and `get_feature_names` were
# removed in newer scikit-learn releases. Using the default sparse output with
# .toarray(), and preferring get_feature_names_out when present, runs on both
# old and new versions.
ohec = OneHotEncoder(handle_unknown='ignore')
train_encoded = ohec.fit_transform(X[categorical_columns]).toarray()
test_encoded = ohec.transform(X_test[categorical_columns]).toarray()

if hasattr(ohec, 'get_feature_names_out'):
    encoded_names = ohec.get_feature_names_out(categorical_columns)
else:
    encoded_names = ohec.get_feature_names(categorical_columns)

# Encoded frames reuse the index of the frame they came from, so the concat
# below aligns row by row.
X_cat = pd.DataFrame(train_encoded, columns=encoded_names, index=X.index)
Xtest_cat = pd.DataFrame(test_encoded, columns=encoded_names, index=X_test.index)

# Replace the raw categorical columns with their one-hot encoded versions.
X = pd.concat([X.drop(columns=categorical_columns), X_cat], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_columns), Xtest_cat], axis=1)


In [None]:
# Split the training data into a training part (95%) and a validation part (5%).
# The split and the forest are both seeded so the reported MAE is reproducible
# across "Restart & Run All".
RANDOM_STATE = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.95, random_state=RANDOM_STATE
)

# Fit a random forest baseline and report its validation mean absolute error.
model = RandomForestRegressor(random_state=RANDOM_STATE)
model.fit(X_train, y_train)
pred_valid = model.predict(X_valid)
mae = mean_absolute_error(y_valid, pred_valid)
print('MAE: ', mae )
   

In [None]:
# Input shape = number of features per sample (columns of X_train).
# The previous code used shape[0], the number of rows, and attached it to the
# second layer, where Sequential does not use it; it belongs on the first layer.
input_ = [X_train.shape[1]]

# Fully connected regression network: batch-normalised inputs followed by
# progressively narrower ReLU layers and a single linear output unit.
# NOTE: this rebinds `model`, replacing the random forest fitted earlier.
model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_),
    layers.Dense(units=64, activation='relu'),
    layers.Dense(units=32, activation='relu'),
    layers.Dense(units=16, activation='relu'),
    layers.Dense(units=8, activation='relu'),
    layers.Dense(units=1)
])
model.compile(optimizer='adam', loss='mae')

# Stop once the validation loss has not improved by at least min_delta for
# `patience` epochs, then restore the best weights seen.
earlystopping = EarlyStopping(min_delta=0.001, patience=20, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=128, epochs=1000,
                    callbacks=[earlystopping], verbose=0)

# Learning curves: training vs. validation loss per epoch.
ax = pd.DataFrame(history.history).plot()
ax.set(xlabel='epoch', ylabel='loss (MAE)')
plt.show()

In [None]:
# Predict on the test set with the most recently fitted `model` and write the
# submission file. The id column is named 'Id' to match the 'Id' column of the
# input CSVs; the previous 'ID' header did not match.
pred_test = model.predict(X_test)
# flatten() turns an (n, 1) prediction array into the 1-D column pandas expects.
submission = pd.DataFrame({'Id': X_test.index, 'SalePrice': pred_test.flatten()})
print(submission.head())
submission.to_csv('submission.csv', index=False)