In [None]:
pip install --pre pycaret

#   Import libraries

In [None]:
import pandas as pd 
import numpy as np
import pycaret
import seaborn as sns
import matplotlib.pyplot as plt

## Reading data

In [None]:
# Read the training and test CSV files from the relative input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prediction/train1.csv',
        '../input/house-prediction/test1.csv',
    )
)

In [None]:
# Use the 'Id' column as the row index of the test frame.
# Guarded so the cell can be re-run safely: after the first run 'Id' is no longer a
# column and an unconditional set_index('Id') would raise KeyError.
if 'Id' in df_test.columns:
    df_test = df_test.set_index('Id')
df_test.head()

## Checking for null values in the training target

In [None]:
# Number of missing entries in the SalePrice column of the training frame.
df_train['SalePrice'].isnull().sum()


# Finding the correlation between the features and the target SalePrice

In [None]:
# Pairwise correlations between the numeric columns only. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns on its own and raises if the
# frame contains any, so the numeric columns are selected explicitly.
corr = df_train.select_dtypes(include='number').corr()

# Correlation matrix restricted to the k columns most correlated with SalePrice
k = 10  # number of variables for heatmap
cols = corr.nlargest(k, 'SalePrice')['SalePrice'].index
# Reuse the coefficients computed above instead of recomputing them with np.corrcoef:
# pandas skips missing values pairwise, whereas np.corrcoef yields NaN for any column
# that contains a missing value.
cm = corr.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
# Scatter plot of GrLivArea (x axis) against SalePrice (y axis).
# Uses the explicit Axes interface and labels both axes so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
ax.set_title('SalePrice and GrLivArea')
ax.set_xlabel('GrLivArea')
ax.set_ylabel('SalePrice')
plt.show()

# Remove outliers

In [None]:
# Show the four training rows with the largest GrLivArea values.
df_train.sort_values('GrLivArea', ascending=False).head(4)


In [None]:
# Remove the training rows whose Id matches one of the four listed values,
# dropping them one Id at a time by their index labels.
for outlier_id in (1299, 524, 1183, 692):
    df_train = df_train.drop(df_train.index[df_train['Id'] == outlier_id])


In [None]:
# Scatter plot of GrLivArea (x axis) against SalePrice (y axis) for the current
# contents of df_train. Both axes are labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
ax.set_title('SalePrice and GrLivArea')
ax.set_xlabel('GrLivArea')
ax.set_ylabel('SalePrice')
plt.show()

In [None]:
# Correlations between the numeric columns only. Since pandas 2.0, DataFrame.corr()
# raises on non-numeric columns instead of skipping them, so select numerics explicitly.
corr = df_train.select_dtypes(include='number').corr()
# Correlation of every numeric column with SalePrice, strongest positive first.
corr['SalePrice'].sort_values(ascending=False)

# Remove features with low correlation to the target

In [None]:
# Labels of the rows whose correlation with SalePrice exceeds 0.3.
corr.loc[corr['SalePrice'] > 0.3].index

In [None]:
# Feature columns kept in both frames; the training frame additionally keeps the
# SalePrice target as its last column.
selected_features = [
    'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
]
df_train = df_train[selected_features + ['SalePrice']]
df_test = df_test[selected_features]

In [None]:
# Per-column count and fraction of missing values in the training frame,
# each sorted from most to fewest missing, combined into one summary table.
train_nulls = df_train.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

# Remove features with many missing values


In [None]:
# Drop every training column that has more than 81 missing values.
# The `columns=` keyword replaces the positional `axis` argument of DataFrame.drop,
# which was deprecated in pandas 1.3 and removed in pandas 2.0 (TypeError).
sparse_columns = missing_data.index[missing_data['Total'] > 81]
df_train = df_train.drop(columns=sparse_columns)

In [None]:
# Rebuild the missing-value summary for the current training frame:
# count and fraction of nulls per column, largest first.
is_missing = df_train.isnull()
total = is_missing.sum().sort_values(ascending=False)
percent = (is_missing.sum() / is_missing.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
# Missing-value summary for the test frame: count and fraction of nulls per column,
# each sorted from most to fewest missing.
test_nulls = df_test.isnull()
total_test = test_nulls.sum().sort_values(ascending=False)
percent_test = (test_nulls.sum() / test_nulls.count()).sort_values(ascending=False)
missing_data_test = pd.concat([total_test, percent_test], axis=1, keys=['Total', 'Percent'])
missing_data_test.head(20)

In [None]:
# Dealing with missing data: drop every test column with more than 78 missing values.
# The `columns=` keyword replaces the positional `axis` argument of DataFrame.drop,
# which was deprecated in pandas 1.3 and removed in pandas 2.0 (TypeError).
# NOTE(review): train and test columns are dropped using separate thresholds; confirm
# that both frames end up with the same feature columns.
sparse_test_columns = missing_data_test.index[missing_data_test['Total'] > 78]
df_test = df_test.drop(columns=sparse_test_columns)

In [None]:
# Null count per test column, largest first, limited to the first 20 entries.
df_test.isna().sum().sort_values(ascending=False).iloc[:20]


# Filling missing values

In [None]:
# Impute missing GarageYrBlt / MasVnrArea values with column means.
# The means are computed on the training frame only and reused for the test frame, so
# both frames go through the same transformation (previously the test frame was filled
# with its own means, making the preprocessing of train and test inconsistent).
impute_columns = ['GarageYrBlt', 'MasVnrArea']
train_means = df_train[impute_columns].mean()

# fillna with a Series fills each column using the value stored under its label.
df_train[impute_columns] = df_train[impute_columns].fillna(train_means)
df_test[impute_columns] = df_test[impute_columns].fillna(train_means)

In [None]:
# One-hot encode the training frame with pandas' default get_dummies settings.
# NOTE(review): train and test are encoded separately; confirm both end up with
# matching feature columns.
df_train = pd.get_dummies(data=df_train)

In [None]:
# One-hot encode the test frame with pandas' default get_dummies settings.
df_test = pd.get_dummies(data=df_test)

In [None]:
# Percentage of missing values per test column, keeping only the columns whose
# percentage is non-zero, sorted from highest to lowest.
all_data_na = df_test.isnull().sum() / len(df_test) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head()

In [None]:
# Replace missing values with 0 in the four listed test columns.
for col in ('GarageArea', 'GarageCars', 'BsmtFinSF1', 'TotalBsmtSF'):
    df_test[col] = df_test[col].fillna(0)

In [None]:
# Recompute the percentage of missing values per test column; only columns with a
# non-zero percentage are kept, highest first.
missing_share = df_test.isnull().sum() / len(df_test)
all_data_na = missing_share * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head()

In [None]:
# Preview the first five rows of the prepared training frame.
df_train.head(5)

# Use pycaret to predict

In [None]:
# Import only the PyCaret regression functions this notebook calls, instead of a
# star import that pulls every public name into the notebook namespace.
from pycaret.regression import (
    compare_models,
    create_model,
    evaluate_model,
    predict_model,
    setup,
)

In [None]:
# Initialise the PyCaret regression experiment on the training frame,
# with SalePrice as the target column.
exp_reg101 = setup(
    data=df_train,
    target='SalePrice',
    session_id=123,
)

In [None]:
# Run PyCaret's model comparison and keep its result.
best = compare_models()

In [None]:
# Create the model registered under the 'catboost' id.
catboost = create_model('catboost')

In [None]:
# Display PyCaret's evaluation output for the model created above.
evaluate_model(estimator=catboost)

In [None]:
# Generate predictions for the test frame and preview the first 100 rows.
unseen_predictions = predict_model(catboost, data=df_test)
unseen_predictions.head(n=100)

In [None]:
# Build the submission frame: one SalePrice column holding the model predictions.
# Newer PyCaret releases name the prediction column 'prediction_label' while older
# ones used 'Label', so pick whichever is present.
label_col = 'prediction_label' if 'prediction_label' in unseen_predictions.columns else 'Label'

# rename() returns a new frame; the result must be assigned, otherwise the written
# CSV keeps the original prediction column name instead of 'SalePrice'.
sample_submission = unseen_predictions[[label_col]].rename(columns={label_col: 'SalePrice'})

sample_submission.head()

In [None]:
# Write the submission frame (including its index) to a CSV file in the working directory.
sample_submission.to_csv(path_or_buf='sample_submission.csv')

