In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import warnings
# Ignore every warning raised from here on (no category, module or message
# filter is given, so the rule matches all warnings).
warnings.filterwarnings(action='ignore')

In [2]:
# Training set, plus a working copy so the raw frame stays untouched.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
train_c = train.copy()

# Test set, handled the same way.
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
test_c = test.copy()

# Submission template shipped alongside the data.
sample_submission = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')

In [3]:
# Display the (rows, columns) shape of the training copy.
train_c.shape


In [4]:
# Display the (rows, columns) shape of the test copy.
test_c.shape

Concatenating the test and train datasets (note: the next cell currently works with the training set only).

In [5]:
# `df` is a second name for the same DataFrame object as `train_c`
# (plain assignment, no copy is made).
df = train_c

In [6]:
# Display the dtype of every column in the working frame.
df.dtypes

**Data preprocessing**

First, we should deal with missing values to avoid the curse of dimensionality.

In [7]:
# Widen pandas' display limits so long summaries and wide frames are shown
# in full. set_option accepts several (option, value) pairs in one call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)


In [8]:
# Percentage of missing values per column: null count / row count * 100.
df.isna().sum().div(len(df)).mul(100)

In [9]:
# Boolean flag per column: True when at most 75% of its values are missing.
mask = df.isna().sum().div(len(df)).mul(100).le(75)
# Keep only the flagged columns (all rows).
reduced_df = df.loc[:, mask]
reduced_df.shape


We dropped PoolQC, MiscFeature, Alley and Fence features because they have more than 75% of missing values.

**Now we will select numerical and categorical features**

In [10]:
# Split the columns by dtype: object columns are treated as categorical,
# everything else as numerical.
cat_df = reduced_df.select_dtypes(include='object')
num_df = reduced_df.select_dtypes(exclude='object')

In [11]:
# Count the missing values in each categorical column.
cat_df.isnull().sum()

In [12]:
# Categorical columns whose missing entries are replaced by an explicit
# 'Not Given' category instead of being imputed.
columns_Not_Given = [
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'FireplaceQu',
    'GarageCond',
]
cat_df[columns_Not_Given] = cat_df[columns_Not_Given].fillna(value='Not Given')

In [13]:
# Columns whose missing entries are filled with their most frequent value.
columns_mod = ['Electrical']
# .mode() returns one row per tied mode; row 0 holds the first mode of each column.
cat_df[columns_mod] = cat_df[columns_mod].fillna(cat_df[columns_mod].mode().iloc[0])

In the next step we will deal with the **numerical** features below.

In [14]:
# Count the missing values in each numerical column.
num_df.isnull().sum()

In [15]:
# These two columns get their own median as the fill value ...
columns_median = ['GarageYrBlt', 'LotFrontage']
num_df[columns_median] = num_df[columns_median].fillna(num_df[columns_median].median())
# ... and whatever is still missing in the other numerical columns becomes 0.
num_df = num_df.fillna(value=0)

We will drop some categorical features that have low variance (almost all rows share a single category).

In [16]:
# Show the category frequencies of every categorical column, one table each.
for column_name, column in cat_df.items():
    display(column.value_counts())


In [17]:
# Categorical columns removed from the feature set.
to_drop = ['Utilities', 'Condition2', 'Heating', 'RoofMatl', 'Street']
cat_df = cat_df.drop(columns=to_drop)

In [18]:
# Pairwise correlations of the numerical features.
corr = num_df.corr()
plt.figure(figsize=(30, 30))
# Only coefficients with |r| > 0.5 are kept; the rest become NaN and are
# left blank in the heatmap.
sns.heatmap(corr.where(corr.abs() > 0.5), annot=True)

Some features are strongly correlated with each other, for instance 'GarageCars' and 'GarageArea': it is obvious that more cars can be parked in a larger area. Therefore, we only need one of these variables in our analysis (we can keep 'GarageCars' since its correlation with 'SalePrice' is higher).

'TotRmsAbvGrd' and 'GrLivArea' are another strongly correlated pair ("twin brothers" again).

'TotalBsmtSF' and '1stFlrSF' also seem to be strongly correlated. We can keep 'TotalBsmtSF'.

It seems that 'YearBuilt' is slightly correlated with 'GarageYrBlt'.

'OverallQual', 'GrLivArea' and 'TotalBsmtSF' are strongly correlated with 'SalePrice'

Let's plot these features to see their correlations.

In [19]:
# Features discussed above as strongly correlated, plus the target.
correleted_cols = ['GarageCars','GarageArea','TotRmsAbvGrd','GrLivArea','TotalBsmtSF','1stFlrSF','YearBuilt','GarageYrBlt','OverallQual','SalePrice']
# pairplot builds its own figure, so a preceding plt.figure(figsize=...) has
# no effect on it and only leaves an extra blank figure in the output.
# The grid is sized through `height` instead: 10 variables x 3 inches per
# facet gives the intended 30 x 30 inch figure.
sns.pairplot(num_df[correleted_cols], height=3)

Now it is time to encode categorical features.

In [20]:
# One-hot encode the categorical columns; `cat_df` is rebound to the encoded frame.
cat_df = pd.get_dummies(cat_df)

In [21]:
# Put the numerical and the encoded categorical columns side by side
# (rows are aligned on the index).
all_data = pd.concat((num_df, cat_df), axis='columns')
all_data.shape

**Modeling**

In [22]:
# Target, kept as a one-column DataFrame.
y = all_data[['SalePrice']]
# Features: everything except the target. The row identifier 'Id' is removed
# as well when present -- it carries no information about the house and must
# not be fed to the scaler/PCA/model as if it were a predictor.
# NOTE(review): assumes the Kaggle train.csv carries an 'Id' column; with
# errors='ignore' the drop is a no-op if it does not.
X = all_data.drop(columns=['SalePrice']).drop(columns=['Id'], errors='ignore')

In [23]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

**Scaling the data**

In [24]:
# Learn the scaling statistics on the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Standardise, then run a full PCA to inspect how much variance each
# component explains.
steps = [
    ('scaler', StandardScaler()),
    ('reducer', PCA()),
]
pipeline = Pipeline(steps=steps)
pipeline.fit(X)

# Cumulative share of variance explained by the first k components.
print(pipeline.named_steps['reducer'].explained_variance_ratio_.cumsum())

# Per-component explained-variance ratio.
plt.figure(figsize=(20, 20))
plt.plot(pipeline.named_steps['reducer'].explained_variance_ratio_)
plt.xticks(np.arange(0, 250, 5))
plt.show()

In [26]:
# Standardise and project onto the first 10 principal components.
# NOTE(review): the pipeline is fitted on all rows of X, i.e. before any
# train/test split of the projected data.
steps = [
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=10)),
]
pipeline = Pipeline(steps=steps)
X_pca = pipeline.fit(X).transform(X)

In [27]:
# Re-split, this time on the PCA-reduced features (same 80/20 ratio and seed);
# this rebinds X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=1,
)

In [28]:
# Fit an XGBoost regressor with default hyper-parameters on the training split.
xg_reg = xgb.XGBRegressor()
xg_reg.fit(X_train, y_train)

# Evaluate on the held-out split.
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the score has to be printed explicitly to show up in the output.
print("R^2: %f" % (xg_reg.score(X_test, y_test)))

# Last expression of the cell: display the full parameter set of the model.
xg_reg.get_params()

**Evaluating model quality**

In [29]:

# Wrap the PCA-reduced features and the target in XGBoost's DMatrix container.
all_data_dmatrix = xgb.DMatrix(data=X_pca, label=y)

# "reg:squarederror" is the current name of the squared-error regression
# objective; the former alias "reg:linear" is deprecated.
params = {"objective": "reg:squarederror", "max_depth": 5}

# 5-fold cross-validation over 13 boosting rounds, tracking RMSE.
cv_results = xgb.cv(
    dtrain=all_data_dmatrix,
    params=params,
    nfold=5,
    num_boost_round=13,
    metrics='rmse',
    as_pandas=True,
    seed=123,
)

print(cv_results)

# Mean test RMSE after the final boosting round.
print((cv_results["test-rmse-mean"]).tail(1))