In [2]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time ,random , datetime

from sklearn.preprocessing import LabelEncoder ,StandardScaler

from sklearn.model_selection import train_test_split , cross_val_score , KFold

from sklearn.metrics import mean_squared_error ,accuracy_score

from sklearn.linear_model import LinearRegression ,Lasso ,Ridge ,ElasticNet

from sklearn.svm import SVR
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor ,AdaBoostRegressor

from xgboost import XGBRegressor

**Print the paths of all available input files**

In [3]:
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [4]:
# Load the three competition CSVs from the shared dataset folder.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Stack train on top of test so cleaning/encoding is applied to both at once.
# Row labels are not reset, so they repeat across the two halves; later cells
# split the frame back apart by position (.iloc), not by label.
all_data = pd.concat([train_df, test_df], axis=0)


In [5]:
# Preview the first five rows of the training frame.
train_df.head(5)

In [6]:
# Column names, non-null counts and dtypes of the training frame.
train_df.info()

**Describe the numerical variables**

In [7]:
# Summary statistics of the numerical columns, transposed to one row per column.
num_summary = train_df.describe().T
all_num_col = num_summary.index  # names of all numerical variables
num_summary

**Describe the categorical variables**

In [8]:
# Summary (count / unique / top / freq) of the object-dtype columns.
cat_summary = train_df.describe(include=['O'])
all_cat_col = cat_summary.columns  # names of all categorical variables
cat_summary

**Count the numerical and the categorical variables**

In [9]:
# Report how many columns fall into each of the two groups found above.
n_num, n_cat = len(all_num_col), len(all_cat_col)
print(f'there are {n_num} numerical variable & {n_cat} categorical variable')

**The correlations between numerical variables**

In [10]:
# Annotated heatmap of the pairwise correlations between numerical columns.
# The frame is narrowed to numeric dtypes first: since pandas 2.0,
# DataFrame.corr() no longer drops string columns silently and raises instead.
plt.figure(figsize=[30, 30])
sns.heatmap(train_df.select_dtypes(include='number').corr(), cmap='vlag_r', annot=True)
plt.title('The correlations between numerical variables', size=30);

**View the numerical columns whose correlation with SalePrice is greater than 0.5**

In [11]:
# Correlation of every numerical column with SalePrice, computed once.
# Only numeric dtypes are passed to corr(): pandas >= 2.0 raises on string columns.
saleprice_corr = train_df.select_dtypes(include='number').corr()['SalePrice']

# Keep the columns correlated above 0.5 (SalePrice itself is included, at 1.0).
strong_corr = saleprice_corr[saleprice_corr > 0.5]
selceted_num_col = strong_corr.index
selceted_num_df = train_df[selceted_num_col]
strong_corr.sort_values(ascending=False)

In [12]:
# Display the training columns selected above (those correlated > 0.5 with SalePrice).
selceted_num_df

**Visualize the selected numerical variables vs. SalePrice**

In [13]:
# One regression scatter plot per selected predictor against SalePrice,
# laid out on a 5x2 grid (SalePrice itself is skipped).
plt.figure(figsize=(15, 20))
plt.subplots_adjust(hspace=.5, wspace=.3)
predictors = [col for col in selceted_num_df.columns if col not in ['SalePrice']]
for position, var in enumerate(predictors, start=1):
    plt.subplot(5, 2, position)
    sns.regplot(data=selceted_num_df, x=selceted_num_df[var], y=selceted_num_df.SalePrice)

**Visualize the distributions of the selected numerical columns**

In [14]:
# Histogram of every selected numerical column (SalePrice included) on a 6x2 grid.
# No `palette` is passed: without a `hue` variable seaborn ignores it
# (and newer versions emit a warning saying so).
plt.figure(figsize=(15, 20))
plt.subplots_adjust(hspace=.5, wspace=.3)
for i, feature in enumerate(selceted_num_col, start=1):
    plt.subplot(6, 2, i)
    sns.histplot(train_df[feature])

**Find the missing values in the selected numerical columns**

In [15]:
# Number of missing values in each of the selected numerical columns.
train_df[selceted_num_col].isna().sum()

**categorical columns**

In [16]:
# Summary of the categorical columns collected in all_cat_col.
train_df[all_cat_col].describe()

**Deal with the Missing Values in all the data**

In [17]:
# The 40 columns of the combined train+test frame with the most missing values.
all_data.isna().sum().sort_values(ascending=False)[:40]

In [18]:
# Impute the missing values of the combined train+test frame.
#
# Every column is reassigned (`all_data[col] = all_data[col].fillna(...)`) instead
# of calling `all_data.col.fillna(..., inplace=True)`: the chained in-place form
# acts on a temporary Series, triggers a chained-assignment warning in recent
# pandas, and no longer updates `all_data` once Copy-on-Write is enabled.

# Per data_description.txt, NA in these columns means the feature is absent
# (no pool, misc feature, alley, fence, fireplace, garage, basement or masonry veneer).
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
    'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]
for col in none_fill_cols:
    all_data[col] = all_data[col].fillna("none")

# Numeric counterparts of an absent garage / masonry veneer: NA means zero.
zero_fill_cols = ['GarageYrBlt', 'MasVnrArea', 'GarageCars', 'GarageArea']
for col in zero_fill_cols:
    all_data[col] = all_data[col].fillna(0)

# Group by neighborhood and fill missing LotFrontage with that group's median.
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

# Per data_description.txt, NA in Functional means 'Typ'.
all_data['Functional'] = all_data['Functional'].fillna('Typ')

# Remaining columns (only one or two missing values each): use the most common value.
mode_fill_cols = [
    'MSZoning', 'BsmtFullBath', 'BsmtHalfBath', 'Utilities', 'Exterior1st',
    'BsmtFinSF2', 'KitchenQual', 'BsmtUnfSF', 'BsmtFinSF1', 'TotalBsmtSF',
    'SaleType', 'Electrical', 'Exterior2nd',
]
for col in mode_fill_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])


In [19]:
# One-hot encode every object-dtype column of the combined frame.
Cat_columns = all_data.select_dtypes(include=['object']).columns
# Numeric feature columns (the target and the row identifier are excluded);
# these are the columns that get standardised later.
num_columns = all_data.select_dtypes(exclude=['object']).columns.drop(['SalePrice', 'Id'])
final_data = pd.get_dummies(all_data, columns=Cat_columns)

# Split back by position: the first len(train_df) rows are train, the rest test.
# Explicit copies are taken because later cells assign into these frames;
# assigning into a bare .iloc slice raises SettingWithCopyWarning.
n_train = train_df.shape[0]
clean_train_df = final_data.iloc[:n_train].copy()
clean_test_df = final_data.iloc[n_train:].copy()


In [20]:
# Make sure we got rid of all the missing data: show the five largest NA counts.
# (The test rows' SalePrice lives in clean_test_df, not in this frame.)
clean_train_df.isna().sum().sort_values(ascending=False)[:5] 

**Prepare the Target Variable for the Regression** (inspired by Serigne's kernel: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard/notebook#Stacked-Regressions-to-predict-House-Prices)

In [21]:
from scipy import stats

# Distribution of the sale price as currently stored in clean_train_df.
sns.histplot(x=clean_train_df['SalePrice']);

# Q-Q plot of the same values, drawn on a fresh figure.
fig = plt.figure()
res = stats.probplot(clean_train_df['SalePrice'], plot=plt)
plt.show()

**Log-transformation of the target variable**

In [22]:
# Log-transform the target, always starting from the untouched raw column in
# train_df, so re-running this cell can never apply the log twice.
# clean_train_df holds the train rows in the same order as train_df, so the
# values are matched by position (.to_numpy()). assign() returns a new frame,
# which avoids writing into a slice of final_data.
clean_train_df = clean_train_df.assign(SalePrice=np.log1p(train_df['SalePrice'].to_numpy()))

In [23]:
from scipy import stats

# Distribution of the log sale price.
# clean_train_df['SalePrice'] was already log1p-transformed in the previous
# cell, so it is plotted as-is; applying np.log1p again here would show
# log(log(price)) rather than the target the models are trained on.
sns.histplot(x=clean_train_df['SalePrice']);

# Q-Q plot of the log sale price on a fresh figure.
fig = plt.figure()
res = stats.probplot(clean_train_df['SalePrice'], plot=plt)
plt.show()

**Before modeling**

In [24]:
# Separate the features (X) from the target (y); Id is an identifier, not a feature.
target_col = "SalePrice"
X = clean_train_df.drop(columns=[target_col, 'Id'])
y = clean_train_df[target_col]
print(f"the X shape is {X.shape} & the y shape is {y.shape}")



In [25]:
# Standardise the numeric training features.
# The scaler is fitted on, and X is filled from, the unscaled columns of
# clean_train_df rather than X itself. Re-running this cell therefore gives the
# same scaler and the same X, instead of refitting on already-scaled values.
scaler = StandardScaler().fit(clean_train_df[num_columns])
X[num_columns] = scaler.transform(clean_train_df[num_columns])
X.head(5)


In [26]:

# Scale the test data.
# Rebuilt from final_data (rows after the train block) so the cell can be re-run:
# dropping 'Id'/'SalePrice' from an already-stripped clean_test_df raises KeyError.
clean_test_df = final_data.iloc[train_df.shape[0]:].drop(['Id', 'SalePrice'], axis=1)
# Use transform(), not fit_transform(): the test features must be scaled with the
# mean/std learned from the training data, otherwise the models see test inputs
# on a different scale from the one they were trained on.
clean_test_df[num_columns] = scaler.transform(clean_test_df[num_columns])
clean_test_df.head(5)

In [27]:
# Train/validation split: hold out 20% of the labelled rows for evaluation.
# random_state is fixed so the split, and every score computed from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'the shape of X_train is {X_train.shape}')
print(f'the shape of X_test is {X_test.shape}')
print(f'the shape of y_train is {y_train.shape}')
print(f'the shape of y_test is {y_test.shape}')

# Machine Learning Models


In [28]:
def evaluation(y_hat, predictions):
    """Return the root-mean-squared error between two sets of values.

    Both arguments are passed straight to sklearn's ``mean_squared_error``
    (``y_hat`` first, ``predictions`` second) and the square root of the
    result is returned.
    """
    return np.sqrt(mean_squared_error(y_hat, predictions))

In [29]:
# Gradient-boosted trees: many shallow trees with a small learning rate and
# aggressive row/column subsampling.
model_xgb = XGBRegressor(
    colsample_bytree=0.2,
    gamma=0.0,
    learning_rate=0.01,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=7200,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.2,
)
model_xgb.fit(X_train, y_train)

# Evaluate on the held-out 20% split. The target is log1p(SalePrice), so this is
# an RMSE in log space. It is NOT a training-set score.
y_pred = model_xgb.predict(X_test)
rmse = evaluation(y_test, y_pred)

print(f"XGBoost RMSE on validation set: {rmse} ")

In [57]:
# L1-regularised linear regression on the same split.
regr = Lasso(alpha=0.00099, max_iter=50000)
regr.fit(X_train, y_train)

# Evaluate on the held-out 20% split (log-space RMSE), not on the training rows.
y_pred = regr.predict(X_test)
rmse = evaluation(y_test, y_pred)
print("Lasso RMSE on validation set: ", rmse)

# Submission

In [30]:

# Show the first row of the sample submission to see the expected columns.
sample_submission_df.head(1)

In [62]:
# Predict log1p(SalePrice) for the competition test rows with both models.
predictions_xgb = model_xgb.predict(clean_test_df)
predictions_las = regr.predict(clean_test_df)

# NOTE: sample_submission.csv is a sample file, not the ground truth for the
# test rows (presumably placeholder/benchmark prices - confirm on the competition
# page). These numbers only measure the distance from that sample, so they are
# labelled accordingly rather than as a "test set" score.
rmse_xgb = evaluation(sample_submission_df.SalePrice, np.expm1(predictions_xgb))
rmse_las = evaluation(sample_submission_df.SalePrice, np.expm1(predictions_las))
print(f"XGBoost RMSE vs. sample submission (not a true test score): {rmse_xgb} ")
print(f"Lasso RMSE vs. sample submission (not a true test score): {rmse_las} ")

# Blend the two models by averaging in log space, then map back to prices.
predictions = np.expm1((predictions_xgb + predictions_las) / 2)

# Build the submission in one step; Ids come from the test rows of final_data.
submission = pd.DataFrame({
    'Id': final_data.iloc[train_df.shape[0]:]['Id'].to_numpy(),
    'SalePrice': predictions,
})

In [63]:
# Preview the first rows of the submission frame (Id, SalePrice).
submission.head()

The submission has the same format as the sample submission.

In [55]:
# Write the submission to CSV (without the index column) for upload.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print('Submission CSV is ready!')