In [35]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory lazily and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [36]:
# Load the competition's training and test CSV files into pandas DataFrames.
original_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
original_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")



Always work on a copy of the dataframe

In [37]:
# Working copies without the "Id" column; the original frames stay untouched.
train_copy, test_copy = (
    raw.drop(columns=['Id']).copy() for raw in (original_train, original_test)
)

# Drop columns with lots of null values

In [38]:
# Calculate the missing values in each feature
#train_copy.isnull().sum()


# Percentage of missing values per column, computed from the frame's own row
# count (isnull().mean()) instead of a hard-coded number of rows.
# Columns with at least 40% missing values are saved in a list to drop them later.

missing_pct = train_copy.isnull().mean() * 100
nullColumns = missing_pct[missing_pct >= 40].index.tolist()



In [39]:
# Show the columns selected for removal.
nullColumns

In [40]:
# Remove the sparsely populated columns from both the training and the test frame.
train_copy, test_copy = (
    frame.drop(columns=nullColumns) for frame in (train_copy, test_copy)
)

# Impute missing values


In [41]:
def impute(df):

    """
    Fill missing values in place and return the same frame.

    int64/float64 columns get 0, object columns get the string "None".
    """
    # (dtype selection, replacement value) pairs, applied in this order
    fill_plan = (
        (['int64', 'float64'], 0),
        ("object", "None"),
    )
    for dtypes, filler in fill_plan:
        for name in df.select_dtypes(include=dtypes).columns:
            df[name] = df[name].fillna(filler)
    return df

In [42]:
# Fill the missing values of the training frame.
train_copy = impute(train_copy)


In [43]:
# Fill the missing values of the test frame with the same rules.
test_copy = impute(test_copy)

# Establish a Baseline

Judge your feature engineering based on the score from the Baseline

In [44]:
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import GradientBoostingRegressor as XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

In [45]:
def score(df,y_transformed = False):

    """
    Cross-validated RMSE of a gradient-boosting model on the (log) sale price.

    Called after each feature-engineering step to judge whether the step helped.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing the "SalePrice" target column. Missing
        values are filled in place by `impute`, so the caller's frame is
        modified.
    y_transformed : bool, default False
        Pass True when "SalePrice" has already been transformed; otherwise
        `np.log1p` is applied to the target here.

    Returns
    -------
    float
        Square root of the mean 5-fold cross-validated squared error.
        The value is also printed.
    """

    # if null values exist in the dataset an error will be raised,
    # so the frame is always imputed first
    df = impute(df)

    X = df.copy()
    y = X.pop("SalePrice")

    # once the target has been transformed elsewhere it must not be logged again
    log_y = y if y_transformed else np.log1p(y)

    # label-encode the string columns with their category codes
    for colname in X.select_dtypes(["object"]):
        X[colname] = X[colname].astype('category').cat.codes

    model = XGBRegressor()

    fold_errors = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    # the scorer returns negated MSE values; flip the sign before the root
    rmse = np.sqrt(-1 * fold_errors.mean())

    print("Baseline score is : ",rmse)
    return rmse

In [46]:
# Baseline: cross-validated error on the imputed training data, before any feature engineering.
score(train_copy)



# Apply Pearson Correlation test for numeric variables

In [47]:
# Check the variance of the numerical features.
# A column with zero variance has only one value and would be a candidate for dropping;
# this cell only displays the variances.

train_copy.select_dtypes(include = ['int64','float64']).var()

In [48]:
# plot the correlation matrix

import matplotlib.pyplot as plt
import seaborn as sns


# Pearson correlation is only defined for numeric columns, so restrict the frame
# explicitly; recent pandas versions raise on string columns instead of skipping them.
corr = train_copy.select_dtypes(include="number").corr()
plt.figure(figsize=(25,25))
sns.heatmap(corr, annot = True)
plt.show()

# Check Multicollinearity

In [49]:
# this function creates a set of column names with multi-correlations 

def correlation_calculate(dataframe, threshold=0.7):
    """
    Return the names of numeric columns that are highly correlated with an earlier column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Non-numeric columns are ignored.
    threshold : float, default 0.7
        A column is reported when the absolute Pearson correlation with any
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        For every correlated pair, the name of the later of the two columns.
    """
    # corr() is restricted to numeric columns; string columns would raise on recent pandas
    coor_matrix = dataframe.select_dtypes(include="number").corr()
    column_names = coor_matrix.columns

    correlated_features = set()
    for x in range(len(column_names)):
        # only the lower triangle is scanned, so each pair is visited once
        for y in range(x):
            if abs(coor_matrix.iloc[x,y]) > threshold:
                correlated_features.add(column_names[x])

    return correlated_features

In [50]:
# Names of the columns flagged as highly correlated with another column.

correlation_calculate(train_copy)

In [51]:
# "1stFlrSF" is correlated with "TotalBsmtSF"  0.82

# "GarageArea" is correlated with "GarageCars"  0.88

# "GarageYrBlt" is correlated with "YearBuilt"  0.83

# "GrLivArea" is correlated with "TotRmsAbvGrd"  0.83

# drop "TotalBsmtSF" since "1stFlrSF" is handy

# drop "GarageArea" since it is less correlated with the target

# drop "GarageYrBlt" since it is less correlated with the target

# drop "TotRmsAbvGrd" since it is less correlated with the target



from each pair, we drop the column that is less correlated with the target,
If the correlation with the target is equal for both the columns, we rely on domain knowledge

In [52]:
# Show the value of the error with three of the highly correlated columns left out.
# drop() returns a new frame here, so train_copy itself keeps these columns.

score(train_copy.drop(["TotalBsmtSF","GarageArea","TotRmsAbvGrd"],axis = 1))



Notice that dropping the correlated columns barely decreased the error (or did not change it at all), so we keep them and fix multicollinearity later using regularization.


# Apply the chi-square test and Cramér's V test to the categorical features

In [53]:
# ordinal_features = ['LotShape','LandContour','LandSlope','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','HeatingQC','KitchenQual','GarageFinish','GarageQual','GarageCond']
# nominal_features = [column for column in train_copy.columns if column not in ordinal_features and train_copy[column].dtypes=='object']

In [54]:
import scipy.stats as stats

# if you use a data frame with null values, it will raise an error

def chi_square_test(dataframe):

    """
    Print Cramér's V for every pair of categorical (object) columns whose
    chi-square test of independence is both significant and reliable.

    A pair is reported when:
      * the chi-square p-value is below 0.05,
      * the smallest expected frequency is at least 1, and
      * at most 20% of the contingency-table cells have an expected frequency below 5.

    Each unordered pair is tested once and a column is never tested against
    itself. The frame must not contain null values. Nothing is returned.
    """
    from itertools import combinations

    categorical_columns = list(dataframe.select_dtypes('object').columns)

    for column1, column2 in combinations(categorical_columns, 2):

        # create a contingency table between the two features
        myCrosstable = pd.crosstab(index = dataframe[column1],columns = dataframe[column2])

        # a feature with a single category cannot be associated with anything
        # (and would make the Cramér's V denominator zero)
        minDim = min(myCrosstable.shape)-1
        if minDim < 1:
            continue

        # chi-square value, p-value, degrees of freedom and expected frequencies
        (chiVal, pVal, dof, exp) = stats.chi2_contingency(myCrosstable)

        # only proceed when the dependence is significant
        if pVal >= 0.05:
            continue

        # reliability check on the expected frequencies, counted over ALL cells
        # of the table (exp.size), not over its rows
        cells_below_five = (exp < 5).sum()
        if exp.min() < 1 or cells_below_five * 100 > 20 * exp.size:
            continue

        n = np.sum(np.array(myCrosstable))
        association_strength = np.sqrt((chiVal/n)/(minDim))
        print("The association strength between {myField1} and {myField2} is : ".format(myField1 = column1, myField2 = column2),association_strength)
                

# Run the association test on the categorical columns of the training data.
chi_square_test(train_copy)

In [55]:
# A Cramér's V value between 0.4 and 1.00 indicates a strong association.
# For two columns with an association strength in that range, we either drop one of them or apply some transformation to the features.
# As you can see, no strong associations are present between the columns.


# Now we perform the Mutual Information utility metric for feature selection

In [56]:
from sklearn.feature_selection import mutual_info_regression


In [57]:

def mutual_info_calculator(df):
    """
    Print the mutual information between each feature and the target variable.

    The frame must contain the "SalePrice" column; it is used as the target
    and is not scored against itself. Object columns are integer-encoded with
    `factorize` first. One "<feature> <score>" line is printed per feature,
    lowest score first. The input frame is not modified and nothing is returned.
    """

    X = df.copy()
    # separate the target so it is not evaluated as a feature of itself
    y = X.pop('SalePrice')

    for colname in X.select_dtypes("object"):
        X[colname], _ = X[colname].factorize()

    # integer-typed columns are treated as discrete by the estimator
    discrete_features = X.dtypes == int

    # fixed seed: the estimator is randomized, so repeated runs would otherwise differ
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=True)

    # iterate by (label, value); integer position lookups on a label-indexed Series are deprecated
    for feature, value in mi_scores.items():
        print(feature, value)

In [58]:
# Print the mutual-information score of every feature of the training data.
mutual_info_calculator(train_copy)

In [59]:
# Columns with a mutual-information value equal or very close to zero are independent
# of the target and therefore uninformative for us, so we drop them.
train_copy = train_copy.drop(
    columns=['Utilities', 'MiscVal', 'MoSold', 'Street', 'YrSold', 'PoolArea']
)

In [60]:
# Drop the same uninformative columns from the test frame.
test_copy = test_copy.drop(
    columns=['Utilities', 'MiscVal', 'MoSold', 'Street', 'YrSold', 'PoolArea']
)

In [61]:
# Check the score again now that the uninformative columns are gone.
score(train_copy)

# Check for data normality

In [62]:
import seaborn as sns


In [63]:
# Plot a histogram (with a kernel density estimate) of the target variable to
# check whether it is normally distributed or not.
# `sns.distplot` is deprecated in seaborn; `histplot` is its replacement.

sns.histplot(train_copy['SalePrice'], kde=True, stat="density");


notice that the target variable is positively skewed

In [64]:
# Now we check the skewness of the numeric (int64/float64) features.

train_copy.select_dtypes(include = ['int64','float64']).skew()

# Fix data skewness

In [65]:
## Import necessary modules 

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

In [66]:
# `MSSubClass` is read as an integer column but is really a nominal category,
# so its numeric codes are replaced with descriptive labels.
subclass_labels = {
    20: '1-STORY 1946 & NEWER ALL STYLES',
    30: '1-STORY 1945 & OLDER',
    40: '1-STORY W/FINISHED ATTIC ALL AGES',
    45: '1-1/2 STORY - UNFINISHED ALL AGES',
    50: '1-1/2 STORY FINISHED ALL AGES',
    60: '2-STORY 1946 & NEWER',
    70: '2-STORY 1945 & OLDER',
    75: '2-1/2 STORY ALL AGES',
    80: 'SPLIT OR MULTI-LEVEL',
    85: 'SPLIT FOYER',
    90: 'DUPLEX - ALL STYLES AND AGES',
    120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
    150: '1-1/2 STORY PUD - ALL AGES',
    160: '2-STORY PUD - 1946 & NEWER',
    180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
    190: '2 FAMILY CONVERSION - ALL STYLES AND AGES',
}
edit_feature = {"MSSubClass": subclass_labels}

train_copy, test_copy = (
    frame.replace(edit_feature) for frame in (train_copy, test_copy)
)



In [67]:
def view_skew(dataframe):
    """
    Decide, for every int64/float64 column, which transform leaves the smallest
    absolute skew, print how many columns each transform won, and return the
    column names grouped by winning transform.

    Returns four lists, in this order: log, square root, reciprocal, exponential
    (fifth root). On a tie the earlier transform in that order wins.
    """
    # winning columns per transform; insertion order doubles as the tie-break order
    winners = {'log': [], 'squareroot': [], 'recprocal': [], 'exponential': []}

    for column in dataframe.select_dtypes(include = ['int64','float64']):
        values = dataframe[column]

        # absolute skew of the column after each candidate transform
        skew_after = {
            'log': abs(np.log1p(values).skew()),
            'squareroot': abs(np.sqrt(values).skew()),
            'recprocal': abs((1/(values+1)).skew()),
            'exponential': abs((values**(1/5)).skew()),
        }

        # min() keeps the first key among equal minima, i.e. 'log' wins ties
        best = min(skew_after, key=skew_after.get)
        winners[best].append(column)

    # print the count of features whose skew each transform minimized
    print('count of features where log performed better :',len(winners['log']))
    print('count of features where square root performed better :',len(winners['squareroot']))
    print('count of features where reciprocal performed better :',len(winners['recprocal']))
    print('count of features where exponential performed better :',len(winners['exponential']))

    return winners['log'], winners['squareroot'], winners['recprocal'], winners['exponential']

In [68]:
# Group the numeric training columns by the transform that minimizes their absolute skew.
log_cols, squareroot_cols, recprocal_cols, exponential_cols = view_skew(train_copy.copy())

In [69]:
def transform(df,features,transform ):

    """
    Apply one skew-fixing transform to the given columns, in place, and return the frame.

    `transform` selects the operation: 'sqrt' (square root), 'log' (log1p),
    'expo' (fifth root); any other value applies the reciprocal 1/(x+1).
    """
    def fifth_root(values):
        return values**(1/5)

    def reciprocal(values):
        return 1/(values+1)

    # pick the operation first, then apply it once
    if transform == 'sqrt':
        operation = np.sqrt
    elif transform =='log':
        operation = np.log1p
    elif transform =='expo':
        operation = fifth_root
    else:
        operation = reciprocal

    df[features] = operation(df[features])
    return df

In [70]:
# Apply to each group of training columns the transform that won for it.
train_transform_plan = (
    (log_cols, 'log'),
    (squareroot_cols, 'sqrt'),
    (recprocal_cols, 'recipro'),
    (exponential_cols, 'expo'),
)
for columns, kind in train_transform_plan:
    train_copy = transform(train_copy, columns, kind)

In [71]:
# Apply to the test set exactly the transforms that were chosen on the training set.
# The column groups come straight from view_skew() rather than from a hand-typed
# list, so train and test cannot drift apart. Columns that do not exist in the
# test frame (the "SalePrice" target) are skipped.
test_transform_plan = (
    (exponential_cols, 'expo'),
    (log_cols, 'log'),
    (squareroot_cols, 'sqrt'),
    (recprocal_cols, 'recipro'),
)
for columns, kind in test_transform_plan:
    shared_columns = [column for column in columns if column in test_copy.columns]
    if shared_columns:
        test_copy = transform(test_copy, shared_columns, kind)

In [72]:
# Check the score. y_transformed=True because the numeric columns, including the
# "SalePrice" target, went through the skew transforms above, so score() must not
# log the target again.
score(train_copy,True)

# Create features with Principal Component Analysis

In [73]:
# First we take the highest MI-scoring numeric features; these are the inputs to the PCA below.
features = ['GarageCars','LotArea','TotalBsmtSF','YearBuilt','GrLivArea' ,'GarageArea']

In [74]:
from sklearn.decomposition import PCA


In [75]:
def create_PC(df,features,standarize = True):
    """
    Create principal-component features from the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    features : list of str
        Columns to decompose.
    standarize : bool, default True
        When True, each column is centred and divided by its standard
        deviation before the decomposition; when False the raw values are used.

    Returns
    -------
    X_pca : pandas.DataFrame
        Component scores in columns "PC1", "PC2", ..., indexed like `df` so
        that they can be joined back onto it.
    loadings : pandas.DataFrame
        Rows are the original features, columns are the principal components.
    """

    X = df.loc[:, features]

    # Standardize on request; otherwise use the columns as they are
    # (X_scaled must be defined on both paths).
    if standarize:
        X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
    else:
        X_scaled = X

    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)

    # Convert to dataframe, keeping the source index for a safe join
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)

    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return X_pca, loadings
    

In [76]:
# Principal-component scores and loadings of the selected training features.
X_pca, loadings = create_PC(train_copy,features)

In [77]:
# Component scores: one row per training example, one column per principal component.
X_pca

In [78]:
# Loadings: rows are the original features, columns are the principal components.
loadings

From the first principal component (PC1), notice that it describes some kind of an 'overall size' feature where all the features have a positive sign, and particularly the garage capacity in car is positively related to the garage area

The second principal component (PC2), indicates a contrast in the garage area between properties having a large above ground finished area and large lot area in comparison with properties having relatively small total finished basement area and built earlier 

I'm going to ignore the remaining four principal components, since their values are near zero and they don't show any clear variation.

In [79]:
# Now we add the new features.

# Inspired by the PC1 loadings: garage capacity multiplied by garage area.
train_copy['PCA_feature'] = train_copy['GarageCars'] * train_copy['GarageArea']

# Append the principal-component scores as additional columns.
train_copy = train_copy.join(X_pca)

In [80]:
# Project the test rows onto the principal axes learned from the TRAINING data.
# Fitting a second, independent PCA on the test set would give components whose
# orientation need not match the training ones, so the PC columns would not
# describe the same thing in the two frames.
pca_train_features = train_copy.loc[:, features]
pca_mean = pca_train_features.mean(axis=0)
pca_std = pca_train_features.std(axis=0)
test_pca_model = PCA().fit((pca_train_features - pca_mean) / pca_std)

# the test rows are scaled with the training statistics before projection
test_scaled = (test_copy.loc[:, features] - pca_mean) / pca_std
X_pca = pd.DataFrame(
    test_pca_model.transform(test_scaled),
    columns=[f"PC{i+1}" for i in range(test_pca_model.n_components_)],
    index=test_copy.index,
)

test_copy['PCA_feature'] = (test_copy.GarageCars)*(test_copy.GarageArea)

# use components
test_copy = test_copy.join(X_pca)

In [81]:
# Score with the PCA-derived columns added.
score(train_copy,True)

# Now we encode the categorical features

In [82]:
# recall that we dropped the following columns:

# ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature',
#'Utilities','MiscVal','MoSold','Street','YrSold','PoolArea']


In [83]:
# Now we take a look at the dataset and decide which features are to be nominally or ordinally encoded.
# The nominal features are listed by hand; they are one-hot encoded further below.

nominal_features = [ "MSSubClass","MSZoning", "LotConfig",
                "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
                "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", 
                "Foundation", "Heating", "CentralAir", "GarageType", "SaleType", "SaleCondition","Functional"]

# every remaining object column is treated as ordinal
ordinal_features = [column for column in train_copy.select_dtypes(include= ['object']).columns if column not in nominal_features ]

In [84]:
# Before ordinal encoding, print the distinct values found in each ordinal feature.
for column in ordinal_features:
    print(column, train_copy[column].unique())
    

### ordinal encoding

In [85]:
# Rank of every level of each ordinal feature: a higher number means a
# better/more regular level, and 0 is the "None" placeholder written by the
# imputation step for missing values.
quality_scale = {'Ex':5,'Gd':4 ,'TA' :3,'Fa':2,'Po':1,'None':0}
basement_finish_scale = {'GLQ':6, 'ALQ':5,'BLQ':4,'Rec':3, 'LwQ':2 , 'Unf':1,'None':0}

categories = {'LotShape': {'Reg':4 ,'IR1':3, 'IR2':2 ,'IR3':1,'None':0},
              'LandContour':{'Lvl':4, 'Bnk':3, 'HLS':2,'Low':1,'None':0},
              'LandSlope':{'Gtl':3 ,'Mod':2, 'Sev':1,'None':0},
              'ExterQual':dict(quality_scale),
              'ExterCond':dict(quality_scale),
              'BsmtQual':dict(quality_scale),
              'BsmtCond':dict(quality_scale),
              'BsmtExposure':{'Gd':4,'Av':3,'Mn':2,'No':1,'None':0 },
              'BsmtFinType1':dict(basement_finish_scale),
              'BsmtFinType2':dict(basement_finish_scale),
              'HeatingQC':dict(quality_scale),
              'KitchenQual':dict(quality_scale),
              'GarageQual':dict(quality_scale),
              'GarageCond':dict(quality_scale),
              'PavedDrive':{'Y':3,'P':2,'N':1,'None':0},
              'Electrical':{'SBrkr':5,'FuseA':4,'FuseF':3,'FuseP':2,'Mix':1,'None':0},
              # finished > rough finished > unfinished > no garage; the scale must
              # be monotonic for the encoding to be ordinal
              'GarageFinish':{'Fin':3, 'RFn':2 ,'Unf':1, 'None':0}}


def ordinal_encode(df):
    """
    Encode the ordinal categorical features as integers.

    Every column named in the module-level `categories` mapping has its levels
    replaced by their rank; other columns are left as they are. A new frame is
    returned and the input frame is not modified.
    """
    df = df.replace(categories)
    return df

In [86]:
# Replace the ordinal levels with their integer ranks in both frames.
train_copy = ordinal_encode(train_copy)
test_copy = ordinal_encode(test_copy)

### nominal encoding

In [87]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': categories not seen while fitting do not raise at transform time.
ohe = OneHotEncoder(handle_unknown = 'ignore')

In [89]:
# Learn the nominal categories from the training data and encode it.
ohe.fit(train_copy[nominal_features])

encoded = ohe.transform(train_copy[nominal_features]).toarray()

# `get_feature_names` was removed from scikit-learn's encoders in favour of
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(ohe, "get_feature_names_out"):
    feature_names = ohe.get_feature_names_out(nominal_features)
else:
    feature_names = ohe.get_feature_names(nominal_features)

# Keep the non-object columns and append the indicator columns; the indicator
# frame reuses the training index so that concat aligns the rows by label.
train_copy = pd.concat([train_copy.select_dtypes(exclude='object'),
               pd.DataFrame(encoded,columns=feature_names,index=train_copy.index).astype(int)], axis=1)

In [90]:
# (rows, columns) of the training frame after encoding.
train_copy.shape

In [91]:
# Encode the test set with the encoder fitted on the training data; the indicator
# column names computed for the training frame are reused.
encoded = ohe.transform(test_copy[nominal_features]).toarray()

test_indicator_columns = pd.DataFrame(encoded, columns=feature_names).astype(int)
test_other_columns = test_copy.select_dtypes(exclude='object')
test_copy = pd.concat([test_other_columns, test_indicator_columns], axis=1)

In [92]:
# (rows, columns) of the test frame after encoding.
test_copy.shape

# Use mathematical relations to create new features

In [96]:
# Scatter SalePrice against OverallCond, OverallQual and their sum.
# Every panel gets axis labels so the figure can be read on its own.
overall_panels = [
    ('OverallCond', train_copy['OverallCond'], 'red'),
    ('OverallQual', train_copy['OverallQual'], 'blue'),
    ('OverallQual + OverallCond', train_copy['OverallQual'] + train_copy['OverallCond'], 'black'),
]

fig, axes = plt.subplots(1, 3, figsize = (12, 8))
for ax, (label, values, color) in zip(axes, overall_panels):
    ax.scatter(values, train_copy['SalePrice'], color = color)
    ax.set_xlabel(label)
    ax.set_ylabel('SalePrice')

plt.show()

No clear relation here, so it's better not to touch those features.

In [97]:
# Scatter SalePrice against GarageQual, GarageCond and their sum.
# Every panel gets axis labels so the figure can be read on its own.
garage_panels = [
    ('GarageQual', train_copy['GarageQual'], 'green'),
    ('GarageCond', train_copy['GarageCond'], 'black'),
    ('GarageCond + GarageQual', train_copy['GarageCond'] + train_copy['GarageQual'], 'red'),
]

fig, axes = plt.subplots(1, 3, figsize = (12, 8))
for ax, (label, values, color) in zip(axes, garage_panels):
    ax.scatter(values, train_copy['SalePrice'], color = color)
    ax.set_xlabel(label)
    ax.set_ylabel('SalePrice')

plt.show()

In [98]:
# Replace GarageCond and GarageQual by their sum, in both frames.
for frame in (train_copy, test_copy):
    garage_total = frame['GarageCond'] + frame['GarageQual']
    frame.insert(1, 'GarageCond/Qual', garage_total)
    frame.drop(columns = ['GarageCond','GarageQual'], inplace = True)

In [99]:
# Score with the combined garage condition/quality feature.
score(train_copy,True)

In [100]:
# Scatter SalePrice against LotArea, GrLivArea and the living-area-to-lot-area ratio.
# Every panel gets axis labels so the figure can be read on its own.
area_panels = [
    ('LotArea', train_copy['LotArea'], 'green'),
    ('GrLivArea', train_copy['GrLivArea'], 'black'),
    ('GrLivArea / LotArea', train_copy['GrLivArea'] / train_copy['LotArea'], 'red'),
]

fig, axes = plt.subplots(1, 3, figsize = (12, 8))
for ax, (label, values, color) in zip(axes, area_panels):
    ax.scatter(values, train_copy['SalePrice'], color = color)
    ax.set_xlabel(label)
    ax.set_ylabel('SalePrice')

plt.show()

In [101]:
# Add the living-area-to-lot-area ratio as the second column of both frames.
for frame in (train_copy, test_copy):
    frame.insert(1, 'LivLotRatio', frame['GrLivArea'] / frame['LotArea'])




In [102]:
# Score with the LivLotRatio feature added.
score(train_copy,True)

In [103]:
# Scatter SalePrice against 2ndFlrSF, 1stFlrSF and the floor area per room.
# The ratio is wrapped in parentheses: a statement cannot continue on a new line
# that starts with "/" unless it is inside brackets.
area_per_room = (
    (train_copy['1stFlrSF'] + train_copy['2ndFlrSF'])
    / (train_copy['TotRmsAbvGrd'] + train_copy['FullBath'] + train_copy['HalfBath'])
)
room_panels = [
    ('2ndFlrSF', train_copy['2ndFlrSF'], 'green'),
    ('1stFlrSF', train_copy['1stFlrSF'], 'black'),
    ('(1stFlrSF + 2ndFlrSF) / (TotRmsAbvGrd + FullBath + HalfBath)', area_per_room, 'red'),
]

fig, axes = plt.subplots(1, 3, figsize = (12, 8))
for ax, (label, values, color) in zip(axes, room_panels):
    ax.scatter(values, train_copy['SalePrice'], color = color)
    ax.set_xlabel(label)
    ax.set_ylabel('SalePrice')

plt.show()

In [104]:
# Add the floor area per room (rooms above ground plus full and half baths)
# as the second column of both frames.
for frame in (train_copy, test_copy):
    floor_area = frame['1stFlrSF'] + frame['2ndFlrSF']
    room_count = frame['TotRmsAbvGrd'] + frame['FullBath'] + frame['HalfBath']
    frame.insert(1, 'AreaRoomsRatio', floor_area / room_count)


In [105]:
# Score with the AreaRoomsRatio feature added.
score(train_copy,True)

# K-means for feature creation

In [106]:
from sklearn.cluster import KMeans

In [107]:
def cluster_labels(df,features,n_clusters = 20):
    """
    Add a "Cluster" column holding the k-means label of every row.

    The selected features are standardized (mean removed, divided by the
    standard deviation) before clustering. The column is written onto `df`
    itself, which is also returned.
    """
    selected = df.loc[:,features]
    standardized = (selected - selected.mean(axis=0)) / selected.std(axis=0)

    # fit k-means on the standardized features and keep each row's label
    model = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    df["Cluster"] = model.fit_predict(standardized)

    return df

In [108]:
Area_features = ['TotalBsmtSF','LotArea','GrLivArea','1stFlrSF','2ndFlrSF']

# Learn the scaling statistics and the clusters on the training data only, then
# assign the test rows to those same clusters. Clustering the test set on its
# own would produce label numbers unrelated to the training labels, so the
# "Cluster" column would mean different things in the two frames.
# The KMeans settings mirror the defaults used by cluster_labels().
area_train = train_copy.loc[:, Area_features]
area_mean = area_train.mean(axis=0)
area_std = area_train.std(axis=0)

area_kmeans = KMeans(n_clusters=20, n_init=50, random_state=0)
train_copy["Cluster"] = area_kmeans.fit_predict((area_train - area_mean) / area_std)
test_copy["Cluster"] = area_kmeans.predict(
    (test_copy.loc[:, Area_features] - area_mean) / area_std
)



In [109]:
# Preview the first rows, now including the "Cluster" column.
train_copy.head()

In [110]:
# Score with the cluster-label feature added.
score(train_copy,True)

In [111]:
# now we create features for distance from each cluster:

def cluster_distance(df, features, n_clusters=20):
    """
    Add one "Centroid_<i>" column per cluster with each row's distance to that
    cluster's centre.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    features : list of str
        Columns to cluster on; they are standardized first.
    n_clusters : int, default 20
        Number of k-means clusters, and therefore of distance columns.

    Returns
    -------
    pandas.DataFrame
        `df` joined with the distance columns.
    """
    # standardize the data
    X_scaled = df.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)

    # honour the requested number of clusters
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)

    # fit_transform returns the distance of every row to every centroid
    distances = kmeans.fit_transform(X_scaled)

    # label the features and index them like `df` so the join aligns row by row
    df_cd = pd.DataFrame(
        distances,
        columns=[f"Centroid_{i}" for i in range(distances.shape[1])],
        index=df.index,
    )

    return df.join(df_cd)

In [112]:
# Distances are measured to centroids fitted on the TRAINING data only, and the
# test rows are scaled with the training statistics. Fitting a second k-means on
# the test set would give centroids that do not correspond to the training ones,
# so "Centroid_<i>" would not refer to the same cluster in both frames.
# The KMeans settings mirror those used inside cluster_distance().
distance_train = train_copy.loc[:, Area_features]
distance_mean = distance_train.mean(axis=0)
distance_std = distance_train.std(axis=0)
distance_train_scaled = (distance_train - distance_mean) / distance_std
distance_test_scaled = (test_copy.loc[:, Area_features] - distance_mean) / distance_std

distance_kmeans = KMeans(n_clusters=20, n_init=50, random_state=0).fit(distance_train_scaled)
centroid_columns = [f"Centroid_{i}" for i in range(distance_kmeans.n_clusters)]

train_copy = train_copy.join(pd.DataFrame(
    distance_kmeans.transform(distance_train_scaled),
    columns=centroid_columns, index=train_copy.index,
))
test_copy = test_copy.join(pd.DataFrame(
    distance_kmeans.transform(distance_test_scaled),
    columns=centroid_columns, index=test_copy.index,
))



In [113]:
# Preview the first rows, now including the "Centroid_<i>" distance columns.
train_copy.head()

In [114]:
# Score with the cluster-distance features added.
score(train_copy,True)

# Regression Models Development

In [115]:
# import the necessary modules

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics


### Use Gradient Boosting (scikit-learn's `GradientBoostingRegressor`, imported above under the alias `XGBRegressor`)

In [116]:
# create a standard scaler object
scaler = StandardScaler()

# create an ensemble regressor model object
XGBRmodel =XGBRegressor()


X = train_copy.drop(columns = ["SalePrice"])
Y = train_copy.SalePrice

# create a pipeline
XGBRpipe = make_pipeline(scaler, XGBRmodel) # standardizes the data and fits the model with it

# The result is stored under its own names: assigning it to `score` would
# overwrite the score() helper defined earlier and break any later call to it.
cv_errors = cross_val_score(XGBRpipe, X, Y, cv=5,scoring="neg_mean_squared_error")

# the scorer returns negated MSE values; flip the sign before taking the root
cv_rmse = np.sqrt(-1 * cv_errors.mean())

print("avg score is : ",cv_rmse)



In [117]:
# predictions = cross_val_predict(model, X_validation, Y_validation, cv=5)

# Cross-validated predictions for every training row, using the same pipeline and 5 folds.
predictions = cross_val_predict(XGBRpipe, X, Y, cv=5)

In [118]:
# R^2 between the targets and the cross-validated predictions
# (the printed label calls it "accuracy").
accuracy = metrics.r2_score(Y, predictions)
print ('Cross-Predicted Accuracy:', accuracy)

# quite a good score

In [119]:
# Targets (x-axis) against their cross-validated predictions (y-axis).
plt.scatter(Y, predictions)

# Hyperparameter tuning with GridSearch

This section takes time to execute, but it selects the optimal values for every model's parameters.

In [None]:
# from sklearn.model_selection import GridSearchCV


In [None]:
# # first we create a hold out set

# X = train_copy.drop(columns = ["SalePrice"])
# Y = train_copy.SalePrice

# x_train,x_validation,y_train,y_validation = train_test_split(X,Y,test_size = .3, random_state = 0)

In [None]:
# # now we build a number of models, create a list ofthe hyperparameters and initialize a grid search object

# xgboost = XGBRegressor()
# parameters = { 'learning_rate' :[0.001,0.01,0.1,0.5,1],
#                'n_estimators'  :[100,200,500,700,1000],
#                'subsample'     :[0.7,1],
#                'max_depth'     :[3,4,5],
#                'alpha'         :[0.9,1,1.1]}



In [None]:
# # fit the model with the data and use the grid search object to loop through the parameters to choose the best scoring parameters

# clf = GridSearchCV(xgboost, parameters, cv = 5)
# clf.fit(x_train, y_train)


In [None]:
# # print the best scoring parameters
# print("tuned hpyerparameters :(best parameters) ",clf.best_params_)
# print("accuracy :",clf.best_score_)

In [None]:
# the best parameters are :

# (n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)                             

In [None]:
# # create a standard scaler object
# scaler = StandardScaler()

# # create an ensemble regressor model object
# XGBRmodel =XGBRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)                             


# X = train_copy.drop(columns = ["SalePrice"])
# Y = train_copy.SalePrice

# # create a pipeline 
# XGBRpipe = make_pipeline(scaler, XGBRmodel) # standarizes the data and fit the model with it

# score = cross_val_score(XGBRpipe, X, Y, cv=5,scoring="neg_mean_squared_error")

# score = -1 * score.mean()
# score = np.sqrt(score)

# print("avg score is : ",score)



In [None]:
# # predictions = cross_val_predict(model, X_validation, Y_validation, cv=5)

# predictions = cross_val_predict(XGBRpipe, X, Y, cv=5)

In [None]:
# accuracy = metrics.r2_score(Y, predictions)
# print ('Cross-Predicted Accuracy:', accuracy)

# # quiet a good score

In [None]:
# plt.scatter(Y, predictions)

# Making predictions

In [125]:
# Fit the standard scaler object with the training dataset.

scaler = StandardScaler()

X = train_copy.drop(columns = ["SalePrice"])
Y = train_copy.SalePrice

# From here on X holds the standardized values returned by the scaler
# (a NumPy array by default, so the column names are no longer attached).
X = scaler.fit_transform(X)


In [127]:
# Build the regressor with the tuned hyperparameters and fit it with the training dataset.
# XGBRegressor is the alias given to scikit-learn's GradientBoostingRegressor at import time.

XGBRmodel =XGBRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)                             
XGBRmodel = XGBRmodel.fit(X,Y)

In [128]:
# Transform the test data with the standard scaler object.
# The training feature columns are selected by name first, so both frames are
# guaranteed to have the same columns in the same order: a column missing from
# the test frame raises a KeyError here instead of silently shifting features.
feature_columns = train_copy.drop(columns = ["SalePrice"]).columns
test_copy= scaler.transform(test_copy[feature_columns])


In [129]:
# Make predictions for the scaled test data with the fitted model.

predictions = XGBRmodel.predict(test_copy)

In [140]:
# The model was trained on the transformed target, so the predictions have to be
# mapped back to prices. numpy.expm1() is the inverse of numpy.log1p().
# NOTE(review): this assumes "SalePrice" ended up in the log1p group chosen by view_skew - confirm.

predictions = np.expm1(predictions)


In [146]:
# Distribution of the back-transformed predictions, as a sanity check.
plt.hist(predictions)

# Generate Submission File

In [161]:

# Pair each test "Id" (taken from the untouched original test frame) with its
# predicted price and write the result to a CSV file without the index column.
AmesSubmission = pd.DataFrame({ 'Id': original_test.Id,
                            'SalePrice': predictions })
AmesSubmission.to_csv("AmesSubmission.csv", index=False)