In [237]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# Load the training data and keep the regression target as a NumPy array.
df = pd.read_csv("../input/train.csv")
y = df["SalePrice"].values

# Columns used by the rest of the notebook (the target comes first).
cols = [
    'SalePrice', 'LotFrontage', 'Neighborhood', 'MasVnrArea', 'ExterQual',
    'BsmtQual', 'KitchenQual', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
    'OverallQual', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea',
    'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
]

# Object-dtype columns among `cols`, in the frame's own column order.
categorical = [name for name in df.select_dtypes("object").columns if name in cols]

# Turn missing values into the literal category "None" so that "missing"
# is encoded as a level of its own instead of being dropped.
df[categorical] = df[categorical].fillna("None")


class PrepareData:
    """Rank-encode the object-dtype columns of a frame by mean ``SalePrice``.

    For every object-dtype column the categories are sorted by their mean
    ``SalePrice`` and replaced by their position in that ordering (the
    category with the lowest mean becomes 0).  The category -> code table is
    stored in ``self.key`` so the same codes can be applied to other data.
    """

    def __init__(self, df, cols):
        """Keep only the columns ``cols`` plus ``"Id"`` of ``df``.

        ``encode`` reads the ``SalePrice`` and ``Id`` columns, so ``cols``
        has to contain ``"SalePrice"``.
        """
        self.df = df[cols+["Id"]]

    def encode(self):
        """Replace each object-dtype column of ``self.df`` by its rank code.

        Sets three attributes:

        * ``self.key``   - frame indexed by ``(variable, value)`` with one
          column ``"index"`` holding the code of each category.
        * ``self.catDf`` - the categorical columns, ``Id``, ``SalePrice`` and
          one ``<column>Encoded`` column per categorical column.
        * ``self.df``    - ``Id``, ``SalePrice``, the encoded columns (under
          their original names) and the float64/int64 columns.
        """
        ind = pd.MultiIndex.from_tuples([], names=['variable','value'])
        # The empty seed frame is kept so that `key` has the same schema as
        # before, including when there is no categorical column at all.
        key_parts = [pd.DataFrame(columns=["index"], index=ind)]

        object_cols = list(self.df.select_dtypes("object").columns)
        catDf = self.df[object_cols + ["Id","SalePrice"]]
        for x in object_cols:
            # One row per category, ordered by mean SalePrice; the second
            # reset_index() turns that ordering into the "index" code column.
            kf = self.df.groupby(x)["SalePrice"].mean().to_frame().sort_values("SalePrice")
            kf = kf.reset_index().reset_index().drop(columns=["SalePrice"])

            # assign() returns a new frame.  Adding "variable" to `kf` itself
            # would leak that column into every merge below and make the
            # merge suffixes collide once several columns are encoded.
            key_parts.append(
                kf.assign(variable=x).rename(columns={x:"value"}).set_index(["variable","value"])
            )

            kf = kf.rename(columns={"index":x+"Encoded"})
            catDf = catDf.merge(kf, on=x, how="left")

        # Build the lookup table once instead of growing it inside the loop.
        self.key = pd.concat(key_parts)
        self.catDf = catDf

        # Keep the identifiers plus the encoded columns, then strip the
        # "Encoded" suffix so the codes take over the original column names.
        catDf = catDf[[c for c in catDf.columns if "Encode" in c or c in ["SalePrice","Id"]]]
        catDf.columns = [c.split("Encoded")[0] for c in catDf.columns]
        self.df = catDf.merge(self.df.select_dtypes(include=['float64',"int64"]), on=["Id","SalePrice"])

        
# Encode the training frame.  `df` is rebound to the encoded frame (the raw
# frame is no longer reachable under this name) and `key` keeps the
# category -> code table built by encode().
obj = PrepareData(df, cols)
obj.encode()
df = obj.df
key = obj.key

In [3]:
# Feature matrix: everything except the row identifier and the target.
# Dropping by name gives the same columns as slicing off the first two, but
# cannot silently keep `SalePrice` or `Id` if the column order ever changes.
X = df.drop(columns=["Id", "SalePrice"])


In [4]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
# NOTE(review): `y` was taken from the raw frame before encoding, so this
# assumes the rows of X are still in the same order as y - TODO confirm.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [5]:
# Baseline model: XGBoost regressor, scored by RMSE on the hold-out split.
import numpy as np
import pandas as pd
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# Estimator with 100 trees and a fixed seed.
xgb_r = xg.XGBRegressor(n_estimators=100, seed=123)

# Train on the training split.
xgb_r.fit(x_train, y_train)

# Predict the held-out rows.
predictions = xgb_r.predict(x_test)

# Root of the mean squared error between truth and prediction.
rmse = np.sqrt(MSE(y_test, predictions))
print("RMSE : % f" % rmse)

RMSE :  31773.134487


In [6]:
# R^2 of the predictions currently bound to `predictions` against the
# hold-out target; the bare last expression is what the cell displays.
from sklearn.metrics import r2_score
print("Test score:")
r2_score(y_test, predictions)


Test score:


0.868384727346024

In [7]:
# Second model: XGBoost with hand-picked hyper-parameters, scored by
# explained variance on the hold-out split (the recorded run shows about
# 0.90).
# Note: since we are using xgboost I did not standardize the data.

import xgboost as xgb
from sklearn.metrics import explained_variance_score


# The estimator gets its own name: binding it to `xgb` would shadow the
# module alias imported above for every later cell.
xgb_tuned = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                             colsample_bytree=1, max_depth=30)

# fit() returns the fitted estimator; `model` is what later cells predict with.
model = xgb_tuned.fit(x_train, y_train)

predictions = model.predict(x_test)

score = explained_variance_score(y_test, predictions)

score

0.902379926539303

In [313]:
# Load the test data.
tf = pd.read_csv("../input/test.csv")
# Names stored in the first level of the key's index, i.e. the variables
# that were encoded on the training data.
encoded = key.index.get_level_values(0).unique()
# Same missing-value treatment as the training frame: NaN -> "None".
tf[encoded] = tf[encoded].fillna("None")
# Rebinds `categorical` to the object-dtype columns of the test frame that
# are listed in `cols`.
categorical = [x for x in tf.select_dtypes("object").columns if x in cols]

In [314]:
# Display the categorical columns found in the test frame.
categorical

['Neighborhood',
 'ExterQual',
 'BsmtQual',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish']

In [315]:
class PrepareData:
    """Apply an existing category -> code table to a frame without a target.

    NOTE: this redefines the ``PrepareData`` class declared earlier in the
    notebook; once this cell has run, the earlier ``encode`` method is no
    longer reachable through the name ``PrepareData``.
    """

    def __init__(self, tf, cols):
        """Keep the columns ``cols[1:]`` plus ``"Id"`` of ``tf``.

        The first entry of ``cols`` is skipped; in this notebook that entry
        is ``"SalePrice"``.
        """
        self.df = tf[cols[1:]+["Id"]]

    def encode_with_key(self, key, categorical=None):
        """Replace categorical columns of ``self.df`` by the codes in ``key``.

        Parameters
        ----------
        key : DataFrame
            Indexed by two levels (variable name, category value) with a
            column ``"index"`` holding the code of each category.
        categorical : list of str, optional
            Columns to encode.  When omitted, every column of ``self.df``
            that appears as a variable in ``key`` is encoded, so the method
            no longer depends on a notebook-global list.

        After the call ``self.df`` holds the encoded columns (under their
        original names) followed by the remaining non-object columns.
        Categories that are missing from ``key`` become NaN.
        """
        if categorical is None:
            known = set(key.index.get_level_values(0))
            categorical = [c for c in self.df.columns if c in known]

        encoded = {}
        for x in categorical:
            print(x)
            kf = key[key.index.get_level_values(0)==x]
            # Plain value -> code lookup for this variable.
            codes = pd.Series(kf["index"].to_numpy(), index=kf.index.get_level_values(1))
            # map() keeps row order and yields NaN for unseen categories;
            # to_numeric() makes the result numeric whatever dtype `key`
            # stores its codes in.
            encoded[x] = pd.to_numeric(self.df[x].map(codes)).to_numpy()

        # Everything that is neither encoded here nor text is passed through.
        # The encoded columns are excluded explicitly so that they cannot
        # show up a second time.
        passthrough = self.df.drop(columns=list(categorical)).select_dtypes(exclude=["object"])
        self.df = pd.concat([pd.DataFrame(encoded, index=self.df.index), passthrough], axis=1)
# Encode the test frame with the table built on the training data.
# `tf` is rebound to the encoded frame, so re-running this cell without
# re-loading the test data starts from already-encoded columns.
obj = PrepareData(tf, cols)
obj.encode_with_key(key)
tf = obj.df




Neighborhood
ExterQual
BsmtQual
KitchenQual
FireplaceQu
GarageFinish


In [316]:
# Order the rows by Id, then display the frame.
tf = tf.sort_values(by='Id')
tf

Unnamed: 0,Neighborhood,ExterQual,BsmtQual,KitchenQual,FireplaceQu,GarageFinish,LotFrontage,MasVnrArea,GarageYrBlt,OverallQual,YearBuilt,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,Id
0,10,1,2,1,1,1,80.0,0.0,1961.0,5,1961,882.0,896,896,1,5,1.0,730.0,1461
1,10,1,2,2,1,1,81.0,108.0,1958.0,6,1958,1329.0,1329,1329,1,6,1.0,312.0,1462
2,14,1,3,1,3,3,74.0,0.0,1997.0,5,1997,928.0,928,1629,2,6,2.0,482.0,1463
3,14,1,2,2,4,3,78.0,20.0,1998.0,6,1998,926.0,926,1604,2,7,2.0,470.0,1464
4,22,2,3,2,1,2,43.0,0.0,1992.0,8,1992,1280.0,1280,1280,2,5,2.0,506.0,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,1,2,1,1,0,21.0,0.0,,4,1970,546.0,546,1092,1,5,0.0,0.0,2915
1455,0,1,2,1,1,1,21.0,0.0,1970.0,4,1970,546.0,546,1092,1,6,1.0,286.0,2916
1456,11,1,2,1,3,1,160.0,0.0,1960.0,5,1960,1224.0,1224,1224,1,7,2.0,576.0,2917
1457,11,1,3,1,1,0,62.0,0.0,,5,1992,912.0,970,970,1,6,0.0,0.0,2918


In [301]:
#tf[tf.isnull().any(axis=1)]

In [302]:
# Replace every remaining NaN in the test frame by 0.
# NOTE(review): the visible cells never fill the numeric training features,
# so missing values are treated differently at fit and predict time; for an
# encoded column 0 is also the code of the lowest-ranked category - confirm
# this is intended.
tf = tf.fillna(0)

In [303]:
# Select the training feature columns from the test frame, in the same order.
Xtest = tf[list(X.columns)]

In [318]:
# Predict the test rows; this rebinds `predictions`, which earlier held the
# hold-out predictions.
predictions = model.predict(Xtest)

In [337]:
# Write the submission: one predicted SalePrice per row, with the test Ids
# as the index (and therefore as the first CSV column).
pd.DataFrame(
    {'SalePrice': predictions},
    index=tf.Id,
).to_csv("../output/XGBoost_submission.csv")

In [331]:
# Cast the Id column to 32-bit integers.
# NOTE(review): this cell sits after the submission write in the file but
# carries a lower execution count (331 vs 337); on a top-to-bottom run it
# has no effect on the file already written - confirm the intended position.
tf.Id = tf.Id.astype("int32")

In [334]:
# NOTE(review): `l` is not defined in any visible cell, so this line raises
# NameError on a fresh kernel; it looks like a leftover from an earlier
# session - confirm what it referred to, or remove the cell.
l.to_csv("XGBoost_submission.csv")

In [None]:
# Scratch version of the rank encoding done by PrepareData.encode(); this
# cell carries no execution count.
# NOTE(review): `qualitative` and `quantitative` are not defined in the
# visible cells - confirm where they come from before running this.
key = pd.DataFrame(columns=["index"], index=pd.MultiIndex.from_tuples([], names=['variable','value']))
qualDf = df[qualitative + ["Id","SalePrice"]]
for x in qualitative:
    # One row per category ordered by mean SalePrice; "index" is its rank.
    kf = df.groupby(x)["SalePrice"].mean().to_frame().sort_values("SalePrice").reset_index().reset_index().drop(columns=["SalePrice"])

    # assign() returns a new frame, so the "variable" column does not leak
    # into `kf` (and from there into every merge below), and the test frame
    # bound to `tf` elsewhere in the notebook is not overwritten.
    key = pd.concat([key, kf.assign(variable=x).rename(columns={x:"value"}).set_index(["variable","value"])])

    kf = kf.rename(columns={"index":x+"Encoded"})
    qualDf = qualDf.merge(kf, on=x, how="left")

# Keep identifiers plus encoded columns and give the codes the original names.
qualDf = qualDf[[x for x in qualDf.columns if "Encode" in x or x in ["SalePrice","Id"]]]
qualDf.columns = [x.split("Encoded")[0] for x in qualDf.columns]
df = qualDf.merge(df[quantitative], on=["Id","SalePrice"])
