In [24]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv(r"kc_house_data.csv")
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB


In [4]:
df["price"].describe()

count    2.161300e+04
mean     5.400881e+05
std      3.671272e+05
min      7.500000e+04
25%      3.219500e+05
50%      4.500000e+05
75%      6.450000e+05
max      7.700000e+06
Name: price, dtype: float64

In [5]:
def quatiles_transform(data):
    if data > 0.0 and data <= np.quantile(df["price"], 0.25):
        return "I quantile"
    elif data > np.quantile(df["price"], 0.25) and data <= np.quantile(df["price"], 0.5):
        return "II quatile"
    elif data > np.quantile(df["price"], 0.5) and data <= np.quantile(df["price"], 0.75):
        return "III quatile"
    elif data > np.quantile(df["price"], 0.75) and data <= np.quantile(df["price"], 1):
        return "IV quatile"

In [6]:
df["price2"] = df["price"].apply(quatiles_transform)
set(df["price2"])

{'I quantile', 'II quatile', 'III quatile', 'IV quatile'}

In [7]:
df["price"] = df["price2"]
df["price"].head(10)

0     I quantile
1    III quatile
2     I quantile
3    III quatile
4    III quatile
5     IV quatile
6     I quantile
7     I quantile
8     I quantile
9     II quatile
Name: price, dtype: object

In [12]:
df.drop(columns=["price2"])
df.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price2
0,7129300520,20141013T000000,I quantile,3,1.0,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,I quantile
1,6414100192,20141209T000000,III quatile,3,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,III quatile
2,5631500400,20150225T000000,I quantile,2,1.0,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,I quantile
3,2487200875,20141209T000000,III quatile,4,3.0,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,III quatile
4,1954400510,20150218T000000,III quatile,3,2.0,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,III quatile


In [22]:
y_data = df["price"]
new_df = df.drop(columns=["price2", "price", "id", "date"])
numeric_columns = new_df.select_dtypes(include =[np.number]).columns.tolist()
cat_columns = new_df.select_dtypes(exclude=[np.number]).columns.tolist()
cat_columns

[]

In [26]:
class MyLabelBinarizer(BaseEstimator,TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = OneHotEncoder(*args, **kwargs)
    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self
    def transform(self, x, y=0):
        return self.encoder.transform(x)

class DataframeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, atrib_names):
        self.atribute_names = atrib_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=0):
        return X[self.atribute_names].values
    
##numerical pipline
num_pipeline = Pipeline([
    ('selector', DataframeSelector(numeric_columns)),
    ('std_scaler', StandardScaler())
])
#categorial pipeline
cat_pipeline = Pipeline([
    ('selector', DataframeSelector(cat_columns)),
    ('onehot', MyLabelBinarizer())
])
##union piplines
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    #("cat_pipline", cat_pipeline)
])

In [28]:
## fitting data
X_data = df[numeric_columns]

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=2, test_size=0.2)

full_pipeline.fit(X_data)
X_train_trans, X_test_trans = full_pipeline.transform(X_train), full_pipeline.transform(X_test)