In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
df=pd.read_csv("C:\\Users\\Mr\\Desktop\\python\\hose_prediction.csv",parse_dates=["date"])

In [3]:
df.shape

(4600, 18)

In [4]:
train,test=train_test_split(df,test_size=0.3,random_state=42)

In [5]:
train.shape

(3220, 18)

In [6]:
x_train=train.drop(columns=['price'])

In [7]:
y_train=train["price"]

In [8]:
x_train.shape

(3220, 17)

In [9]:
y_train.shape

(3220,)

In [10]:
test.shape

(1380, 18)

In [11]:
isnull_sum=x_train.isnull().sum()

In [12]:
num_var=x_train.select_dtypes(include=['int64','float64']).columns

In [13]:
num_var_miss=[var for var in num_var if isnull_sum[var]>0]

In [14]:
num_var_miss

['sqft_lot', 'view', 'yr_built']

In [15]:
cat_var=x_train.select_dtypes(include=['O']).columns

In [16]:
cat_var_miss=[var for var in cat_var if isnull_sum[var]>0]

In [17]:
#cat_var_miss=[var for var in cat_var if isnull_sum[var]>0]

In [18]:
cat_var_miss

['city', 'statezip']

In [19]:
num_var_mean=['sqft_lot']
num_var_median=['view', 'yr_built']

In [20]:
cat_var_mod=['city']

In [21]:
cat_var_cont=['statezip']

In [22]:
num_var_mean_impute=Pipeline(steps=[("imputer",SimpleImputer(strategy="mean"))])
num_var_median_impute=Pipeline(steps=[("imputer",SimpleImputer(strategy="median"))])
cat_var_mod_impute=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent"))])
cat_var_cont_impute=Pipeline(steps=[("imputer",SimpleImputer(strategy="constant",fill_value="missing"))])

In [23]:
preprocessor=ColumnTransformer(transformers=[("mean_imputer",num_var_mean_impute,num_var_mean),("median_imputer",num_var_median_impute,num_var_median),("mod_imputer",cat_var_mod_impute,cat_var_mod),
                                ("const_imputer",cat_var_cont_impute,cat_var_cont)])


In [24]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['sqft_lot']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['view', 'yr_built']),
                                ('mod_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['city']),
                                ('const_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='cons

In [25]:
preprocessor.fit(x_train)

In [26]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['sqft_lot']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['view', 'yr_built']),
                                ('mod_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['city']),
                                ('const_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='cons

In [27]:
preprocessor.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([14485.28338558])

In [28]:
preprocessor.named_transformers_["mod_imputer"].named_steps["imputer"].statistics_

array(['Seattle'], dtype=object)

In [29]:
preprocessor.named_transformers_["const_imputer"].named_steps["imputer"].statistics_

array(['missing'], dtype=object)

In [30]:
X_train_clean = preprocessor.transform(x_train)
test_clean = preprocessor.transform(test)

In [31]:
X_train_clean

array([[8400.0, 0.0, 1971.0, 'Burien', 'WA 98146'],
       [1280.0, 0.0, 2012.0, 'Seattle', 'WA 98144'],
       [14821.0, 0.0, 1958.0, 'Kent', 'WA 98042'],
       ...,
       [8145.0, 0.0, 1932.0, 'Shoreline', 'WA 98155'],
       [2002.0, 0.0, 1900.0, 'Seattle', 'WA 98112'],
       [3825.0, 0.0, 1929.0, 'Seattle', 'WA 98117']], dtype=object)

In [32]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['sqft_lot']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['view', 'yr_built']),
 ('mod_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['city']),
 ('const_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['statezip']),
 ('remainder', 'drop', [0, 1, 2, 3, 5, 6, 8, 9, 10, 12, 13, 16])]

In [33]:
X_train_clean_miss_var=pd.DataFrame(X_train_clean,columns=num_var_mean+num_var_median+cat_var_mod+cat_var_cont)

In [34]:
X_train_clean_miss_var.head()

Unnamed: 0,sqft_lot,view,yr_built,city,statezip
0,8400.0,0.0,1971.0,Burien,WA 98146
1,1280.0,0.0,2012.0,Seattle,WA 98144
2,14821.0,0.0,1958.0,Kent,WA 98042
3,4004.0,0.0,2004.0,Issaquah,WA 98029
4,9003.0,0.0,1949.0,Kent,WA 98031


In [35]:
X_train_clean_miss_var.isnull().sum().sum()

0

In [36]:
train['sqft_lot'].value_counts()

sqft_lot
5000.0    62
6000.0    48
4000.0    38
7200.0    36
4800.0    23
          ..
3008.0     1
8710.0     1
7297.0     1
6675.0     1
2002.0     1
Name: count, Length: 2266, dtype: int64

In [37]:
X_train_clean_miss_var['sqft_lot'].value_counts()

sqft_lot
5000.000000     62
6000.000000     48
4000.000000     38
7200.000000     36
14485.283386    30
                ..
3008.000000      1
8710.000000      1
7297.000000      1
6675.000000      1
2002.000000      1
Name: count, Length: 2267, dtype: int64

In [38]:
X_train_clean_miss_var['city'].value_counts()

city
Seattle             1166
Renton               193
Bellevue             191
Redmond              165
Kirkland             134
Kent                 130
Issaquah             130
Auburn               116
Sammamish            113
Federal Way          100
Shoreline             85
Woodinville           75
Maple Valley          71
Mercer Island         57
Burien                53
Snoqualmie            47
Kenmore               46
Des Moines            37
Duvall                29
Lake Forest Park      27
North Bend            26
Vashon                25
Covington             24
Enumclaw              22
Newcastle             22
Tukwila               21
Bothell               19
SeaTac                19
Carnation             14
Normandy Park         13
Fall City             10
Medina                 9
Clyde Hill             8
Black Diamond          6
Pacific                4
Skykomish              3
Ravensdale             3
Milton                 2
Yarrow Point           2
Snoqualmie Pass     

In [39]:
# no missing values variables index
remainder_vars_index = [0, 1, 2, 3, 5, 6, 8, 9, 10, 12, 13, 16]   

In [40]:
# get no missing values variables name using there index
remainder_vars =[isnull_sum.keys()[var_index] for var_index in remainder_vars_index]
remainder_vars

['date',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'floors',
 'waterfront',
 'condition',
 'sqft_above',
 'sqft_basement',
 'yr_renovated',
 'street',
 'country']

In [41]:
len(remainder_vars)

12

In [42]:
# concatinate X_train_clean_miss_var df and remainder_vars
X_train =  pd.concat([X_train_clean_miss_var,train[remainder_vars]], axis=1)

In [44]:
X_train.shape

(4208, 17)