In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.linear_model import (LinearRegression,
                                  LogisticRegression)
from sklearn.metrics import mean_squared_error

import pickle

In [12]:
# Load the housing dataset; 'housing.csv' is resolved relative to the working directory.
df = pd.read_csv('housing.csv')

In [13]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [14]:
# Inspect column dtypes to tell the numeric features apart from non-numeric ones.
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [15]:
# Fraction of missing values per column: the mean of a boolean mask is a proportion.
df.isna().mean()

longitude             0.000000
latitude              0.000000
housing_median_age    0.000000
total_rooms           0.000000
total_bedrooms        0.010029
population            0.000000
households            0.000000
median_income         0.000000
median_house_value    0.000000
ocean_proximity       0.000000
dtype: float64

In [16]:
# Look at the column that is used as the prediction target in the train/test split below.
df['median_house_value']

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

In [17]:
# Hold out 30% of the rows for evaluation. The target column is removed from
# the feature frame, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='median_house_value'),
    df['median_house_value'],
    test_size=0.3,
    random_state=32,
)

In [23]:
# Write the held-out features to disk without the index column.
# Note that the matching targets (y_test) are not saved by this cell.
X_test.to_csv('X_test.csv',index=False)

In [19]:
# Column groups, one list per preprocessing step of the pipeline defined below.
enc_cols = ['ocean_proximity']  # the only object-dtype column -> one-hot encoded
imp_cols = ['total_bedrooms']   # the only column with missing values (~1%) -> median-imputed

In [20]:
# Preprocessing and model bundled into a single estimator, so the exact same
# sequence of steps runs at fit time and at prediction time.
pipe = Pipeline(steps=[
    # expand the categorical column(s) into indicator columns
    ('onehot', OneHotEncoder(variables=enc_cols)),
    # fill gaps in the listed column(s) using the median
    ('impute', MeanMedianImputer(variables=imp_cols, imputation_method='median')),
    # standardise every column that reaches this step
    ('scaler', StandardScaler()),
    # final estimator: linear regression
    ('model', LinearRegression()),
])

In [21]:
# Fit the preprocessing steps and the final regressor on the training split only.
pipe.fit(X_train, y_train)

In [24]:
# Predict the target for the held-out test features.
y_hat = pipe.predict(X_test)

In [28]:
# Test-set RMSE, expressed in the same units as median_house_value.
# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the conventional order keeps this cell correct
# if the metric is later swapped for a non-symmetric one.
np.sqrt(mean_squared_error(y_test, y_hat))

70071.79334644547

In [26]:
# Serialise the fitted pipeline to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open('pipeline.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Round-trip check: reload the saved pipeline and predict on the test features.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust.
with open('pipeline.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)
pickled_model.predict(X_test)