In [1]:
import pandas as pd 
import numpy as np
import sklearn 


In [2]:
URL="https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df=pd.read_csv(URL)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# 4 -qadam .Machine learning

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train_set,test_set=train_test_split(df,test_size=0.2,random_state=42)

In [6]:
X_train=train_set.drop('median_house_value',axis=1)
y=train_set['median_house_value'].copy()

In [7]:
X_num=X_train.drop('ocean_proximity',axis=1)

In [8]:
X_num.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542


In [9]:
from sklearn.base import BaseEstimator,TransformerMixin

# bizga kerak ustunlar
room_ix,bedroom_ix,population_ix,household_ix=3,4,5,6


class CombineAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self,add_bedrooms_per_room=True):
        self.add_bedrooms_per_room=add_bedrooms_per_room
    def fit(self,X_num,y=None):
        return self
    def transform(self,X_num):
        rooms_per_household=X_num[:,room_ix]/X_num[:,household_ix]
        population_per_household=X_num[:,population_ix]/X_num[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room=X_num[:,bedroom_ix]/X_num[:,room_ix]
            return np.c_[X_num,rooms_per_household,population_per_household,bedrooms_per_room]
        else:
            return np.c_[X_num,rooms_per_household,population_per_household]


In [10]:
# pieline qurish

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [12]:
num_pipeline=Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('attribs_adder',CombineAttributesAdder(add_bedrooms_per_room=True)),
    ('std_scaler',StandardScaler())
])

In [13]:
from sklearn.compose import ColumnTransformer

num_attribs=list(X_num)
cat_attribs=['ocean_proximity']

full_pipeline=ColumnTransformer([
    ('num',num_pipeline,num_attribs),
    ('cat',OneHotEncoder(),cat_attribs)
])

In [14]:
X_prepared=full_pipeline.fit_transform(X_train)

In [15]:
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [16]:
# Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression

LR_model=LinearRegression()

In [18]:
LR_model.fit(X_prepared,y)

In [19]:
test_data=X_train.sample(10)
test_label=y.loc[test_data.index]

In [20]:
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
1488,-122.03,37.95,14.0,3287.0,793.0,1601.0,716.0,3.1719,NEAR BAY
12733,-121.35,38.58,20.0,2992.0,378.0,1105.0,368.0,8.6572,INLAND
15094,-116.95,32.82,12.0,5535.0,1434.0,3112.0,1262.0,2.5949,<1H OCEAN
3677,-118.38,34.22,20.0,1176.0,344.0,864.0,318.0,2.375,<1H OCEAN
16717,-120.66,35.5,19.0,1861.0,364.0,1040.0,363.0,3.3125,<1H OCEAN
16394,-121.25,38.03,29.0,2465.0,327.0,859.0,315.0,6.6605,INLAND
19727,-121.47,38.95,34.0,2129.0,350.0,969.0,314.0,2.7039,INLAND
6646,-118.13,34.17,49.0,1962.0,435.0,1329.0,457.0,3.2898,<1H OCEAN
16850,-122.42,37.62,40.0,1545.0,264.0,756.0,282.0,4.4643,NEAR OCEAN
10850,-117.91,33.66,21.0,1708.0,505.0,1099.0,434.0,3.225,<1H OCEAN


In [21]:
test_data_prepared=full_pipeline.transform(test_data)

In [22]:
predicted_labels=LR_model.predict(test_data_prepared)

In [23]:
pd.DataFrame({'Bashorat':predicted_labels,'Asl':test_label})

Unnamed: 0,Bashorat,Asl
1488,200926.386717,220500.0
12733,325206.92021,320200.0
15094,188494.611582,108300.0
3677,165135.919558,177700.0
16717,209119.64627,163900.0
16394,267513.629858,220700.0
19727,91603.029844,106300.0
6646,210275.36513,200000.0
16850,271417.626444,308100.0
10850,210956.724328,193800.0


In [24]:
# Modelni baholash

In [25]:
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [26]:
X_test=test_set.drop('median_house_value',axis=1)
X_test.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND


In [27]:
y_test=test_set['median_house_value'].copy()
y_test.head(1)

20046    47700.0
Name: median_house_value, dtype: float64

In [28]:
X_test_prepared=full_pipeline.transform(X_test)

In [29]:
y_predicted=LR_model.predict(X_test_prepared)

In [30]:
from sklearn.metrics import mean_absolute_error

In [31]:
mae=mean_absolute_error(y_test,y_predicted)
print('MAE:',mae)

MAE: 50898.73953494079


In [32]:
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_test,y_predicted)
print('RMSE:',np.sqrt(mse))

RMSE: 72701.32600762135


In [33]:
# MODELNI BAXOLASH.RANDOM FOREST

In [34]:
from sklearn.ensemble import RandomForestRegressor

In [35]:
RF_model=RandomForestRegressor()

In [36]:
RF_model.fit(X_prepared,y)

In [37]:
y_predicted=RF_model.predict(X_test_prepared)

In [38]:
mse=mean_squared_error(y_test,y_predicted)
print('RMSE=',np.sqrt(mse))

RMSE= 50137.070143038625


In [39]:
from sklearn.tree import DecisionTreeRegressor

In [40]:
Tree_model=DecisionTreeRegressor()
Tree_model.fit(X_prepared,y)

In [41]:
y_predicted=Tree_model.predict(X_test_prepared)

In [42]:
mse=mean_squared_error(y_test,y_predicted)
print('RMSE=',np.sqrt(mse))

RMSE= 71935.33798394963


In [43]:
X=df.drop('median_house_value',axis=1)
y=df['median_house_value'].copy()

In [44]:
X_prepared=full_pipeline.transform(X)

In [45]:
def display_scores(scores):
    print('Scores:',scores)
    print('Mean:',scores.mean())
    print('Std.dev',scores.std())

In [46]:
from sklearn.model_selection import cross_val_score

mse_scores=cross_val_score(LR_model,X_prepared,y,scoring='neg_mean_squared_error',cv=5)
LR_rmse_scores=np.sqrt(-mse_scores)

In [47]:
display_scores(LR_rmse_scores)

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295095
Std.dev 3694.7136787223567


In [48]:




mse_scores=cross_val_score(RF_model,X_prepared,y,scoring='neg_mean_squared_error',cv=10)
RF_rmse_scores=np.sqrt(-mse_scores)

In [49]:
display_scores(RF_rmse_scores)

Scores: [99074.84877683 47510.68146277 65497.20391683 56673.61977197
 61016.33363333 59615.32162353 47669.69357744 79488.20939498
 74973.62869034 49364.66840816]
Mean: 64088.420925619605
Std.dev 15579.751576196102


In [50]:






mse_scores=cross_val_score(Tree_model,X_prepared,y,scoring='neg_mean_squared_error',cv=10)
Tree_rmse_scores=np.sqrt(-mse_scores)

In [51]:
display_scores(LR_rmse_scores)

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295095
Std.dev 3694.7136787223567


In [95]:
# 6-qadam.taqdimot.Modelni saqlab olish
# PICKLE

In [108]:
import pickle

In [109]:
filename='RF_model.pkl'
with open(filename,'wb') as file:
    pickle.dump(RF_model,file)

In [110]:
with open(filename,'rb') as file:
    model=pickle.load(file)

In [115]:
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [78304.94926575 64301.04926027 61168.93512022 80901.77771958
 62026.30728724]
Mean: 69340.60373061291
Std.dev 8481.656023155017


In [112]:
# JOBLIB

In [113]:
import joblib

In [114]:
filename='LR_model.jbl'
joblib.dump(LR_model,filename)

['LR_model.jbl']

In [116]:
model1=joblib.load(filename)

In [117]:
scores = cross_val_score(model1, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295095
Std.dev 3694.7136787223567


In [135]:
filename='pipeline.jbl'
joblib.dump(full_pipeline,filename)

['pipeline.jbl']