In [2]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn kutubxonasi

In [3]:
# Onlayn dataset joylashgan manzilini ko'rsatamiaz
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

In [22]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

X_train = train_set.drop("median_house_value", axis=1)
y = train_set["median_house_value"].copy()

X_num = X_train.drop("ocean_proximity", axis=1)

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
# bizga kerak ustunlar indekslari
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # bizni funksiyamiz faqat transformer. estimator emas
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: # add_bedrooms_per_room ustuni ixtiyoriy bo'ladi
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ('imputermedian',SimpleImputer(strategy='median')),
    ('attribs_adder',CombinedAttributesAdder()),
    ('std_scaler',StandardScaler())
])

num_pipeline.fit_transform(X_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429,  1.69520292]])

In [25]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)

In [26]:
num_attribs

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [27]:
cat_attribs = ['ocean_proximity']
cat_attribs

['ocean_proximity']

In [28]:
full_pipeline = ColumnTransformer([
    ('num',num_pipeline,num_attribs),
    ('cat',OneHotEncoder(),cat_attribs)
])

In [29]:
x_prepared = full_pipeline.fit_transform(X_train)

In [30]:
x_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

# LINEAR REGRESSION

In [33]:
from sklearn.linear_model import LinearRegression

In [35]:
LR_model = LinearRegression()

In [36]:
LR_model.fit(x_prepared,y)

LinearRegression()

In [37]:
test_data = X_train.sample(10)

In [38]:
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
3186,-119.79,36.29,6.0,1265.0,227.0,764.0,246.0,4.2917,INLAND
13904,-116.39,34.15,15.0,5583.0,1149.0,2709.0,964.0,1.9779,INLAND
14154,-117.07,32.77,34.0,2245.0,394.0,1849.0,429.0,3.5446,NEAR OCEAN
5758,-118.28,34.17,22.0,2664.0,651.0,1553.0,629.0,3.6354,<1H OCEAN
16608,-120.64,35.65,9.0,3466.0,673.0,2356.0,619.0,2.9926,<1H OCEAN
6465,-118.04,34.1,38.0,2317.0,451.0,1155.0,426.0,4.1488,INLAND
5121,-118.31,33.96,52.0,2523.0,460.0,1167.0,413.0,3.0625,<1H OCEAN
10848,-117.91,33.66,26.0,5761.0,1326.0,2681.0,1116.0,4.0341,<1H OCEAN
14412,-117.23,32.78,35.0,1649.0,355.0,746.0,360.0,4.6293,NEAR OCEAN
1895,-120.02,38.91,22.0,2138.0,493.0,829.0,330.0,2.2056,INLAND


In [39]:
test_label = y.loc[test_data.index]

In [40]:
test_label

3186     104200.0
13904     73300.0
14154    185500.0
5758     256300.0
16608    158200.0
6465     235300.0
5121     127400.0
10848    243300.0
14412    356500.0
1895     107200.0
Name: median_house_value, dtype: float64

In [41]:
test_data_prepared = full_pipeline.transform(test_data)

In [24]:
test_data_prepared


array([[ 6.59301555e-01, -6.61400745e-01,  3.48490247e-01,
        -3.84903954e-01, -5.62052491e-01, -5.17538826e-01,
        -5.22335113e-01,  8.13893621e-01,  2.35178701e-01,
        -2.70251688e-02, -7.75790968e-01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 8.03897207e-01, -8.81376346e-01, -4.82684654e-02,
        -5.77584761e-01, -6.36039159e-01, -4.62130924e-01,
        -5.40709918e-01,  4.54589465e-01, -3.01993267e-01,
        -2.79273150e-03, -2.86406915e-01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 7.09162124e-01, -6.94163068e-01,  1.85617335e+00,
        -4.90211603e-01, -7.14799160e-01, -6.42426477e-01,
        -6.58833663e-01,  1.14967128e+00,  3.74515680e-01,
        -2.60641057e-02, -1.05543010e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 8.33813549e-01, -7.31605724e

In [42]:
predicted_label = LR_model.predict(test_data_prepared)

In [43]:
predicted_label
test_label = y.loc[test_data.index]

In [44]:
pd.DataFrame({'Bashorat':predicted_label, 'Original':test_label})

Unnamed: 0,Bashorat,Original
3186,145746.290077,104200.0
13904,50947.852768,73300.0
14154,180225.476546,185500.0
5758,218574.070986,256300.0
16608,162973.838082,158200.0
6465,198736.373497,235300.0
5121,213056.667959,127400.0
10848,263990.456307,243300.0
14412,275338.251229,356500.0
1895,45286.866117,107200.0


# Evaluating Model


In [45]:
 test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [46]:
 X_test = test_set.drop('median_house_value',axis =1)

In [47]:
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [48]:
y_test = test_set['median_house_value'].copy()

In [49]:
X_test_prepared = full_pipeline.transform(X_test)

In [50]:
y_predicted = LR_model.predict(X_test_prepared)

In [51]:
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [52]:
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

#  MEAN ABSOLUTE ERROR

In [53]:
from sklearn.metrics import mean_absolute_error


mae = mean_absolute_error(y_test,y_predicted)

In [54]:
mae # eror

50898.7395349408

In [56]:
print("Mae is " ,mae)

Mae is  50898.7395349408


In [58]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,y_predicted)
print("RMSE", np.sqrt(mse))

RMSE 72701.32600762139


# RANDOM FOREST

In [60]:
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(x_prepared,y)

RandomForestRegressor()

In [61]:
y_predicted = RF_model.predict(X_test_prepared)

In [62]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,y_predicted)
print("RMSE", np.sqrt(mse))

RMSE 50292.05043271166


# CROSS VALIDATION

In [63]:
X  = df.drop('median_house_value',axis=1)
y = df["median_house_value"].copy()

X_prepared = full_pipeline.transform(X)

In [64]:
from sklearn.model_selection import cross_val_score

In [67]:
mse_scores =cross_val_score(LR_model,X_prepared,y,scoring="neg_mean_squared_error",cv=5)


In [None]:
def display(scores):
    print("Scores: ",scores)
    print("Mean: ", scores.mean())
    print("Std.dev: ", scores.std())
    
display(np.sqrt(-mse_scores))

In [None]:
scores=cross_val_score(RF_model,X_prepared,y,scoring="neg_mean_squared_error",cv=10)
LR_rmse_scores = np.sqrt(-scores)


In [None]:
display(LR_rmse_scores)

# MODEL PRODUCTION

In [71]:
#pickle

filename = 'RF_model.pkl'
with open(filename,'wb') as file:
    pickle.dump(RF_model,file)

NameError: name 'pickle' is not defined

In [74]:
import joblib

filename = 'RF_model.jbl'
joblib.dump(RF_model,filename)

['RF_model.jbl']

In [75]:
model = joblib.load(filename)

In [76]:
model

RandomForestRegressor()