In [1]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn kutubxonasi

In [2]:
# Onlayn dataset joylashgan manzilini ko'rsatamiaz
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

In [3]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

X_train = train_set.drop("median_house_value", axis=1)
y = train_set["median_house_value"].copy()

X_num = X_train.drop("ocean_proximity", axis=1)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
# bizga kerak ustunlar indekslari
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # bizni funksiyamiz faqat transformer. estimator emas
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: # add_bedrooms_per_room ustuni ixtiyoriy bo'ladi
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ('imputermedian',SimpleImputer(strategy='median')),
    ('attribs_adder',CombinedAttributesAdder()),
    ('std_scaler',StandardScaler())
])

num_pipeline.fit_transform(X_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429,  1.69520292]])

In [6]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)

In [7]:
num_attribs

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [8]:
cat_attribs = ['ocean_proximity']
cat_attribs

['ocean_proximity']

In [9]:
full_pipeline = ColumnTransformer([
    ('num',num_pipeline,num_attribs),
    ('cat',OneHotEncoder(),cat_attribs)
])

In [12]:
x_prepared = full_pipeline.fit_transform(X_train)

In [13]:
x_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

# LINEAR REGRESSION

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
LR_model = LinearRegression()

In [17]:
LR_model.fit(x_prepared,y)

LinearRegression()

In [19]:
test_data = X_train.sample(10)

In [20]:
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
5705,-118.26,34.23,33.0,1805.0,303.0,838.0,301.0,5.4306,<1H OCEAN
11323,-117.97,33.76,28.0,1386.0,272.0,901.0,294.0,4.7464,<1H OCEAN
6634,-118.16,34.16,52.0,1576.0,239.0,696.0,249.0,6.07,<1H OCEAN
6177,-117.91,34.08,33.0,2325.0,452.0,1170.0,445.0,3.6625,<1H OCEAN
4662,-118.3,34.05,42.0,1476.0,610.0,1605.0,545.0,1.721,<1H OCEAN
18626,-121.95,37.11,21.0,2387.0,357.0,913.0,341.0,7.736,<1H OCEAN
11472,-118.0,33.71,19.0,4808.0,1029.0,2422.0,971.0,4.0121,<1H OCEAN
18787,-122.43,40.47,16.0,3552.0,704.0,1801.0,658.0,2.1496,INLAND
10594,-117.79,33.69,16.0,1532.0,240.0,679.0,248.0,5.7115,<1H OCEAN
113,-122.25,37.83,52.0,2376.0,559.0,939.0,519.0,3.1484,NEAR BAY


In [21]:
test_label = y.loc[test_data.index]

In [22]:
test_label

5705     326600.0
11323    187500.0
6634     261800.0
6177     217100.0
4662     214300.0
18626    397700.0
11472    279700.0
18787     97700.0
10594    313900.0
113      224100.0
Name: median_house_value, dtype: float64

In [23]:
test_data_prepared = full_pipeline.transform(test_data)

In [24]:
test_data_prepared


array([[ 6.59301555e-01, -6.61400745e-01,  3.48490247e-01,
        -3.84903954e-01, -5.62052491e-01, -5.17538826e-01,
        -5.22335113e-01,  8.13893621e-01,  2.35178701e-01,
        -2.70251688e-02, -7.75790968e-01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 8.03897207e-01, -8.81376346e-01, -4.82684654e-02,
        -5.77584761e-01, -6.36039159e-01, -4.62130924e-01,
        -5.40709918e-01,  4.54589465e-01, -3.01993267e-01,
        -2.79273150e-03, -2.86406915e-01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 7.09162124e-01, -6.94163068e-01,  1.85617335e+00,
        -4.90211603e-01, -7.14799160e-01, -6.42426477e-01,
        -6.58833663e-01,  1.14967128e+00,  3.74515680e-01,
        -2.60641057e-02, -1.05543010e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 8.33813549e-01, -7.31605724e

In [26]:
predicted_label = LR_model.predict(test_data_prepared)

In [30]:
predicted_label
test_label = y.loc[test_data.index]

In [32]:
pd.DataFrame({'Bashorat':predicted_label, 'Original':test_label})

Unnamed: 0,Bashorat,Original
5705,274418.610096,326600.0
11323,244063.21926,187500.0
6634,316490.551315,261800.0
6177,205480.661061,217100.0
4662,194744.540734,214300.0
18626,385043.330547,397700.0
11472,239977.915622,279700.0
18787,51194.173903,97700.0
10594,264808.580775,313900.0
113,250822.547453,224100.0
