![Imgur](https://i.imgur.com/5pXzCIu.png)

# Data Science va Sun'iy Intellekt Praktikum

## 5-MODUL. Machine Learning

### Portfolio uchun vazifa: Toshkent shahrida uylarning narxini aniqlash.

Ushbu amaliyotda sizning vazifangiz berilgan ma`lumotlar asosida Toshkent shahridagi uylarning narxini aniqlash.

In [None]:
import pandas as pd
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv')
df.head()

Unnamed: 0,location,district,rooms,size,level,max_levels,price
0,"город Ташкент, Юнусабадский район, Юнусабад 8-...",Юнусабадский,3,57,4,4,52000
1,"город Ташкент, Яккасарайский район, 1-й тупик ...",Яккасарайский,2,52,4,5,56000
2,"город Ташкент, Чиланзарский район, Чиланзар 2-...",Чиланзарский,2,42,4,4,37000
3,"город Ташкент, Чиланзарский район, Чиланзар 9-...",Чиланзарский,3,65,1,4,49500
4,"город Ташкент, Чиланзарский район, площадь Актепа",Чиланзарский,3,70,3,5,55000


# Ustunlar ta'rifi
- `location` - sotilayotgan uy manzili
- `district` - uy joylashgan tuman
- `rooms` - xonalar soni
- `size` - uy maydoni (kv.m)
- `level` - uy joylashgan qavat
- `max_levels` - ja'mi qavatlar soni
- `price` - uy narxi

## Vazifani CRSIP-DM Metolodgiyasi yordamida bajaring.
<img src="https://i.imgur.com/dzZnnYi.png" alt="CRISP-DM" width="800"/>

# Malumotlarni tahlil qilamiz

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7565 entries, 0 to 7564
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   location    7565 non-null   object
 1   district    7565 non-null   object
 2   rooms       7565 non-null   int64 
 3   size        7565 non-null   object
 4   level       7565 non-null   int64 
 5   max_levels  7565 non-null   int64 
 6   price       7565 non-null   object
dtypes: int64(3), object(4)
memory usage: 413.8+ KB


In [None]:
df.isnull().sum()

location      0
district      0
rooms         0
size          0
level         0
max_levels    0
price         0
dtype: int64

In [None]:
df['size'] = df['size'].astype(float)

ValueError: could not convert string to float: 'Площадьземли:1сот'

In [None]:
df[df['size']== 'Площадьземли:1сот']

Unnamed: 0,location,district,rooms,size,level,max_levels,price
5347,"город Ташкент, Яшнободский район, Дархон",Яшнободский,4,Площадьземли:1сот,3,5,150000


In [None]:
df.loc[5347,'size'] = np.nan

In [None]:
df['size'] = df['size'].astype(float)

In [None]:
df.price = df.price.astype(float)

ValueError: could not convert string to float: 'Договорная'

In [None]:
df.loc[df[df['price']=='Договорная'].index,'price'] = np.nan

In [None]:
df.price = df.price.astype(float)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7565 entries, 0 to 7564
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    7565 non-null   object 
 1   district    7565 non-null   object 
 2   rooms       7565 non-null   int64  
 3   size        7564 non-null   float64
 4   level       7565 non-null   int64  
 5   max_levels  7565 non-null   int64  
 6   price       7466 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 413.8+ KB


In [None]:
df.describe()

Unnamed: 0,rooms,size,level,max_levels,price
count,7565.0,7564.0,7565.0,7565.0,7466.0
mean,2.625644,113.26899,3.699273,6.038202,71334.21
std,1.085201,1491.312092,2.237275,2.613271,640523.7
min,1.0,1.0,1.0,1.0,2.0
25%,2.0,50.0,2.0,4.0,35000.0
50%,3.0,66.0,3.0,5.0,46500.0
75%,3.0,86.0,5.0,9.0,67000.0
max,10.0,70000.0,19.0,25.0,52000000.0


# Malumotlarni modelga tayyorlash

In [None]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

X_train = train_set.drop(["price"], axis=1)
y = train_set["price"].copy()

X_num = X_train.drop(['location',"district"], axis=1)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

num_pipeline = Pipeline([
      ('imputer',SimpleImputer(strategy='median')),
      ('std_scaler',StandardScaler())
])

In [None]:
num_pipeline.fit_transform(X_num)

array([[-0.57746118, -0.03810144, -0.31266907, -0.78950544],
       [ 1.25131698,  0.06793816, -0.75864437, -0.40330378],
       [-1.49185026, -0.04831267, -0.75864437, -0.40330378],
       ...,
       [-0.57746118, -0.04045788, -1.20461968, -1.1757071 ],
       [-0.57746118, -0.03181762, -0.31266907, -0.78950544],
       [ 0.3369279 , -0.0066823 , -0.75864437,  1.14150286]])

In [None]:
median = y.median()
y = y.fillna(median)

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ["location","district"]

full_pipeline = ColumnTransformer([
    ('num',num_pipeline, num_attribs),
    ('cat',OrdinalEncoder(),cat_attribs)
])


In [None]:
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
X_prepared

array([[-5.77461178e-01, -3.81014438e-02, -3.12669070e-01,
        -7.89505443e-01,  6.39000000e+02,  5.00000000e+00],
       [ 1.25131698e+00,  6.79381642e-02, -7.58644373e-01,
        -4.03303783e-01,  2.19000000e+02,  2.00000000e+00],
       [-1.49185026e+00, -4.83126653e-02, -7.58644373e-01,
        -4.03303783e-01,  1.15800000e+03,  9.00000000e+00],
       ...,
       [-5.77461178e-01, -4.04578795e-02, -1.20461968e+00,
        -1.17570710e+00,  7.80000000e+02,  6.00000000e+00],
       [-5.77461178e-01, -3.18176152e-02, -3.12669070e-01,
        -7.89505443e-01,  9.10000000e+02,  7.00000000e+00],
       [ 3.36927898e-01, -6.68230066e-03, -7.58644373e-01,
         1.14150286e+00,  9.42000000e+02,  7.00000000e+00]])

# Model yaratish

In [None]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [None]:
LR_model.fit(X_prepared,y)

# Modelni baholash

In [None]:
X_test = test_set.drop(['price'],axis=1)
X_test

Unnamed: 0,location,district,rooms,size,level,max_levels
132,"город Ташкент, Чиланзарский район, Чиланзар 6-...",Чиланзарский,2,37.0,1,4
3771,"город Ташкент, Сергелийский район, Сергели-I Я...",Сергелийский,1,39.0,5,5
65,"город Ташкент, Учтепинский район, Чиланзар 15-...",Учтепинский,4,110.0,3,3
7525,"город Ташкент, Шайхантахурский район, Алишера ...",Шайхантахурский,4,84.0,2,4
6791,"город Ташкент, Мирабадский район, Чимкент",Мирабадский,4,100.0,3,4
...,...,...,...,...,...,...
4834,"город Ташкент, Чиланзарский район, Чиланзар-9",Чиланзарский,2,50.0,2,5
3125,"город Ташкент, Мирзо-Улугбекский район, ц-1 Бу...",Мирзо-Улугбекский,4,92.0,4,4
347,"город Ташкент, Яшнободский район, Фергана Йули",Яшнободский,2,59.0,4,4
6678,"город Ташкент, Сергелийский район, Массив серг...",Сергелийский,3,57.0,3,7


In [None]:
y_test = test_set['price'].copy()

In [None]:
X_test_prepared = full_pipeline.fit_transform(X_test)
median = y_test.median()
y_test = y_test.fillna(median)

In [None]:
X_test_prepared

array([[-5.73375710e-01, -5.91500534e-02, -1.21458600e+00,
        -7.44232141e-01,  3.04000000e+02,  6.00000000e+00],
       [-1.52564514e+00, -5.82206386e-02,  5.90146252e-01,
        -3.74680704e-01,  2.08000000e+02,  4.00000000e+00],
       [ 1.33116315e+00, -2.52264145e-02, -3.12219872e-01,
        -1.11378358e+00,  2.34000000e+02,  5.00000000e+00],
       ...,
       [-5.73375710e-01, -4.89264910e-02,  1.38963190e-01,
        -7.44232141e-01,  5.66000000e+02,  1.10000000e+01],
       [ 3.78893719e-01, -4.98559057e-02, -3.12219872e-01,
         3.64422171e-01,  2.04000000e+02,  4.00000000e+00],
       [-5.73375710e-01, -4.93911983e-02, -1.21458600e+00,
         1.10352505e+00,  3.05000000e+02,  6.00000000e+00]])

In [None]:
y_predict = LR_model.predict(X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_test,y_predict)

lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

240542.67310807278


# Saqlab qo'yish

In [None]:
import joblib

filename = 'LR_model.jbl'
joblib.dump(LR_model,filename)

['LR_model.jbl']