![Imgur](https://i.imgur.com/5pXzCIu.png)

# Data Science and Artificial Intelligence Practicum

## Module 5. Machine Learning

### Machine Learning / X_prepared data
<img src="https://www.kdnuggets.com/wp-content/uploads/deploy-machine-learning-models-to-web.jpg"
alt="standardization" width="1800" height="400"/>



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn # scikit-learn library

In [None]:
from sklearn.model_selection import train_test_split

# Online address of our data set.
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Training features (everything except the target) and training labels.
X_train = train_set.drop(columns=["median_house_value"])
y = train_set["median_house_value"].copy()

# Numeric features only: 'ocean_proximity' is categorical and is handled separately.
X_num = X_train.drop(columns=["ocean_proximity"])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indexes (within the numeric feature matrix) of the columns we need.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio columns to a 2-D numeric array.

    Two columns are always appended, in this order: rooms per household and
    population per household. When ``add_bedrooms_per_room`` is true, a third
    column (bedrooms per room) is appended after them.

    The source columns are located through the module-level positional indexes
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``, so
    the input must keep the column order those indexes assume.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged: this transformer learns nothing from the data."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric data: a NumPy array or anything ``np.asarray`` can convert,
            such as an all-numeric DataFrame.

        Returns
        -------
        numpy.ndarray
            The input columns followed by two or three ratio columns.
        """
        # Positional slicing such as X[:, 3] fails on a DataFrame, so normalise
        # the input to an ndarray first (a no-op for arrays).
        X = np.asarray(X)
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households
        if self.add_bedrooms_per_room:  # the bedrooms-per-room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / rooms
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing applied to the numeric columns, in this order:
#   1. imputer       - replace NaN values with the column median
#   2. attribs_adder - append the combined ratio columns
#   3. std_scaler    - standardize the resulting columns
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ("std_scaler", StandardScaler()),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessing.
num_attribs = X_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline; the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [None]:
# Fit the preprocessing on the training features and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
X_prepared[0:5, :]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

# Linear Regression 

In [None]:
from sklearn.linear_model import LinearRegression
# Create an untrained linear regression model with default settings.
LR_model = LinearRegression()

In [None]:
# Train the model on the prepared training features and their labels.
LR_model.fit(X_prepared, y)

LinearRegression()

# End of ML model training ^

# Let's test our ML model !

In [None]:
# Take 10 rows from the training set for a quick sanity check of the model.
# A fixed random_state makes the sample, and every output derived from it, reproducible.
# Note: the model was trained on these rows, so this is not an unbiased test.
test_data = X_train.sample(10, random_state=42)

In [None]:
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
9944,-122.26,38.16,23.0,2840.0,491.0,1586.0,466.0,4.0337,NEAR BAY
19008,-121.99,38.35,45.0,1778.0,339.0,839.0,319.0,2.4659,INLAND
6672,-118.11,34.15,40.0,1950.0,509.0,1038.0,438.0,2.6172,<1H OCEAN
16743,-122.45,37.71,50.0,1441.0,283.0,1159.0,286.0,4.5417,NEAR OCEAN
2985,-119.01,35.33,42.0,1120.0,255.0,677.0,213.0,1.5429,INLAND
7634,-118.29,33.84,23.0,3626.0,799.0,2321.0,731.0,4.7393,<1H OCEAN
17749,-121.84,37.34,33.0,1019.0,191.0,938.0,215.0,4.0929,<1H OCEAN
11625,-118.06,33.82,25.0,2637.0,462.0,965.0,415.0,4.5833,<1H OCEAN
19867,-119.34,36.34,5.0,4505.0,834.0,1917.0,775.0,4.0144,INLAND
8316,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,ISLAND


### Selecting the true labels of our test data by index

In [None]:
# Look up the true house values of the sampled rows by their index labels.
test_label = y.loc[test_data.index]
test_label

9944     130400.0
19008    102900.0
6672     196100.0
16743    233700.0
2985      39400.0
7634     237900.0
17749    165000.0
11625    190900.0
19867    126600.0
8316     300000.0
Name: median_house_value, dtype: float64

### Our test data has to go through full pipeline 

In [None]:
# Apply the already-fitted preprocessing (transform only, no re-fitting), then predict.
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepared)

In [None]:
predicted_labels

array([201891.8147681 , 132992.0687647 , 194913.85639387, 274316.55392501,
        90130.6473937 , 253849.74194825, 222020.448086  , 249980.1059327 ,
       148301.18279409, 399449.39338006])

# NOW WE CAN COMPARE OUR PREDICTED PRICES AND THE REAL PRICES BY DISPLAYING THEM SIDE BY SIDE

In [None]:
# Side-by-side table of predicted and true prices; rows keep test_label's index.
pd.DataFrame(
    {
        "ML PREDICTION": predicted_labels,
        "REAL PRICE": test_label,
    }
)

Unnamed: 0,ML PREDICTION,REAL PRICE
9944,201891.814768,130400.0
19008,132992.068765,102900.0
6672,194913.856394,196100.0
16743,274316.553925,233700.0
2985,90130.647394,39400.0
7634,253849.741948,237900.0
17749,222020.448086,165000.0
11625,249980.105933,190900.0
19867,148301.182794,126600.0
8316,399449.39338,300000.0


# MODEL EVALUATION

### From the test set I will drop the median house value

In [None]:
# Test-set features: every column except the target.
X_test = test_set.drop(columns=["median_house_value"])

In [None]:
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


### From the test set I will separate the median house value

In [None]:
# True house values of the test set, taken as an independent copy.
y_test = test_set["median_house_value"].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

### X_test set has to go through full pipeline 

In [None]:
# Transform only: the pipeline was already fitted on the training set.
# NOTE(review): despite its name, X_test_predicted holds the *prepared* test
# features, not predictions; it is what gets passed to LR_model.predict.
X_test_predicted = full_pipeline.transform(X_test)

### y_predicted is our new predicted median house values 

In [None]:
# Predict house values for the prepared test features.
y_predicted = LR_model.predict(X_test_predicted)
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

# Now we have to evaluate our predictions, in this case with the mean absolute error

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute difference between the true and the predicted test-set values.
mae = mean_absolute_error(y_test, y_predicted)

In [None]:
mae

50898.7395349408

# AS WE CAN SEE FROM THE MEAN_ABSOLUTE_ERROR RESULT, OUR ML MODEL'S PREDICTIONS ARE OFF BY ABOUT $51,000 ON AVERAGE.

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared difference between the true and the predicted test-set values.
mse = mean_squared_error(y_test, y_predicted)

In [None]:
# Taking the square root of the MSE gives the RMSE, in the same units as the target.
print("RMSE=", np.sqrt(mse))

RMSE= 72701.32600762138


# AS WE CAN SEE FROM THE ROOT MEAN SQUARED ERROR (RMSE), OUR ML MODEL'S TYPICAL PREDICTION ERROR IS ABOUT $72,700. RMSE IS LARGER THAN MAE BECAUSE IT PENALIZES LARGE ERRORS MORE HEAVILY.

# SKLEARN HAS MULTIPLE ALGORITHMS LIKE LINEAR REGRESSION, RANDOM FOREST AND SO ON... IF THE ACCURACY OF OUR ML MODEL IS NOT SATISFACTORY, WE CAN TRY ANOTHER ALGORITHM.

# Cross-validation

In [None]:
# Use the whole data set (training and test rows together) for cross-validation.
# NOTE(review): this rebinds `y` and `X_prepared`, which earlier cells built from
# the training set only; re-running an earlier cell such as
# `LR_model.fit(X_prepared, y)` afterwards would silently use the full data set.
# NOTE(review): the test rows are included here, so the scores below are not
# computed on data held out from everything that follows.
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"].copy()

# transform (not fit_transform): the preprocessing statistics still come from X_train.
X_prepared = full_pipeline.transform(X)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model; the scorer returns negated MSE values.
mse_scores = cross_val_score(
    LR_model,
    X_prepared,
    y,
    scoring="neg_mean_squared_error",
    cv=5,
)

In [None]:
def display_scores(scores):
    """Print the raw scores, then their mean and their standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods, as a NumPy
    array does.
    """
    # Each value is computed right before its own line is printed.
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Std.dev:", lambda s: s.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

In [None]:
# The scores are negated MSE values: flip the sign, then take the root to get RMSE per fold.
display_scores(np.sqrt(-mse_scores))

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295093
Std.dev: 3694.7136787223626


In [None]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is built with randomness; fixing random_state makes the fitted
# model, and every score computed from it, reproducible from run to run.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

RandomForestRegressor()

In [None]:
# 10-fold cross-validation of the random forest; the scorer returns negated MSE values.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
# NOTE(review): the `LR_` prefix is a leftover; these are the random forest's RMSE values.
LR_rmse_scores = np.sqrt(-scores)

In [None]:
display_scores(LR_rmse_scores)

Scores: [97265.42402384 47063.5470799  65221.20990222 56521.41849797
 61440.1813655  59994.59517895 46949.11648623 78387.88733136
 74208.77845864 49387.42758346]
Mean: 63643.95859080739
Std.dev: 15125.048935411569


# Saving the model to a file with pickle

In [None]:
import pickle

filename = 'RF_model.pkl' # any file name can be used here
# Serialize the fitted random forest to disk; pickle needs the file opened in binary mode.
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

In [None]:
# Read the pickled model back from disk.
# Security: unpickling can execute arbitrary code, so only load files you trust.
with open(filename, 'rb') as file:
    model = pickle.load(file)

# Testing our model

In [None]:
# cross_val_score clones the estimator and re-fits it on every fold, so the CV
# scores below do not depend on the state restored from disk. To verify the
# save/load round trip itself, compare the loaded model's predictions with the
# predictions of the original in-memory model.
print("Loaded model reproduces the original predictions:",
      np.allclose(model.predict(X_prepared), RF_model.predict(X_prepared)))

scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
# Random forest RMSE per fold (the `LR_` name is kept from the earlier cells).
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [77396.86452823 64344.10345222 61099.20636724 82712.6078699
 62102.15434144]
Mean: 69530.98731180481
Std.dev: 8818.315012439547


# Saving with joblib

In [None]:
import joblib

filename = 'RF_model.jbl' # any file name can be used here
# joblib.dump takes the target path directly and returns the list of files written.
joblib.dump(RF_model, filename)

['RF_model.jbl']

# Reading our model

In [None]:
# Security: joblib.load relies on pickle, so only load files you trust.
model = joblib.load(filename)

# Testing our model

In [None]:
# cross_val_score clones the estimator and re-fits it on every fold, so the CV
# scores below do not depend on the state restored from disk. To verify the
# save/load round trip itself, compare the loaded model's predictions with the
# predictions of the original in-memory model.
print("Loaded model reproduces the original predictions:",
      np.allclose(model.predict(X_prepared), RF_model.predict(X_prepared)))

scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
# Random forest RMSE per fold (the `LR_` name is kept from the earlier cells).
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [76751.81301967 64129.88771023 61166.92746367 82151.04980834
 62501.62578379]
Mean: 69340.26075713856
Std.dev: 8482.519858717022


# Saving Pipeline

In [None]:
# Save the fitted preprocessing pipeline so new data can be prepared the same way later.
# NOTE(review): the pipeline contains the custom CombinedAttributesAdder class, so that
# class presumably has to be defined or importable wherever this file is loaded - confirm.
filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)

['pipeline.jbl']