<a href="https://colab.research.google.com/github/MLandML/MLandML/blob/main/supervised_scikit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import tarfile
from six.moves import urllib
# Base location of the raw files in the handson-ml repository.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Local directory for the dataset, built with the OS-specific separator.
HOUSING_PATH = os.path.join("datasets", "housing")
# URLs always use forward slashes, so the remote path is spelled out instead
# of reusing HOUSING_PATH (which contains backslashes on Windows).
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive.
        housing_path: Local directory that receives the archive and its
            extracted contents; it is created when missing.

    A directory that already exists and holds at least one entry is treated
    as already fetched and is left untouched. An existing but empty directory
    (for example after an interrupted earlier attempt) triggers a download.

    Raises:
        Any exception raised by the download or the extraction is re-raised
        after the (possibly partial) archive file has been removed.
    """
    # Only skip the work when there is actually something in the directory.
    if os.path.isdir(housing_path) and os.listdir(housing_path):
        return
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    try:
        urllib.request.urlretrieve(housing_url, tgz_path)
        # NOTE(review): the archive comes from a remote host; on Python
        # versions that support it, consider extractall(..., filter="data").
        with tarfile.open(tgz_path) as housing_tgz:
            housing_tgz.extractall(path=housing_path)
    except Exception:
        # Drop a partial or unreadable archive so a later call can retry
        # instead of mistaking the directory for a finished download.
        if os.path.exists(tgz_path):
            os.remove(tgz_path)
        raise


In [None]:
import pandas as pd
import numpy as np

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The result of ``pandas.read_csv`` on that file.
    """
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [None]:
# Fetch the archive, load the CSV into `housing` and preview its first rows.
fetch_housing_data()
housing = load_housing_data()
housing.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
housing.info()

In [None]:
# How many rows fall into each value of the text column.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics of the columns.
housing.describe()

In [None]:
import matplotlib.pyplot as plt
# Histograms of the DataFrame's columns, 50 bins each, on a 20x15 figure.
housing.hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Plain random split holding out 20% of the rows; the seed is fixed at 42.
train_set,test_set=train_test_split(housing,test_size=0.2,random_state=42)

In [None]:
# Bucket median income into categories by dividing by 1.5 and rounding up.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap every category at 5.0. The result is assigned back explicitly:
# calling where(..., inplace=True) on a column selection is chained
# assignment, which warns on recent pandas and does not modify `housing`
# at all once Copy-on-Write is enabled.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One 80/20 split that preserves the income_cat proportions; seed fixed at 42.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row numbers, so rows are selected with iloc.
# (.loc would only give the same rows while the index is the default
# 0..n-1 range and would pick wrong or missing labels otherwise.)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Share of rows in each income category (count divided by the row count).
housing["income_cat"].value_counts() / len(housing)

In [None]:
# Remove the helper column from both stratified subsets, in place.
# The loop variable is named `subset` rather than `set` so the builtin `set`
# is not shadowed for the remainder of the notebook session.
for subset in (strat_train_set, strat_test_set):
    subset.drop("income_cat", axis=1, inplace=True)

In [None]:
# From here on `housing` is a copy of the stratified training set only.
housing=strat_train_set.copy()

In [None]:
# Geographic scatter plot: marker size follows population / 100 and colour
# follows median_house_value (jet colour map with a colour bar).
# Longitude goes on the x axis and latitude on the y axis so the map is
# drawn in its usual orientation, and the points carry a label so that
# plt.legend() has an artist to describe instead of producing an empty legend.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"] / 100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"),
             colorbar=True)
plt.legend()

In [None]:
# Correlate numeric columns only: `housing` still contains the text column
# "ocean_proximity", on which DataFrame.corr() raises in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
# Correlation of every numeric column with the target, strongest first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for the target and three selected columns.
attributes=["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,8))

In [None]:
# Median income against the target; low alpha so dense areas stand out.
housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)

In [None]:
# Per-household ratios derived from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_household"] = housing["total_bedrooms"] / housing["households"]
housing["population_per_household"] = housing["population"] / housing["households"]

# Correlate numeric columns only: the text column "ocean_proximity" makes
# DataFrame.corr() raise in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
# Correlation with the target, now including the new ratio columns.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Separate the target from the predictors. `housing` is rebuilt from
# strat_train_set here, so columns added to the earlier `housing` copy are
# not carried over.
housing_labels=strat_train_set["median_house_value"].copy()
housing=strat_train_set.drop("median_house_value",axis=1)



In [None]:
#Data Cleaning using Imputer
from sklearn.impute import SimpleImputer
# Replace missing values with the median of their column.
imputer = SimpleImputer(strategy="median")

# The text column is dropped first; the imputer is fitted on the rest.
housing_num = housing.drop("ocean_proximity", axis=1)

x = imputer.fit_transform(housing_num)
# Rebuild a DataFrame with the original column names AND row labels.
# Without index=..., the frame would get a fresh 0..n-1 index and no longer
# line up with `housing` / `housing_labels`, whose index is the shuffled
# index of the stratified training set.
housing_tr = pd.DataFrame(x, columns=housing_num.columns,
                          index=housing_num.index)

In [None]:
#One_hot_encoding and sparse matrix
# The triple-quoted block below is a plain string literal, so none of the
# statements inside it are executed; it is kept only as a record of the
# two-step LabelEncoder + OneHotEncoder variant.
"""from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
housing_cat=housing["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)

from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(housing_cat_encoded.reshape(-1,1))"""

In [None]:
#One_hot_encoding and sparse matrix (efficient way)
from sklearn.preprocessing import LabelBinarizer
# Binarize the text column in a single step; sparse_output=True asks for a
# sparse result.
encoder=LabelBinarizer(sparse_output=True)
housing_cat=housing["ocean_proximity"]
housing_cat_1hot=encoder.fit_transform(housing_cat)
print(housing_cat_1hot)

In [None]:
#Custom Transformations
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions read by CombinedAttributesAddr.transform.
# NOTE(review): presumably total_rooms, total_bedrooms, population and
# households in the housing column order - confirm against the CSV layout.
rooms_ix,bedrooms_ix,population_ix,households_ix=3,4,5,6

class CombinedAttributesAddr(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array.

    The columns used are located through the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix`` positions.

    Args:
        add_bedrooms_per_room: When true, a bedrooms/rooms column is added
            in front of the two per-household columns.
    """

    def __init__(self,add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self,X,y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Appended columns, in order: bedrooms per room (optional), rooms per
        household, population per household.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        # The two per-household ratios are always produced.
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            # The optional ratio goes first among the appended columns.
            extra_columns.insert(0, X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]
        
# Try the transformer on the raw values, without the bedrooms-per-room column.
attr_addr=CombinedAttributesAddr(add_bedrooms_per_room=False)
housing_extra_attr=attr_addr.transform(housing.values)


In [None]:
#Feature Scaling
#Transformational Pipelines


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain: median imputation, ratio features, scaling.
# NOTE(review): "piepline" looks like a misspelling of "pipeline"; the name
# is kept as-is so any other reference to it keeps working.
piepline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_addr', CombinedAttributesAddr(add_bedrooms_per_room=False)),
    ('scaler', StandardScaler()),
])
housing_num_tr = piepline.fit_transform(housing_num)

from sklearn.pipeline import FeatureUnion

class DataFrameSelector(BaseEstimator,TransformerMixin):
    """Select a fixed list of DataFrame columns and return their values.

    Args:
        attribute_names: Column labels to keep.
    """

    def __init__(self,attribute_names):
        self.attribute_names=attribute_names

    def fit(self,X,y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self,X):
        """Return ``X[self.attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values

class MyLabelBinarizer(TransformerMixin):
    """Thin wrapper whose ``fit``/``transform`` forward to a ``LabelBinarizer``.

    Both methods accept an extra ``y`` argument, which is ignored.
    """

    def __init__(self,*args,**kwargs):
        # Positional and keyword arguments are accepted but not used.
        self.encoder=LabelBinarizer()

    def fit(self,X,y=0):
        """Fit the wrapped encoder on ``X`` and return this wrapper."""
        self.encoder.fit(X)
        return self

    def transform(self,X,y=0):
        """Return the wrapped encoder's transform of ``X``."""
        binarized = self.encoder.transform(X)
        return binarized

# Column groups handled by the two branches.
num_attr = list(housing_num)
cat_attr = ["ocean_proximity"]

# Numeric branch: pick the columns, impute, add ratio features, scale.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attr)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_addr', CombinedAttributesAddr(add_bedrooms_per_room=False)),
    ('scaler', StandardScaler()),
])

# Categorical branch: pick the text column and binarize it.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attr)),
    ('label_binarizer', MyLabelBinarizer()),
])

# Both branches combined through a FeatureUnion.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])


In [None]:
# Fit the full preprocessing pipeline on the training predictors.
housing_prepared=full_pipeline.fit_transform(housing)
# NOTE(review): in a notebook only the last expression of a cell is echoed,
# so the bare `housing_prepared` line produces no output.
housing_prepared
housing_prepared.shape

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the prepared training data.
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

In [None]:
# Spot check on the first five training rows: transform() reuses the
# already fitted pipeline rather than fitting it again.
some_data=housing.iloc[:5]
some_labels=housing_labels.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)

print("Predictions:\t\t\t",lin_reg.predict(some_data_prepared))
print("Labels:\t\t\t",list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model on the same data it was fitted on.
housing_predictions=lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_labels,housing_predictions)
lin_rmse= np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Seeded like the data splits (random_state=42) so that re-running the
# notebook rebuilds the same tree and reports the same scores.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:

# RMSE of the tree on the same data it was fitted on.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels,housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation. The scorer returns negated MSE, hence the sign
# flip before taking the square root.
tree_score= cross_val_score(tree_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
tree_score_rmse=np.sqrt(-tree_score)
tree_score_rmse

In [None]:
def displayscore(score):
    """Print the scores, then their mean and standard deviation.

    Args:
        score: Object providing ``mean()`` and ``std()`` methods, such as a
            NumPy array of per-fold scores.
    """
    print("Scores: ", score)
    # Each statistic is computed right before its own line is printed.
    for label, statistic in (("Mean: ", score.mean),
                             ("Standard_Deviation: ", score.std)):
        print(label, statistic())

# Cross-validated RMSE summary for the decision tree.
displayscore(tree_score_rmse)

In [None]:
from sklearn.model_selection import cross_val_score
# Same 10-fold evaluation for the linear model (negated MSE -> RMSE).
lin_score= cross_val_score(lin_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
lin_score_rmse=np.sqrt(-lin_score)

displayscore(lin_score_rmse)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seeded like the data splits (random_state=42) so that re-running the
# notebook yields the same forest and the same scores.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(housing_prepared, housing_labels)

# RMSE on the data the forest was fitted on. Arguments are passed as
# (y_true, y_pred), matching the linear and tree cells.
housing_predictions = rf_reg.predict(housing_prepared)
rf_mse = mean_squared_error(housing_labels, housing_predictions)
rf_rmse = np.sqrt(rf_mse)

# 10-fold cross-validation; the scorer returns negated MSE.
rf_score = cross_val_score(rf_reg, housing_prepared, housing_labels,
                           scoring="neg_mean_squared_error", cv=10)
rf_score_rmse = np.sqrt(-rf_score)

displayscore(rf_score_rmse)

In [None]:
!pip install joblib

In [None]:
import joblib
# Persist the three fitted models to files in the working directory.
joblib.dump(lin_reg,'lin_reg.pkl')
joblib.dump(tree_reg,'tree_reg.pkl')
joblib.dump(rf_reg,'rf_reg.pkl')

In [None]:
#GridSearchCV
from sklearn.model_selection import GridSearchCV

# Two sub-grids: the first varies n_estimators and max_features, the second
# does the same over smaller ranges with bootstrap switched off.
param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

# Exhaustive search over both sub-grids, scored by negated MSE with 10 folds.
grid_search = GridSearchCV(
    rf_reg,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
)
grid_search.fit(housing_prepared, housing_labels)



In [None]:
# A notebook cell only echoes its last expression, so the best parameters
# are printed explicitly; otherwise their value would be silently discarded.
print(grid_search.best_params_)
grid_search.best_estimator_

In [None]:
# RMSE (square root of the negated mean test score) for every candidate
# the grid search evaluated, next to its parameter dict.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
#RandomSearch

from sklearn.model_selection import RandomizedSearchCV
# Evaluate 10 parameter settings sampled from the same param_grid used by the
# grid search. The sampling is seeded (random_state=42, like the data splits)
# so that re-running the cell tries the same candidates.
random_search = RandomizedSearchCV(rf_reg, param_grid,
                                   scoring='neg_mean_squared_error',
                                   cv=10, n_iter=10, random_state=42)
random_search.fit(housing_prepared, housing_labels)

In [None]:
# Estimator refitted with the best sampled parameter setting.
random_search.best_estimator_

In [None]:
# RMSE (square root of the negated mean test score) for every sampled
# candidate, next to its parameter dict.
cvrandom_res = random_search.cv_results_
for mean_score, params in zip(cvrandom_res["mean_test_score"],
                              cvrandom_res["params"]):
    print(np.sqrt(-mean_score), params)