In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


%matplotlib inline


In [None]:
def load_the_data(path):
    """Read the CSV file located at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame

### Take a quick look at the data

In [None]:
# NOTE(review): absolute, machine-specific Windows path -- the notebook only runs where this
# exact file exists; consider a path relative to a configurable data directory.
path = r'C:\Users\kk\Documents\Python Projects\California Housing Price\handson-ml\datasets\housing\housing.csv'
# Load the housing CSV into the DataFrame used by the rest of the notebook.
df = load_the_data(path)
# Preview the first rows.
df.head()

In [None]:
df.columns

In [None]:
len(df.columns)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.ocean_proximity.unique()

In [None]:
df.ocean_proximity.value_counts()

In [None]:
df.hist(bins=50, figsize = (20,15))

In [None]:
# Count the districts whose median house value lies strictly between 99,000 and 110,000.
# Explicit comparisons replace `between(..., inclusive=False)`: the boolean form of
# `inclusive` is not accepted by recent pandas releases, while this form works everywhere.
len(df[(df.median_house_value > 99000) & (df.median_house_value < 110000)])

### create a test set

#### how to share df for train and test

In [None]:

# We shuffle the row positions with a random permutation, then use test_ratio to decide how many rows go to the test set.
# The first part of the shuffled positions becomes the test set and the rest becomes the train set; both are taken from the DataFrame.
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a train set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to split; rows are selected by position.
    test_ratio : float
        Fraction of the rows that goes to the test set (the count is truncated to an int).
    seed : int, optional
        Seed for NumPy's global random generator (default 42, the value that was
        previously hard-coded).

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    # A fixed seed makes every run produce the same split; otherwise repeated runs
    # would eventually expose the whole data set through the test set.
    np.random.seed(seed)
    # Operate on the `data` argument (the previous version read the global `df`).
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
train_df, test_df = split_train_test(df,0.2)

#### random share with sklearn

In [None]:
#with sklearn

train_set,test_set =train_test_split(df,test_size=0.2,random_state=42)

# Here the train and test sets are generated purely at random, so we need to check that there is no sampling bias. We should use stratified sampling so that the sets correctly represent, for example, the whole population.


In [None]:
df.median_income.hist()

In [None]:
np.ceil((df.median_income/1.5)).hist()

In [None]:
# Bucket median income into categories of width 1.5 (rounded up); every category
# that is not below 5 is merged into category 5.
df['median_income_cat'] = np.ceil(df.median_income / 1.5).pipe(
    lambda income_cat: income_cat.where(income_cat < 5, 5)
)

#### stratified shuffle split

In [None]:
# train & test sets that will be used in the rest of the project


# One stratified 80/20 split, stratified on the income category so both sets
# keep the same category proportions as the full data set.
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
# split() yields positional row indices, so rows are selected with iloc
# (loc only gave the same rows because df has a default 0..n-1 index).
for train_index, test_index in split.split(df,df.median_income_cat):
    strat_train_set = df.iloc[train_index]
    strat_test_set = df.iloc[test_index]

In [None]:
strat_test_set.median_income_cat.value_counts()/len(strat_test_set.median_income_cat)

In [None]:

# Purely random split, repeated here so that the sets contain the income-category column.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
# Category proportions in the random test set, divided by the size of that same set
# (the previous version divided by the length of the stratified test set).
test_set.median_income_cat.value_counts() / len(test_set)

### check which test set is better

In [None]:
df.median_income_cat.value_counts()/len(df)

In [None]:
# Remove the helper category column again so the data is back to its original attributes.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError
# because the column is already gone.
for set_ in (df, strat_test_set, strat_train_set):
    set_.drop('median_income_cat', axis=1, inplace=True, errors='ignore')

In [None]:
strat_train_set.shape

### Discover and Visualize

In [None]:
# Work on a copy so that exploration never touches the stratified training set itself.
housing = strat_train_set.copy()
housing.columns

# Geographic scatter plot: marker size (s) shows the population,
# marker colour (c) shows the median house value.
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.1,
    s=housing['population'] / 100,
    label='population',
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    figsize=(10, 7),
)

In [None]:
corr_matrix = housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
att = ['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(strat_train_set[att])

In [None]:
housing.plot(kind='scatter',x='median_house_value',y='median_income')

In [None]:
housing['rooms_per_household'] = housing['total_rooms']/housing['households']
housing['population_per_household'] = housing['population']/housing['households']
housing['bedroom_per_room'] = housing['total_bedrooms']/housing['total_rooms']

In [None]:
housing.corr()

### train set and labels

In [None]:
housing = strat_train_set.drop('median_house_value',axis=1)
housing_label = strat_train_set['median_house_value'].copy()

- load the data
- explore the data (.info(), .describe(), .shape, data types, value_counts(), correlations between attributes)
- create the test and train sets -> if a representative sample is important, use a stratified shuffle split
- visualize the train set; the important parts here are the correlations with the label and the plots

### NaN and missing values

In [None]:
housing.info()
# Here we see that total_bedrooms has missing values. We have these options:
# - get rid of the corresponding districts  # dropna()
# - get rid of the whole attribute  # drop()
# - set the missing values to some value  # fillna()
# - Imputer() - works only on numeric data.


In [None]:
# IMPORTANT: the same median value must be used to replace the NaN values in the test set.
# I will use option 3, fillna().
# Median of the training data; kept in a variable so it can be reused for the test set.
total_bedrooms_median = housing['total_bedrooms'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form acts on a temporary object and leaves `housing`
# unchanged when pandas Copy-on-Write is active.
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(total_bedrooms_median)

In [None]:
housing.info()

In [None]:

# Imputer configured to replace missing values with the median.
# NOTE(review): `sklearn.preprocessing.Imputer` appears to be unavailable in recent
# scikit-learn releases (superseded by `sklearn.impute.SimpleImputer`) -- confirm the pinned version.
imputer = Imputer(strategy='median')

# Drop the text column so that only the remaining attributes are passed to the imputer.
housing_num = housing.drop('ocean_proximity',axis=1)
# fit() learns the per-column statistics; transform() returns the filled data as X.
imputer.fit(housing_num)
X = imputer.transform(housing_num)

In [None]:
imputer.statistics_

In [None]:
# Rebuild a DataFrame from the imputed array, restoring the column names and the
# original row labels (without `index=` the rows would be renumbered from 0 and no
# longer line up with `housing` / `housing_label`).
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
housing_tr.info()

### How Imputer works
- we create an Imputer instance and specify that each attribute's missing values should be replaced with the median
- we fit the imputer to our numerical attributes
- the results are stored in imputer.statistics_
- we transform our df, filling the NaN values with the medians
- after that we can create a new df in which the NaN values are replaced by the medians

### Scikit-Learn Design
#### Consistency
- Estimators - estimate values from a df, e.g. Imputer with its fit() method. We can set hyperparameters (like strategy -> 'median')
- Transformers - some estimators (e.g. Imputer) can also transform a df. The transformation is performed by the transform() method, which takes the df to transform as a parameter. The transformation relies on the learned parameters, as is the case for an imputer. There is also a fit_transform() method, which is equivalent to calling fit() and then transform()
- Predictors - some estimators can make predictions for a given df; they are called predictors. For example, the LinearRegression model is a predictor. The predict() method takes a df of new instances and returns a dataset of corresponding predictions. A predictor also has a score() method to measure the quality of the predictions
#### Inspection - all hyperparameters are accessible directly via public instance variables (imputer.strategy), and all learned parameters are accessible via public instance variables with an underscore suffix (imputer.statistics_)
#### Nonproliferation of classes - datasets are represented as NumPy arrays or SciPy sparse matrices


### Handling text and categorical attributes

In [None]:
# Step 1 - factorize(): map each ocean_proximity category to an integer code.
housing_cat = housing['ocean_proximity']
# factorize() returns the integer codes and the list of distinct categories.
housing_cat_encoded, housing_categories = housing_cat.factorize()

# Step 2 - OneHotEncoder(): turn the integer codes into one-hot vectors.

encoder = OneHotEncoder()
# reshape(-1, 1) turns the 1-D code array into a single-column 2-D array before encoding.
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

#CategoricalEncoder - does both transformations (cat -> int and int -> one-hot) in one shot
#only available in 0.20.dev

#from sklearn.preprocessing import CategoricalEncoder
#cat_encoder = CategoricalEncoder()
#housing_cat_reshape = housing_cat.values.reshape(-1,1)
#hosuing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshape)
#hosuing_cat_1hot

#### Transforming categorical values
- factorize() - a useful function to transform categories to integers. However, a model may then assume that categories 0 and 4 are less similar than 0 and 1, which is a mistake; that is why we use binary (one-hot) encoding
- OneHotEncoder() - for binary (one-hot) encoding
- CategoricalEncoder() - can transform categories to integers and integers to one-hot vectors in one shot. Only in 0.20.dev
- LabelEncoder()

### Custom Transformers
- although sklearn provides many useful transformers, we sometimes need to write our own for tasks such as custom cleanup or combining attributes
##### New transformers need to:
- work seamlessly with scikit-learn functionality (such as pipelines)
All you need is to create a class and implement three methods: fit(), transform(), fit_transform()

#### Function

In [None]:
def volume(r):
    """Compute the volume of a sphere from its radius ``r``."""
    radius_cubed = r ** 3
    return (4 / 3) * np.pi * radius_cubed
#help(volume)

def triangle_area(b, h):
    """Compute the area of a triangle with base ``b`` and height ``h``."""
    half_base = 0.5 * b
    return half_base * h


# Example: a triangle with base 3 and height 6.
d = triangle_area(3, 6)
d

In [None]:
# key word arguments help you to write clean code
# 1 inch = 2.54 cm
# 1 foot = 12 inches
def cm(feet=0, inches=0):  # both arguments have default values, so either can be omitted
    """Convert a length given in feet and inches to centimetres.

    1 inch = 2.54 cm and 1 foot = 12 inches.
    """
    cm_per_inch = 2.54
    inches_per_foot = 12
    from_inches = inches * cm_per_inch
    from_feet = feet * inches_per_foot * cm_per_inch
    return from_inches + from_feet

cm(feet=5)
cm(inches=70)
cm(5,8)
    
# If a call mixes keyword arguments (e.g. feet=5) and required arguments (e.g. r, passed without the = sign), the required arguments must be specified first.
# Required arguments are identified by their position.

#### Class

In [None]:
# Column positions used by CombinedAttributesAdder.transform on the input array.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio attributes to a 2-D array.

    ``rooms_per_household`` is always appended; ``bedrooms_per_room`` is appended
    as well when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the extra ratio column(s) appended on the right."""
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / X[:, household_ix]
        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household]
        bedrooms_per_room = X[:, bedrooms_ix] / rooms
        return np.c_[X, rooms_per_household, bedrooms_per_room]
            
        

In [None]:
r1 = CombinedAttributesAdder()
r1

In [None]:
class Circle:
    """A circle described by its radius."""

    def __init__(self, radius):
        self.radius = radius

    def circle_area(self):
        """Return the area of the circle (pi * r squared)."""
        r = self.radius
        return np.pi * r ** 2

    def perimeter(self):
        """Return the circumference of the circle (pi * r * 2)."""
        r = self.radius
        return np.pi * r * 2


In [None]:
circle1 = Circle(4)
circle1.circle_area()
circle1.perimeter()

In [None]:
l =[Circle(i) for i in range(20)]

- NaN
- ocean_proximity (object) -> one-hot
- additional attributes

In [None]:
class RemoveNaN(TransformerMixin):
    """Fill missing values in the float64 columns of a DataFrame.

    Parameters
    ----------
    filling_strategy : str, default 'median'
        'median' fills with each column's median; any other value fills with
        each column's mean (same dispatch as before).

    Notes
    -----
    The fill values are learned in ``fit`` and stored in ``statistics_`` so that
    the values computed on the training data are reused when transforming other
    data (e.g. the test set). The previous version called ``fillna`` without
    keeping its result, so nothing was ever filled.
    """

    def __init__(self, filling_strategy='median'):
        self.filling_strategy = filling_strategy

    def _column_statistics(self, X):
        """Return the per-column fill value for every float64 column of X."""
        float_columns = X.select_dtypes('float64')
        if self.filling_strategy == 'median':
            return float_columns.median()
        return float_columns.mean()

    def fit(self, X, y=None):
        """Learn the fill values from X; ``y`` is accepted for pipeline use and ignored."""
        self.statistics_ = self._column_statistics(X)
        return self

    def transform(self, X):
        """Return a copy of X with NaNs in float64 columns replaced; X is not modified."""
        statistics = getattr(self, 'statistics_', None)
        if statistics is None:
            # transform() used to work without a prior fit(); keep that working by
            # falling back to statistics computed from X itself.
            statistics = self._column_statistics(X)
        # fillna with a Series fills each column with the value stored under its label.
        return X.fillna(statistics)
            

        
        
        

In [None]:
d1 = RemoveNaN()
d1 = d1.fit_transform(housing)

In [None]:

class ChangeObjectToInt:
    """Replace every object-typed column of a DataFrame with dummy (indicator) columns."""

    def __init__(self):
        pass

    def fit(self, X):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return X with each object column swapped for its ``<column>_<value>`` dummies."""
        encoded = X
        for name in X.select_dtypes('object'):
            dummies = pd.get_dummies(encoded[name], prefix=name)
            # Append the indicator columns, then remove the original text column.
            encoded = encoded.join(dummies).drop(name, axis=1)
        return encoded

In [None]:
d = ChangeObjectToInt()

In [None]:
d.transform(housing)

### Feature scaling
- normalization - we scale the data to the range 0-1: subtract the min value and divide by (max - min)
- standardization - we subtract the mean and divide by the standard deviation. Standardization is less affected by outliers

### Transformation Pipelines
- a pipeline helps to execute a sequence of transformations


In [None]:
# Numerical pipeline: impute missing values, add the combined attributes, then standardize.
# Each step is a (name, estimator instance) tuple; the steps must be separated by commas
# and the custom transformer must be instantiated (the previous version was missing the
# comma after the 'attribs_adder' step and passed the class itself).
num_pipeline = Pipeline([('imputer', Imputer(strategy='median')),
                         ('attribs_adder', CombinedAttributesAdder()),
                         ('std_scaler', StandardScaler())])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a list of columns from a DataFrame and return them as an array.

    Parameters
    ----------
    attributes_name : list
        Column labels to keep.
    """

    def __init__(self, attributes_name):
        self.attributes_name = attributes_name

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself.

        ``y`` is accepted (and ignored) so the selector also works when the
        enclosing pipeline is fitted with labels, matching
        ``CombinedAttributesAdder.fit``.
        """
        return self

    def transform(self, X):
        """Return the selected columns of X as the array given by ``.values``."""
        return X[self.attributes_name].values

In [None]:

# Column names of the numerical frame and of the single categorical attribute.
num_attr = list(housing_num)
cat_attr = ['ocean_proximity']
# Numerical branch: select the numerical columns, impute with the median,
# add the combined attributes and standardize.
num_pipeline = Pipeline([('selector',DataFrameSelector(num_attr)),
                         ('imputer',Imputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder()),
                         ('std_scaler',StandardScaler())])

# Categorical branch: select the categorical column and one-hot encode it.
cat_pipeline = Pipeline([('selector',DataFrameSelector(cat_attr)),
                         ('onehot',OneHotEncoder())])

# FeatureUnion joins the two pipelines into a single transformer.
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[('num_pipeline',num_pipeline),('cat_pipeline',cat_pipeline)])
# Fit both branches on the training attributes and keep the prepared data for modelling.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
full_pipeline.fit_transform(housing).shape
full_pipeline.transform(housing).shape

### Select and Train Model

In [None]:
# Candidate regressors. The tree-based models are seeded so that a
# Restart & Run All reproduces the same scores.
lin_reg = LinearRegression()
decision_tree = DecisionTreeRegressor(random_state=42)
random_forest = RandomForestRegressor(random_state=42)
svm = SVR()

model_list = [lin_reg, decision_tree, random_forest, svm]
rmse_results = []

# Exhaustive grids for GridSearchCV: one dict per entry of model_list, in the same order.
param_grid = [{'normalize': [True, False]},
              {'min_samples_split': [4, 12], 'max_features': [2, 4, 6, 10]},
              {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'C': [1, 10, 100, 1000], 'kernel': ['linear', 'rbf']}]

# Distributions / lists sampled by RandomizedSearchCV, in the same order as model_list.
param = [{'normalize': [True, False]},
         {'min_samples_split': randint(4, 12), 'max_features': randint(2, 15)},
         {'n_estimators': randint(10, 30), 'max_features': randint(2, 15)},
         {'C': randint(500, 2000), 'kernel': ['linear', 'rbf'], 'gamma': [0.0001, 0.0005, 0.001]}]

for c, model in enumerate(model_list):
    # RMSE on the training set (fit and predict on the same rows)
    model.fit(housing_prepared, housing_label)
    housing_prediction = model.predict(housing_prepared)
    rmse_training = np.sqrt(mean_squared_error(housing_label, housing_prediction))
    # RMSE from 10-fold cross-validation; scores are negative MSE, hence the sign flip
    scores = cross_val_score(model, housing_prepared, housing_label,
                             scoring='neg_mean_squared_error', cv=10)
    rmse_validation = np.sqrt(-scores)
    # grid search: best (lowest) mean RMSE over the grid
    grid_search = GridSearchCV(model, param_grid[c], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(housing_prepared, housing_label)
    grid_results = grid_search.cv_results_
    rmse_grid = min(np.sqrt(-grid_results['mean_test_score']))
    # randomized search: seeded so the sampled candidates are the same on every run
    random_search = RandomizedSearchCV(model, param[c], cv=30,
                                       scoring='neg_mean_squared_error', random_state=42)
    random_search.fit(housing_prepared, housing_label)
    random_results = random_search.cv_results_
    rmse_random = min(np.sqrt(-random_results['mean_test_score']))
    # One summary row per model; the key is 'val_rmse' (the stray trailing colon was a typo).
    rmse_results.append({'model_name': type(model).__name__,
                         'val_rmse': np.mean(rmse_validation),
                         'train_rmse': rmse_training,
                         'rmse_grid': rmse_grid,
                         'rmse_randomized': rmse_random})
pd.DataFrame(rmse_results)
    

    

In [None]:
rmse = pd.DataFrame(rmse_results)
rmse.to_csv('rmse.csv')

## Exercises

### SVM


In [None]:
# Search space for the SVR: a linear kernel over C, and an RBF kernel over C x gamma.
params = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
grid_svm = GridSearchCV(
    svm,
    params,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_svm.fit(housing_prepared, housing_label)
grid_results = grid_svm.cv_results_
# Scores are negative MSE, so negate before the square root; keep the lowest RMSE.
rmse_svr = min(np.sqrt(-grid_results['mean_test_score']))

In [None]:
# Print the cross-validated RMSE next to the hyper-parameters that produced it.
# The loop names are chosen so that the module-level `param` search space is not
# overwritten by the loop variable.
for candidate_rmse, candidate_params in zip(np.sqrt(-grid_results['mean_test_score']), grid_results['params']):
    print(candidate_rmse, candidate_params)

In [None]:
pd.DataFrame(rmse_results)

## GridSearchCV for transformer -> CombinedAttributesAdder

In [None]:
# create pipeline with a scaler 
combined = CombinedAttributesAdder()
steps = [('combined_attributes',combined),('random_forest',random_forest)]
pipeline = Pipeline(steps)
#parameters
param =  {'combined_attributes__add_bedrooms_per_room': [True,False]}
#do search
grid_search_extra_param = GridSearchCV(pipeline,param,cv=5,scoring='neg_mean_squared_error')
grid_search_extra_param.fit(housing_prepared.toarray(), housing_label)
grid_results = grid_search_extra_param.cv_results_
grid_results['mean_test_score']
rmse_grid_extra_param = min(np.sqrt(-grid_results['mean_test_score']))
rmse_grid_extra_param



