In [None]:
print("Hello world")

In [None]:
import os
import tarfile
from six.moves import urllib

In [None]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [None]:
fetch_housing_data()

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path=os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [None]:
housing= load_housing_data()

In [None]:
housing.head()

In [None]:
housing.info()

In [None]:
housing.describe()

In [None]:
housing['ocean_proximity'].value_counts()

In [None]:
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
print(len(train_set), len(test_set))

In [None]:
import numpy as np

In [None]:
housing['income_cat']=np.ceil(housing['median_income']/1.5)

In [None]:
housing['income_cat'].where(housing['income_cat']<5,5.0,inplace=True)

In [None]:
housing['income_cat'].value_counts()

In [None]:
housing.drop(['income_cat'])

In [None]:
housing.head()

In [None]:
for set in (train_set,test_set):
    set.drop(["income_cat"], axis=1, inplace=True)

In [None]:
housing.head()

In [None]:
housing.drop(['income_cat'],axis=1)

In [None]:
housing.head()

In [None]:
housing=housing.drop(['income_cat'],axis=1)

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1)

In [None]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
corr_matrix=housing.corr()

In [None]:
corr_matrix.sort_values(ascending=False)

In [None]:
corr_matrix.sort_values(by,ascending=False)

In [None]:
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
corr_matrix.value_counts()

In [None]:
housing=train_set.drop(['median_house_value'],axis=1)
housing_labels=train_set['median_house_value'].copy()

In [None]:
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median) 

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
housing_cat=housing['ocean_proximity']
housing_cat_encoded=encoder.fit_transform(housing_cat)


In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

In [None]:
print(housing_cat_1hot.toarray())

In [72]:
housing['ocean_proximity1hot']=housing_cat_1hot.toarray()

In [78]:
housing=housing.join(pd.DataFrame(housing_cat_1hot.toarray()))

In [80]:
housing.fillna(-99999,inplace=True)

In [82]:
housing=housing.drop(['ocean_proximity'],axis=1)

In [89]:
housing.columns

Index([         'longitude',           'latitude', 'housing_median_age',
              'total_rooms',     'total_bedrooms',         'population',
               'households',      'median_income',                    0,
                          1,                    2,                    3,
                          4, 'median_house_value'],
      dtype='object')

In [90]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
X=housing.drop(['median_house_value'],axis=1)
y=housing['median_house_value']

In [91]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

In [92]:
model.fit(X_train, y_train)



In [93]:
confidence=model.score(X_test, y_test)
print(confidence)

0.6243361377022907




In [94]:
import pickle

In [96]:
with open('california_housing_linearRG.pickle','wb')as files:
    pickle.dump(model,files)

In [97]:
from sklearn.tree import DecisionTreeRegressor
model=DecisionTreeRegressor()
model.fit(X_train, y_train)



In [99]:
confidence=model.score(X_test, y_test)
print(confidence)

0.6206014497046393




In [104]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()
model.fit(X_train, y_train)



In [105]:
confidence=model.score(X_test, y_test)
print(confidence)



0.8054918100177271


In [106]:
with open('california_housing_RFR.pickle','wb') as files:
    pickle.dump(model, files)

In [103]:
from sklearn.svm import SVR
knels=['linear','poly','rbf','sigmoid','precomputed']
for i in knels:
    model=SVR(kernel=i)
    model.fit(X_train, y_train)
    confidence=model.score(X_test,y_test)
    print(i, confidence)



linear 0.31923292377738754




poly -0.0383292823959962




rbf -0.04658835752956292




sigmoid -0.046864006348099396




ValueError: Precomputed matrix must be a square matrix. Input is a 13209x13 matrix.

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('label_binarizer', LabelBinarizer()),
])
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num_pipeline = Pipeline([
('imputer', imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)