In [192]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
import numpy as np

# Load the raw housing table; the first CSV row holds the column names.
# NOTE(review): `housnig` is a typo for `housing`; the name is kept as-is so
# that any other cell referring to it keeps working.
housnig = pd.read_csv('./housing.csv', sep=',', header=0)

# Hold out 20% of the rows as a test set. The fixed seed makes the split
# reproducible across kernel restarts.
train_set, test_set = train_test_split(
    housnig,
    test_size=0.2,
    random_state=42,
)
train_set.head(n=5)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND


In [193]:

# Explore on a deep copy so that experiments never modify the training split.
data = train_set.copy(deep=True)
data.head(n=5)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND


In [194]:
# Standard (Pearson) correlation coefficient, in [-1, 1], of every numeric
# column against the target `median_house_value`.
#
# `data` still contains the text column `ocean_proximity`. Older pandas dropped
# non-numeric columns silently inside DataFrame.corr(); pandas >= 2.0 raises
# ValueError instead. Selecting the numeric columns explicitly gives the same
# result on every pandas version.
corr_matrix = data.select_dtypes(include='number').corr()
corr_matrix['median_house_value'].sort_values(ascending=False)


median_house_value    1.000000
median_income         0.690647
total_rooms           0.133989
housing_median_age    0.103706
households            0.063714
total_bedrooms        0.048029
population           -0.026032
longitude            -0.046349
latitude             -0.142983
Name: median_house_value, dtype: float64

In [195]:
# Preview the first five rows of the exploration copy.
data.head(n=5)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND


In [196]:
# Derived ratio features: raw totals depend on district size, ratios do not.
data['total_rooms_per_households'] = data['total_rooms'] / data['households']
data['total_bedrooms_per_total_rooms'] = data['total_bedrooms'] / data['total_rooms']
data['population_per_total_rooms'] = data['population'] / data['total_rooms']

# Standard (Pearson) correlation coefficient, in [-1, 1], against the target.
# Restrict to numeric columns explicitly: `ocean_proximity` is text, and
# pandas >= 2.0 raises ValueError in DataFrame.corr() instead of silently
# dropping non-numeric columns.
corr_matrix = data.select_dtypes(include='number').corr()
corr_matrix['median_house_value'].sort_values(ascending=False)


# seaborn , matplotlib  => readMore


median_house_value                1.000000
median_income                     0.690647
total_rooms_per_households        0.158485
total_rooms                       0.133989
housing_median_age                0.103706
households                        0.063714
total_bedrooms                    0.048029
population                       -0.026032
population_per_total_rooms       -0.031160
longitude                        -0.046349
latitude                         -0.142983
total_bedrooms_per_total_rooms   -0.257183
Name: median_house_value, dtype: float64

In [197]:
# ============== Prepare the Data ====================
# ======== Numeric data         --> missing values
# ======== Categorical / text   --> LabelEncoder, OneHotEncoder
# ======== Numeric data         --> feature scaling
# ======== Numeric data         --> custom transformers

# `median_house_value` is what the model will be trained to predict, so it is
# split off as the label and removed from the feature frame.
df_label = train_set['median_house_value'].copy()
df = train_set.drop(columns='median_house_value')
df.info()

# `ocean_proximity` is a text (object) column, so it is left out of the
# numeric-only frame used for imputation and scaling.
df_num = df.drop(columns='ocean_proximity')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 14196 to 15795
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16508 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [198]:
# ============= Missing data: 1. drop rows, 2. drop the column, 3. replace values
# (Reference only: every statement in this cell is commented out.)
# df_num = df_num.dropna(subset=['total_bedrooms'])  # option 1: drop rows with a missing value
# df_num = df_num.drop('total_bedrooms', axis=1)     # option 2: drop the whole column

# Option 3 - fill the gaps with the column median (note: "time: 20:17")
# median = df_num['total_bedrooms'].median()
# df_num['total_bedrooms'].fillna(median)
# NOTE: fillna() returns a new Series; as written the result is discarded, so
# it would have to be assigned back to df_num['total_bedrooms'] to take effect.

# Same idea with scikit-learn's SimpleImputer
# imputer = SimpleImputer(missing_values=np.nan, strategy="median")

# fit() learns the per-column statistic (here the median) from the data
# imputer.fit(df_num)

# transform() fills the missing values using the learned statistics.
# X is a NumPy array, so it is wrapped back into a pandas DataFrame.
# X = imputer.transform(df_num)
# df_num_impute_tr = pd.DataFrame(X, columns=df_num.columns)
# df_num_impute_tr.info()
# df_num.info()
# df_num_impute_tr.head()


In [199]:
class FitAndTransform():
    """Impute missing values with ``SimpleImputer`` and return DataFrames.

    The imputer fitted by :meth:`fit` is stored on ``self.imputer_`` so the
    statistics learned from one frame (e.g. the training split) can be applied
    to another frame through :meth:`transform` without re-fitting.
    """

    # Fitted imputer; None until fit() has been called.
    imputer_ = None

    def fit(self, input, strategy="median"):
        """Fit an imputer on ``input`` and return ``input`` with gaps filled.

        Args:
            input: DataFrame whose NaN values should be imputed.
            strategy: ``SimpleImputer`` strategy name (default ``"median"``).

        Returns:
            A new DataFrame with the same columns as ``input`` and a fresh
            0..n-1 index, with missing values replaced.
        """
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        # fit() learns the per-column fill value from the data.
        imputer.fit(input)
        # Remember the fitted imputer so transform() can be reused on new data.
        self.imputer_ = imputer
        return self.transform(input, imputer)

    def transform(self, input, imputerValue=None):
        """Fill missing values in ``input`` using a fitted imputer.

        Args:
            input: DataFrame to transform.
            imputerValue: Fitted imputer to use. When omitted, the imputer
                stored by the most recent :meth:`fit` call is used.

        Returns:
            A new DataFrame with the same columns as ``input``. The imputer
            returns a plain array, so the row labels of ``input`` are not
            carried over; the result has a default 0..n-1 index.

        Raises:
            ValueError: if no imputer is passed and :meth:`fit` has not been
                called yet.
        """
        if imputerValue is None:
            imputerValue = self.imputer_
        if imputerValue is None:
            raise ValueError(
                "FitAndTransform is not fitted yet: call fit() first or pass "
                "a fitted imputer as imputerValue.")
        # The imputer yields a NumPy array; wrap it back into a DataFrame.
        X = imputerValue.transform(input)
        return pd.DataFrame(X, columns=input.columns)


def columnsAppender(mainData, columnsNameList):
    """Return the column labels of ``mainData`` followed by the extra names.

    ``mainData`` itself is not modified; a new list is built and returned.
    """
    return [*mainData.columns, *columnsNameList]


def combinedAttributesAddr(inputValue, *, rooms_col=None, bedrooms_col=None,
                           population_col=None, household_col=None):
    """Append three ratio columns to a 2-D numeric array.

    The appended columns are, in order: rooms per household, population per
    household and bedrooms per room.

    Args:
        inputValue: 2-D NumPy array (rows x features).
        rooms_col, bedrooms_col, population_col, household_col: optional
            positions of the source columns inside ``inputValue``. When one is
            omitted, the module-level ``rooms_ix`` / ``bedrooms_ix`` /
            ``population_ix`` / ``household_ix`` value is used, which is what
            the function always did before these parameters existed.

    Returns:
        A DataFrame holding the original columns followed by the three ratio
        columns, with default integer column labels.
    """
    # Resolve lazily: a global is only looked up when its override is missing,
    # so the function also works where those globals are not defined.
    rooms = rooms_ix if rooms_col is None else rooms_col
    bedrooms = bedrooms_ix if bedrooms_col is None else bedrooms_col
    population = population_ix if population_col is None else population_col
    household = household_ix if household_col is None else household_col

    rooms_per_households = inputValue[:, rooms] / inputValue[:, household]
    population_per_household = inputValue[:, population] / inputValue[:, household]
    bedrooms_per_rooms = inputValue[:, bedrooms] / inputValue[:, rooms]

    # np.c_ stacks the 1-D ratio vectors as extra columns on the right.
    data_custom_tr_tmp = np.c_[
        inputValue, rooms_per_households, population_per_household, bedrooms_per_rooms]
    return pd.DataFrame(data_custom_tr_tmp)


In [200]:
# ================================ custom transformers ======================
# (note: "40:00 freez for create Custom transform")
# Positions of the source columns inside the numeric array; they are read by
# combinedAttributesAddr.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6
custom_columns_list = [
    'rooms_per_households',
    'population_per_household',
    'bedrooms_per_rooms',
]

# Step 1: fill the missing values (median strategy by default).
fitAndTransform = FitAndTransform()
data_custom_impure_tr = fitAndTransform.fit(df_num)
data_custom_impure_tr.info()

# Step 2: append the ratio features, then restore readable column names,
# because the helper returns a frame with plain integer column labels.
data_custom_tr = combinedAttributesAddr(data_custom_impure_tr.to_numpy())
data_custom_tr.columns = columnsAppender(data_custom_impure_tr, custom_columns_list)
data_custom_tr.head(n=10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
dtypes: float64(8)
memory usage: 1.0 MB


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_households,population_per_household,bedrooms_per_rooms
0,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,5.017657,3.691814,0.200576
1,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,4.473545,1.738095,0.232703
2,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,5.645833,2.723214,0.174486
3,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,4.002817,3.994366,0.258269
4,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,6.268421,2.3,0.18094
5,-121.86,37.42,20.0,5032.0,808.0,2695.0,801.0,6.6227,6.282147,3.364544,0.160572
6,-117.97,34.04,28.0,1686.0,417.0,1355.0,388.0,2.5192,4.345361,3.492268,0.247331
7,-122.53,37.91,37.0,2524.0,398.0,999.0,417.0,7.9892,6.052758,2.395683,0.157686
8,-117.9,34.13,5.0,1126.0,316.0,819.0,311.0,1.5,3.620579,2.633441,0.280639
9,-117.79,34.02,5.0,18690.0,2862.0,9427.0,2777.0,6.4266,6.730284,3.394671,0.15313


In [208]:
# =============== feature scaling ================
# Standardization => unbounded range (-n, +n)      Normalization => [0, 1]
# NOTE: the describe() result below is not the last expression of the cell,
# so it is computed but not displayed.
data_custom_tr.describe()
# When the gap between a feature's min and max is very large (e.g. 2 to 3900),
# our algorithms cannot decide well, so the features are scaled to bring them
# onto a more reasonable common scale.

from sklearn.preprocessing import StandardScaler
feature_scal = StandardScaler()
data_num_scaled_tr = pd.DataFrame(
    feature_scal.fit_transform(data_custom_tr.to_numpy()),
    columns=data_custom_tr.columns,
)
# data_num_scaled_tr.describe()
data_num_scaled_tr.head(n=5)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_households,population_per_household,bedrooms_per_rooms
0,1.272587,-1.372811,0.34849,0.222569,0.211261,0.768276,0.322906,-0.326196,-0.174916,0.051376,-0.212034
1,0.709162,-0.876696,1.618118,0.340293,0.593164,-0.098901,0.672027,-0.035843,-0.402835,-0.117362,0.341808
2,-0.447603,-0.460146,-1.95271,-0.342597,-0.495259,-0.449818,-0.430461,0.144701,0.088216,-0.03228,-0.661803
3,1.232698,-1.382172,0.586545,-0.56149,-0.409331,-0.007434,-0.380587,-1.017864,-0.600015,0.077507,0.782552
4,-0.108551,0.532084,1.142008,-0.119565,-0.25657,-0.485877,-0.314962,-0.171488,0.349007,-0.068832,-0.550535


In [213]:
# Convert text data to numeric data.
# Machine-learning models only work with numbers; they do not understand text.
# Option 1 -> LabelEncoder -> maps each category to an integer code 0, 1, 2, 3, 4, ...

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data_cat = df["ocean_proximity"]
data_cat_encoded = pd.DataFrame(
    encoder.fit_transform(data_cat),
    columns=['ocean_proximity'],
)
data_cat_encoded.head(n=10)

# Drawback: a model will treat the category coded 4 as "greater than" the one
# coded 0, although the codes were only meant as identifiers and carry no
# magnitude.




Unnamed: 0,ocean_proximity
0,4
1,4
2,4
3,4
4,1
5,0
6,0
7,3
8,0
9,0


In [229]:
# Option 2 -> OneHotEncoder -> one 0/1 indicator column per category
from sklearn.preprocessing import OneHotEncoder

# Keep the encoder's default sparse output and convert it to a dense array
# with .toarray(). The previous `sparse=False` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so passing it fails
# on current releases, while `sparse_output` fails on older ones. This form
# works on both.
encoder_1hot = OneHotEncoder()
data_cat_1hot_tmp = encoder_1hot.fit_transform(df[['ocean_proximity']]).toarray()
data_cat_1hot = pd.DataFrame(data_cat_1hot_tmp)
# Label each indicator column as "ocean_proximity_<category>".
data_cat_1hot.columns = encoder_1hot.get_feature_names_out(
    input_features=['ocean_proximity'])

data_cat_1hot.head()








Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0


In [230]:
# Join the scaled numeric features and the one-hot columns side by side.
# concat pairs rows by index label; both frames were built from plain arrays
# and therefore share the same default 0..n-1 index.
final = pd.concat((data_num_scaled_tr, data_cat_1hot), axis='columns')
final.head(n=10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_households,population_per_household,bedrooms_per_rooms,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,1.272587,-1.372811,0.34849,0.222569,0.211261,0.768276,0.322906,-0.326196,-0.174916,0.051376,-0.212034,0.0,0.0,0.0,0.0,1.0
1,0.709162,-0.876696,1.618118,0.340293,0.593164,-0.098901,0.672027,-0.035843,-0.402835,-0.117362,0.341808,0.0,0.0,0.0,0.0,1.0
2,-0.447603,-0.460146,-1.95271,-0.342597,-0.495259,-0.449818,-0.430461,0.144701,0.088216,-0.03228,-0.661803,0.0,0.0,0.0,0.0,1.0
3,1.232698,-1.382172,0.586545,-0.56149,-0.409331,-0.007434,-0.380587,-1.017864,-0.600015,0.077507,0.782552,0.0,0.0,0.0,0.0,1.0
4,-0.108551,0.532084,1.142008,-0.119565,-0.25657,-0.485877,-0.314962,-0.171488,0.349007,-0.068832,-0.550535,0.0,1.0,0.0,0.0,0.0
5,-1.135679,0.831625,-0.683082,1.09906,0.643289,1.115675,0.790151,1.439919,0.354757,0.023111,-0.901665,1.0,0.0,0.0,0.0,0.0
6,0.803897,-0.750327,-0.048268,-0.439627,-0.289986,-0.062842,-0.293963,-0.715013,-0.45653,0.034142,0.593991,1.0,0.0,0.0,0.0,0.0
7,-1.469745,1.060961,0.665897,-0.054266,-0.335337,-0.375941,-0.217838,2.157529,0.25867,-0.060568,-0.95142,0.0,0.0,0.0,1.0,0.0
8,0.8388,-0.708204,-1.873359,-0.697148,-0.531062,-0.534249,-0.496085,-1.250241,-0.760128,-0.040033,1.168206,1.0,0.0,0.0,0.0,0.0
9,0.893646,-0.759688,-1.873359,7.379811,5.545968,7.036405,5.977096,1.336938,0.542474,0.025712,-1.029966,1.0,0.0,0.0,0.0,0.0


In [None]:
# pipeLine 1:00:00