### Import library

In [491]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

### Data Splitting

In [492]:
dataset_split = pd.read_csv("./dataset/housing_preprocessing.csv")
dataset_split

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
2,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
3,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
4,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20643,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND
20644,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND
20645,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND
20646,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND


In [493]:
dataset_split.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
median_house_value      int64
ocean_proximity        object
dtype: object

### Bagi ke dua Variabel X dan Y

In [494]:
X = dataset_split.iloc[:, :-1]
y = dataset_split.iloc[:, -1]

In [495]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [496]:
print("Dimensi X_train : ", X_train.shape)
print("Dimensi X_test  : ", X_test.shape)
print("Dimensi y_train : ", y_train.shape)
print("Dimensi y_test  : ", y_test.shape)

Dimensi X_train :  (14453, 9)
Dimensi X_test  :  (6195, 9)
Dimensi y_train :  (14453,)
Dimensi y_test  :  (6195,)


### Data Transforming
#### Normalisasi

In [497]:
from sklearn.preprocessing import MinMaxScaler

In [498]:
min_max_scaler = MinMaxScaler()

In [499]:
x_norm = min_max_scaler.fit_transform(
    dataset_split[["longitude"]]
    )

In [500]:
data_norm = pd.DataFrame(x_norm)
data_norm

Unnamed: 0,0
0,0.211155
1,0.211155
2,0.211155
3,0.211155
4,0.211155
...,...
20643,0.324701
20644,0.312749
20645,0.311753
20646,0.301793


#### Standarisasi

In [501]:
print("Nilai Standar Deviasi sebelum distandarisasi : ")
print(np.std(dataset_split))

Nilai Standar Deviasi sebelum distandarisasi : 
longitude                  2.003779
latitude                   2.135945
housing_median_age        12.585166
total_rooms             2181.413416
total_bedrooms           421.369907
population              1132.423531
households               382.317100
median_income              1.901430
median_house_value    115491.530067
dtype: float64


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [502]:
from sklearn.preprocessing import StandardScaler

In [503]:
standard_scaler = StandardScaler()

In [504]:
x_standard = standard_scaler.fit_transform(
    dataset_split[["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income", "median_house_value"]]
)

In [505]:
print("Rata2 sesudah di standarisasi : ", np.nanmean(x_standard))
print("Nilai sesudah di standarisasi : ", np.nanstd(x_standard))

Rata2 sesudah di standarisasi :  2.413070117835244e-16
Nilai sesudah di standarisasi :  1.0


In [506]:
data_standarisasi = pd.DataFrame(x_standard)
data_standarisasi

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.327125,1.052119,0.981769,-0.804562,-0.969957,-0.974061,-0.976663,2.341819,2.126784
1,-1.327125,1.052119,0.981769,-0.804562,-0.969957,-0.974061,-0.976663,2.341819,2.126784
2,-1.327125,1.052119,0.981769,-0.804562,-0.969957,-0.974061,-0.976663,2.341819,2.126784
3,-1.327125,1.052119,0.981769,-0.804562,-0.969957,-0.974061,-0.976663,2.341819,2.126784
4,-1.327125,1.052119,0.981769,-0.804562,-0.969957,-0.974061,-0.976663,2.341819,2.126784
...,...,...,...,...,...,...,...,...,...
20643,-0.758200,1.801202,-0.289569,-0.444704,-0.388520,-0.512219,-0.443074,-1.215978,-1.115878
20644,-0.818087,1.805883,-0.845780,-0.888453,-0.920119,-0.944037,-1.008051,-0.691899,-1.124536
20645,-0.823077,1.777793,-0.925238,-0.174695,-0.125093,-0.369163,-0.173665,-1.142507,-0.992925
20646,-0.872983,1.777793,-0.845780,-0.355312,-0.305457,-0.604058,-0.393378,-1.054573,-1.058731


### Data Cleaning

In [507]:
from sklearn.impute import SimpleImputer

In [529]:
dataset_split_broken = pd.read_csv("./dataset/housing_preprocessing.csv")
dataset_split_broken

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
2,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
3,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
4,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20643,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND
20644,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND
20645,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND
20646,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND


### Menangani Nilai NULL

#### Subtitusi Nilai NULL (Scikit)

In [509]:
imputer = SimpleImputer(strategy='mean')

In [510]:
print("Jumlah record yang memiliki nilai null: ")
print(dataset_split_broken.isna().sum())

Jumlah record yang memiliki nilai null: 
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [511]:
dataset_split_broken[dataset_split_broken.isnull().any(axis = 1)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
298,-122.16,37.77,47,1256,,570,218,4.3750,161900,NEAR BAY
349,-122.17,37.75,38,992,,732,259,1.6196,85100,NEAR BAY
546,-122.28,37.78,29,5154,,3741,1273,2.5762,173400,NEAR BAY
571,-122.24,37.75,45,891,,384,146,4.9489,247100,NEAR BAY
704,-122.10,37.69,41,746,,387,161,3.9063,178400,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20275,-119.19,34.20,18,3620,,3171,779,3.3409,220500,NEAR OCEAN
20276,-119.18,34.19,19,2393,,1938,762,1.6953,167400,NEAR OCEAN
20380,-118.88,34.17,15,4260,,1701,669,5.1033,410700,<1H OCEAN
20468,-118.75,34.29,17,5512,,2734,814,6.6073,258100,<1H OCEAN


In [512]:
dataset_split_broken["total_bedrooms"] = imputer.fit_transform(dataset_split_broken[["total_bedrooms"]])

In [513]:
print("Jumlah record yang memiliki nilai null: ")
print(dataset_split_broken.isna().sum())

Jumlah record yang memiliki nilai null: 
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


In [514]:
dataset_split_broken[dataset_split_broken.isnull().any(axis=1)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity


### Menangani Nilai Duplikat

In [515]:
dataset_split_broken[dataset_split_broken.duplicated()]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
1,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
2,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
3,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
4,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
5,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
6,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
7,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
8,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY


In [516]:
dataset_split_broken.drop_duplicates(inplace=True)

In [517]:
dataset_split_broken[dataset_split_broken.duplicated()]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity


In [518]:
dataset_split_broken.duplicated().sum()

0

### Mengubah tipe data angka menjadi tipe data objek

In [519]:
dataset_split_broken.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
median_house_value      int64
ocean_proximity        object
dtype: object

In [530]:
dataset_split_broken["total_rooms"] = dataset_split_broken["total_rooms"].astype("float64")

In [531]:
dataset_split_broken.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms           float64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
median_house_value      int64
ocean_proximity        object
dtype: object

### Data Encoding
#### One-Hot Encoding

In [522]:
from sklearn.preprocessing import OneHotEncoder

In [532]:
df = pd.DataFrame(dataset_split["ocean_proximity"])
df

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
...,...
20643,INLAND
20644,INLAND
20645,INLAND
20646,INLAND


In [524]:
Onehot_Encoder = OneHotEncoder(sparse=False)

In [525]:
onehot = Onehot_Encoder.fit_transform(df[["ocean_proximity"]])

In [526]:
df_onehot = pd.DataFrame(onehot)

In [527]:
df = df.join(df_onehot)

In [528]:
df

Unnamed: 0,ocean_proximity,0,1,2,3,4
0,NEAR BAY,0.0,0.0,0.0,1.0,0.0
1,NEAR BAY,0.0,0.0,0.0,1.0,0.0
2,NEAR BAY,0.0,0.0,0.0,1.0,0.0
3,NEAR BAY,0.0,0.0,0.0,1.0,0.0
4,NEAR BAY,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...
20643,INLAND,0.0,1.0,0.0,0.0,0.0
20644,INLAND,0.0,1.0,0.0,0.0,0.0
20645,INLAND,0.0,1.0,0.0,0.0,0.0
20646,INLAND,0.0,1.0,0.0,0.0,0.0
