### Import library

In [164]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

### Import dataset 

In [165]:
dfHouse = pd.read_csv('House_Rent_Dataset.csv')
dfHouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


### Menentukan target dan feature, serta melakukan pembagian data menjadi data training dan testing

In [166]:
x = dfHouse.drop("Tenant Preferred", axis=1) #menentukan feature dengan meng-drop column target
y = dfHouse["Tenant Preferred"] #menentukan target dengan memilih column target
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)


In [167]:
print(

f"""
x_train : {x_train.shape}
x_test  : {x_test.shape}
y_train : {y_train.shape}
y_test  : {y_test.shape}

"""
)


x_train : (3322, 11)
x_test  : (1424, 11)
y_train : (3322,)
y_test  : (1424,)




### Normalisasi data menggunakan MinMaxScaler()

In [168]:
dfHouse_minmaxed = dfHouse.copy()

minmaxed_data = MinMaxScaler().fit_transform(dfHouse_minmaxed[["Rent"]])
dfHouse_minmaxed["Rent"] = minmaxed_data

print("Data sebelum normalisasi")
print(dfHouse["Rent"].describe())
print("")
print("Data sesudah normalisasi")
print(dfHouse_minmaxed["Rent"].describe())

Data sebelum normalisasi
count    4.746000e+03
mean     3.499345e+04
std      7.810641e+04
min      1.200000e+03
25%      1.000000e+04
50%      1.600000e+04
75%      3.300000e+04
max      3.500000e+06
Name: Rent, dtype: float64

Data sesudah normalisasi
count    4746.000000
mean        0.009659
std         0.022324
min         0.000000
25%         0.002515
50%         0.004230
75%         0.009089
max         1.000000
Name: Rent, dtype: float64


### Standarisasi data menggunakan StandardScaler()

In [169]:
dfHouse_standard = dfHouse.copy()

scaled_data = StandardScaler().fit_transform(dfHouse_standard[["Size"]])
dfHouse_standard["Size"] = scaled_data

print("Data sebelum standarisasi")
print(dfHouse["Size"].describe())
print("")
print("Data sesudah standarisasi")
print(dfHouse_standard["Size"].describe())

Data sebelum standarisasi
count    4746.000000
mean      967.490729
std       634.202328
min        10.000000
25%       550.000000
50%       850.000000
75%      1200.000000
max      8000.000000
Name: Size, dtype: float64

Data sesudah standarisasi
count    4.746000e+03
mean     8.982841e-17
std      1.000105e+00
min     -1.509915e+00
25%     -6.583620e-01
50%     -1.852770e-01
75%      3.666555e-01
max      1.108992e+01
Name: Size, dtype: float64


### Membuat nilai null didalam dataset

In [170]:
dfHouse_nan = dfHouse.copy()

#mengubah tipe data column "Size" dari int menjadi float
dfHouse_nan["Size"] = dfHouse_nan["Size"].astype(float)

#mengubah nilai "Rent" mulai dari record ke 2600 yang bertipe int menjadi kosong
dfHouse_nan.loc[2600:, "Rent"] = np.nan 

#mengubah nilai "Size" mulai dari record ke 2600 yang bertipe float menjadi kosong
dfHouse_nan.loc[2600:, "Size"] = np.nan 

#mengubah nilai "Furnishing Status" mulai dari record ke 2600 yang bertipe obj menjadi kosong
dfHouse_nan.loc[2600:, "Furnishing Status"] = np.nan 

#Mengecek nilai null dalam dataset
dfHouse_nan.isna().sum()

Posted On               0
BHK                     0
Rent                 2146
Size                 2146
Floor                   0
Area Type               0
Area Locality           0
City                    0
Furnishing Status    2146
Tenant Preferred        0
Bathroom                0
Point of Contact        0
dtype: int64

### Mengatasi nilai null dalam dataset menggunakan .median(), .mean(), dan .mode()

In [171]:
# Mengatasi nilai nan dengan cara mengisi nilai nan dengan median dari data
dfHouse_nan["Rent"].fillna(dfHouse_nan["Rent"].median(), inplace=True)
# Mengatasi nilai nan dengan cara mengisi nilai nan dengan mean dari data
dfHouse_nan["Size"].fillna(dfHouse_nan["Size"].mean(), inplace=True)
# Mengatasi nilai nan dengan mengisi nilai nan dengan modus dari data
dfHouse_nan["Furnishing Status"].fillna(dfHouse_nan["Furnishing Status"].mode(), inplace=True)

#Mengecek nilai null dalam dataset
dfHouse_nan.isna().sum()

Posted On               0
BHK                     0
Rent                    0
Size                    0
Floor                   0
Area Type               0
Area Locality           0
City                    0
Furnishing Status    2146
Tenant Preferred        0
Bathroom                0
Point of Contact        0
dtype: int64

### Membuat nilai duplikat pada dataset

In [172]:
dfHouse.iloc[100,:] = dfHouse.iloc[200,:] 
dfHouse.duplicated().sum()

1

### Mengatasi nilai duplikat pada dataset 

In [173]:
dfHouse.drop_duplicates(inplace=True) #menghapus nilai dupe
dfHouse.duplicated().sum()


0

### Mengubah tipe data salah satu tipe atribut 

In [174]:
print("Sebelum", dfHouse["Size"].dtype)
dfHouse["Size"] = dfHouse["Size"].astype(float)
print("Sesudah", dfHouse["Size"].dtype)


Sebelum int64
Sesudah float64


### One-Hot encoding 

In [175]:
dfEncoded = dfHouse.join(pd.get_dummies(dfHouse[["Tenant Preferred"]]))

dfEncoded

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Tenant Preferred_Bachelors,Tenant Preferred_Bachelors/Family,Tenant Preferred_Family
0,2022-05-18,2,10000,1100.0,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,False,True,False
1,2022-05-13,2,20000,800.0,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,False,True,False
2,2022-05-16,2,17000,1000.0,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,False,True,False
3,2022-07-04,2,10000,800.0,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,False,True,False
4,2022-05-09,2,7500,850.0,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2022-05-18,2,15000,1000.0,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner,False,True,False
4742,2022-05-15,3,29000,2000.0,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner,False,True,False
4743,2022-07-10,3,35000,1750.0,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent,False,True,False
4744,2022-07-06,3,45000,1500.0,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent,False,False,True
