In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


### Cek data dari air_quality_dataset.csv, Preprocessing Data

In [44]:
Tes_data = pd.read_csv('./air_quality_dataset.csv')

In [45]:
Tes_data.head()

Unnamed: 0,PM10,SO2,NO2,CO,O3,TEMP,RAIN,wd,WSPM,station,datetime
0,4.0,4.0,7.0,300.0,77.0,-0.7,0.0,N,4.4,Aotizhongxin,2013-03-01 00:00:00
1,8.0,4.0,7.0,300.0,77.0,-1.1,0.0,N,4.7,Aotizhongxin,2013-03-01 01:00:00
2,7.0,5.0,10.0,300.0,73.0,-1.1,0.0,N,5.6,Aotizhongxin,2013-03-01 02:00:00
3,6.0,11.0,11.0,300.0,72.0,-1.4,0.0,W,3.1,Aotizhongxin,2013-03-01 03:00:00
4,3.0,12.0,12.0,300.0,72.0,-2.0,0.0,N,2.0,Aotizhongxin,2013-03-01 04:00:00


In [46]:
# Tes data info
Tes_data.info()
print(f'\n{Tes_data.isna().sum()}')

print("\nJumlah duplikasi: ", Tes_data.duplicated().sum())
Tes_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420768 entries, 0 to 420767
Data columns (total 11 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   PM10      420768 non-null  float64
 1   SO2       420768 non-null  float64
 2   NO2       420768 non-null  float64
 3   CO        420768 non-null  float64
 4   O3        420768 non-null  float64
 5   TEMP      420768 non-null  float64
 6   RAIN      420768 non-null  float64
 7   wd        420768 non-null  object 
 8   WSPM      420768 non-null  float64
 9   station   420768 non-null  object 
 10  datetime  420768 non-null  object 
dtypes: float64(8), object(3)
memory usage: 35.3+ MB

PM10        0
SO2         0
NO2         0
CO          0
O3          0
TEMP        0
RAIN        0
wd          0
WSPM        0
station     0
datetime    0
dtype: int64

Jumlah duplikasi:  0


Unnamed: 0,PM10,SO2,NO2,CO,O3,TEMP,RAIN,WSPM
count,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0
mean,104.235659,15.642997,50.384959,1212.909829,56.925748,13.538931,0.064416,1.729444
std,91.123375,21.456742,34.714968,1134.271612,55.830557,11.430737,0.820626,1.245961
min,2.0,0.2856,1.0265,100.0,0.2142,-19.9,0.0,0.0
25%,36.0,3.0,23.0,500.0,12.0,3.1,0.0,0.9
50%,82.0,7.0,43.0,900.0,44.0,14.5,0.0,1.4
75%,144.0,19.0,70.0,1500.0,80.0,23.2,0.0,2.2
max,999.0,500.0,290.0,10000.0,1071.0,41.6,72.5,13.2


Tidak ada missing value pada data tersebut, dikarenakan memang data sudah saya preprocessing di project sebelumnya :v

In [47]:
# Hapus kolom 'wd', 'station', dan 'datetime'
Tes_data = Tes_data.drop(['wd', 'station'], axis=1)

In [48]:
# Ubah kolom RAIN menjadi kategori biner: 0 (Tidak Hujan), 1 (Hujan)
Tes_data['RAIN'] = Tes_data['RAIN'].apply(lambda x: 1 if x > 0 else 0)

In [49]:
Tes_data['datetime'] = pd.to_datetime(Tes_data['datetime'])

# Ekstrak fitur waktu
Tes_data['year'] = Tes_data['datetime'].dt.year
Tes_data['month'] = Tes_data['datetime'].dt.month
Tes_data['day'] = Tes_data['datetime'].dt.day
Tes_data['hour'] = Tes_data['datetime'].dt.hour
Tes_data['dayofweek'] = Tes_data['datetime'].dt.dayofweek

# (Opsional) Hapus kolom datetime asli jika sudah tidak diperlukan
Tes_data = Tes_data.drop('datetime', axis=1)


In [50]:
# Simpan dataframe Tes_data ke file CSV baru
Tes_data.to_csv('Tes_data_clean.csv', index=False)
Tes_data.head()

Unnamed: 0,PM10,SO2,NO2,CO,O3,TEMP,RAIN,WSPM,year,month,day,hour,dayofweek
0,4.0,4.0,7.0,300.0,77.0,-0.7,0,4.4,2013,3,1,0,4
1,8.0,4.0,7.0,300.0,77.0,-1.1,0,4.7,2013,3,1,1,4
2,7.0,5.0,10.0,300.0,73.0,-1.1,0,5.6,2013,3,1,2,4
3,6.0,11.0,11.0,300.0,72.0,-1.4,0,3.1,2013,3,1,3,4
4,3.0,12.0,12.0,300.0,72.0,-2.0,0,2.0,2013,3,1,4,4


In [51]:
# Tes data info
Tes_data.info()
print(f'\n{Tes_data.isna().sum()}')

print("\nJumlah duplikasi: ", Tes_data.duplicated().sum())
Tes_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420768 entries, 0 to 420767
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   PM10       420768 non-null  float64
 1   SO2        420768 non-null  float64
 2   NO2        420768 non-null  float64
 3   CO         420768 non-null  float64
 4   O3         420768 non-null  float64
 5   TEMP       420768 non-null  float64
 6   RAIN       420768 non-null  int64  
 7   WSPM       420768 non-null  float64
 8   year       420768 non-null  int32  
 9   month      420768 non-null  int32  
 10  day        420768 non-null  int32  
 11  hour       420768 non-null  int32  
 12  dayofweek  420768 non-null  int32  
dtypes: float64(7), int32(5), int64(1)
memory usage: 33.7 MB

PM10         0
SO2          0
NO2          0
CO           0
O3           0
TEMP         0
RAIN         0
WSPM         0
year         0
month        0
day          0
hour         0
dayofweek    0
dtype: int64

Jum

Unnamed: 0,PM10,SO2,NO2,CO,O3,TEMP,RAIN,WSPM,year,month,day,hour,dayofweek
count,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0
mean,104.235659,15.642997,50.384959,1212.909829,56.925748,13.538931,0.039262,1.729444,2014.66256,6.52293,15.729637,11.5,3.000684
std,91.123375,21.456742,34.714968,1134.271612,55.830557,11.430737,0.194217,1.245961,1.177198,3.448707,8.800102,6.922195,2.0012
min,2.0,0.2856,1.0265,100.0,0.2142,-19.9,0.0,0.0,2013.0,1.0,1.0,0.0,0.0
25%,36.0,3.0,23.0,500.0,12.0,3.1,0.0,0.9,2014.0,4.0,8.0,5.75,1.0
50%,82.0,7.0,43.0,900.0,44.0,14.5,0.0,1.4,2015.0,7.0,16.0,11.5,3.0
75%,144.0,19.0,70.0,1500.0,80.0,23.2,0.0,2.2,2016.0,10.0,23.0,17.25,5.0
max,999.0,500.0,290.0,10000.0,1071.0,41.6,1.0,13.2,2017.0,12.0,31.0,23.0,6.0


In [53]:
# Hapus baris duplikasi
Tes_data = Tes_data.drop_duplicates()

# Simpan dataframe Tes_data ke file CSV baru
Tes_data.to_csv('Tes_data_clean.csv', index=False)

# Tes data info
Tes_data.info()
print(f'\n{Tes_data.isna().sum()}')

print("\nJumlah duplikasi: ", Tes_data.duplicated().sum())
Tes_data.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 420763 entries, 0 to 420767
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   PM10       420763 non-null  float64
 1   SO2        420763 non-null  float64
 2   NO2        420763 non-null  float64
 3   CO         420763 non-null  float64
 4   O3         420763 non-null  float64
 5   TEMP       420763 non-null  float64
 6   RAIN       420763 non-null  int64  
 7   WSPM       420763 non-null  float64
 8   year       420763 non-null  int32  
 9   month      420763 non-null  int32  
 10  day        420763 non-null  int32  
 11  hour       420763 non-null  int32  
 12  dayofweek  420763 non-null  int32  
dtypes: float64(7), int32(5), int64(1)
memory usage: 36.9 MB

PM10         0
SO2          0
NO2          0
CO           0
O3           0
TEMP         0
RAIN         0
WSPM         0
year         0
month        0
day          0
hour         0
dayofweek    0
dtype: int64

Jumlah d

Unnamed: 0,PM10,SO2,NO2,CO,O3,TEMP,RAIN,WSPM,year,month,day,hour,dayofweek
count,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0,420763.0
mean,104.236275,15.643136,50.385113,1212.916875,56.925714,13.538917,0.03926,1.729445,2014.662554,6.522931,15.729724,11.500006,3.000682
std,91.123578,21.45683,34.715117,1134.27604,55.830479,11.430744,0.194212,1.245967,1.177201,3.448716,8.800059,6.922198,2.001204
min,2.0,0.2856,1.0265,100.0,0.2142,-19.9,0.0,0.0,2013.0,1.0,1.0,0.0,0.0
25%,36.0,3.0,23.0,500.0,12.0,3.1,0.0,0.9,2014.0,4.0,8.0,6.0,1.0
50%,82.0,7.0,43.0,900.0,44.0,14.5,0.0,1.4,2015.0,7.0,16.0,11.0,3.0
75%,144.0,19.0,70.0,1500.0,80.0,23.2,0.0,2.2,2016.0,10.0,23.0,17.5,5.0
max,999.0,500.0,290.0,10000.0,1071.0,41.6,1.0,13.2,2017.0,12.0,31.0,23.0,6.0
