### Import Library

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler


### Import Dataset

In [3]:
# Read the CSV file into the DataFrame `df`. The path is relative, so the file
# must be in the notebook's working directory.
df = pd.read_csv('Steel_industry_data.csv')

### Melihat 5 Data Teratas

In [4]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,01/01/2018 00:15,3.17,2.95,0.0,0.0,73.21,100.0,900,Weekday,Monday,Light_Load
1,01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,Light_Load
2,01/01/2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,Light_Load
3,01/01/2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,Light_Load
4,01/01/2018 01:15,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,Light_Load


### Menampilkan Informasi dari Dataset

In [5]:
# Print a structural summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35040 entries, 0 to 35039
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   date                                  35040 non-null  object 
 1   Usage_kWh                             35040 non-null  float64
 2   Lagging_Current_Reactive.Power_kVarh  35040 non-null  float64
 3   Leading_Current_Reactive_Power_kVarh  35040 non-null  float64
 4   CO2(tCO2)                             35040 non-null  float64
 5   Lagging_Current_Power_Factor          35040 non-null  float64
 6   Leading_Current_Power_Factor          35040 non-null  float64
 7   NSM                                   35040 non-null  int64  
 8   WeekStatus                            35040 non-null  object 
 9   Day_of_week                           35040 non-null  object 
 10  Load_Type                             35040 non-null  object 
dtypes: float64(6), int64(1), object(4)

### Membagi Dataset menjadi Training Set dan Testing Set dengan Proporsi 70:30

In [6]:
# Separate the features (every column except the target) from the target labels.
X = df.drop(columns=['Load_Type'])
Y = df['Load_Type']

# Hold out 30% of the rows for testing and keep 70% for training.
# A fixed random_state makes the shuffle, and therefore the split, reproducible
# on every Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [7]:
# Report the shape of each split; labels are left-padded to 7 characters so the
# colons line up.
for _label, _part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(f"Dimensi {_label:<7} : {_part.shape}")

Dimensi X_train : (24528, 10)
Dimensi X_test  : (10512, 10)
Dimensi Y_train : (24528,)
Dimensi Y_test  : (10512,)


## Data Transformation

#### Normalisasi Data

In [8]:
# Min-max normalisation of NSM: fit the scaler on the column and rescale it
# (MinMaxScaler's default target range is [0, 1]).
ss = MinMaxScaler()
scaledData = ss.fit_transform(df[['NSM']])

# Work on a copy so the original frame keeps the raw NSM values.
df_2 = df.copy()
df_2['NSM'] = scaledData

In [9]:
# Compare the descriptive statistics of NSM before and after normalisation,
# with a blank line between the two reports.
print(
    'Hasil Sebelum  di Normalisasi Data',
    df['NSM'].describe(),
    '',
    'Hasil Normalisasi Data',
    df_2['NSM'].describe(),
    sep='\n',
)

Hasil Sebelum  di Normalisasi Data
count    35040.000000
mean     42750.000000
std      24940.534317
min          0.000000
25%      21375.000000
50%      42750.000000
75%      64125.000000
max      85500.000000
Name: NSM, dtype: float64

Hasil Normalisasi Data
count    35040.000000
mean         0.500000
std          0.291702
min          0.000000
25%          0.250000
50%          0.500000
75%          0.750000
max          1.000000
Name: NSM, dtype: float64


#### Standarisasi Data

In [10]:
# Standardisation of NSM: fit a StandardScaler on the column and transform it.
ss = StandardScaler()
scaledData = ss.fit_transform(df[['NSM']])

# Store the result in a copy so the original frame keeps the raw NSM values.
df_3 = df.copy()
df_3['NSM'] = scaledData

In [11]:
# Compare the standard deviation of NSM before and after standardisation,
# with a blank line between the two reports.
print(
    "(Data sebelum Di Standarisasi)",
    np.std(df['NSM']),
    "",
    "(Data sesudah Di Standarisasi)",
    np.std(df_3['NSM']),
    sep="\n",
)


(Data sebelum Di Standarisasi)
24940.178427589486

(Data sesudah Di Standarisasi)
1.0


## Data Cleaning

#### Mengecek Jumlah data Null atau data Kosong

In [12]:
# Count the missing values in each column.
df.isna().sum()

date                                    0
Usage_kWh                               0
Lagging_Current_Reactive.Power_kVarh    0
Leading_Current_Reactive_Power_kVarh    0
CO2(tCO2)                               0
Lagging_Current_Power_Factor            0
Leading_Current_Power_Factor            0
NSM                                     0
WeekStatus                              0
Day_of_week                             0
Load_Type                               0
dtype: int64

#### Karena tidak ada data NULL, nilai NULL dibuat secara sengaja untuk demonstrasi

In [13]:
# Work on a copy so the original frame stays free of artificial gaps.
df_4 = df.copy()

# Blank out every row from index label 28000 onward in one column of each kind:
# NSM (int64), Leading_Current_Power_Factor (float64) and WeekStatus (object).
for _column in ('NSM', 'Leading_Current_Power_Factor', 'WeekStatus'):
    df_4.loc[28000:, _column] = None

In [14]:
# Re-count the missing values per column to confirm the gaps now exist in df_4.
df_4.isna().sum()

date                                       0
Usage_kWh                                  0
Lagging_Current_Reactive.Power_kVarh       0
Leading_Current_Reactive_Power_kVarh       0
CO2(tCO2)                                  0
Lagging_Current_Power_Factor               0
Leading_Current_Power_Factor            7040
NSM                                     7040
WeekStatus                              7040
Day_of_week                                0
Load_Type                                  0
dtype: int64

#### Mengatasi Nilai Null yang telah kita buat sebelumnya dengan strategy mean, median, modus

In [15]:
#Strategy median untuk int64
df_4["NSM"].fillna(df_4["NSM"].median(), inplace=True)

#Strategy median untuk float64
df_4["Leading_Current_Power_Factor"].fillna(df_4["Leading_Current_Power_Factor"].mean(), inplace=True)

#Strategy median untuk object
df_4["WeekStatus"].fillna(df_4["WeekStatus"].mode()[0], inplace=True)
# Pakai Indeks 0 untuk menentukan dari banyak modus yang ada kita akan menggunakan yang ada di indeks 0

df_4.isna().sum()

date                                    0
Usage_kWh                               0
Lagging_Current_Reactive.Power_kVarh    0
Leading_Current_Reactive_Power_kVarh    0
CO2(tCO2)                               0
Lagging_Current_Power_Factor            0
Leading_Current_Power_Factor            0
NSM                                     0
WeekStatus                              0
Day_of_week                             0
Load_Type                               0
dtype: int64

#### Membuat Data Duplikat pada Salah Satu Baris

In [16]:
# Overwrite row 0 with the contents of row 1 so that df_4 contains a duplicated row.
df_4.iloc[0,:] = df_4.iloc[1,:]
# Count the rows flagged as repeats of an earlier row.
df_4.duplicated().sum()

1

#### Mengecek data yang duplicate

In [17]:
# Show the rows flagged as duplicates (by default duplicated() marks every
# occurrence except the first).
df_4[df_4.duplicated()]

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
1,01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800.0,Weekday,Monday,Light_Load


#### Menghapus Data yang Duplicate

In [18]:
# Remove the duplicated row from df_4, the frame in which the duplicate was
# created. Deduplicating `df` instead would leave that row in place.
# Rebinding the name (rather than inplace=True) keeps the cell safe to re-run.
df_4 = df_4.drop_duplicates()

# Confirm that no duplicated rows remain.
df_4.duplicated().sum()

0

#### Mengganti Tipe Data pada salah satu attribute angka

In [19]:
# Report the current dtype of the NSM column in df_4.
print(f"Tipe Data dari Attribute NSM adalah : {df_4['NSM'].dtypes}")

Tipe Data dari Attribute NSM adalah : float64


In [20]:
# Cast NSM to the generic object dtype, then report the dtype after the change.
_nsm_as_object = df_4['NSM'].astype(object)
df_4['NSM'] = _nsm_as_object
print(f"Tipe Data dari Attribute NSM setelah diubah adalah : {df_4['NSM'].dtype}")

Tipe Data dari Attribute NSM setelah diubah adalah : object


### One Hot Encoding Menggunakan Pandas

In [21]:
# List the distinct values found in the Load_Type column.
df.Load_Type.unique()

array(['Light_Load', 'Medium_Load', 'Maximum_Load'], dtype=object)

In [22]:
# One-hot encode Load_Type: one indicator column per category, each named
# "Load_Type_<category>".
ohe = pd.get_dummies(df["Load_Type"], prefix="Load_Type")

# Attach the indicators to the original frame as booleans
# (cast with int instead of bool to get 0/1 values).
_indicator_columns = ohe.astype(bool)
df_5 = df.join(_indicator_columns)
df_5

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type,Load_Type_Light_Load,Load_Type_Maximum_Load,Load_Type_Medium_Load
0,01/01/2018 00:15,3.17,2.95,0.00,0.0,73.21,100.00,900,Weekday,Monday,Light_Load,True,False,False
1,01/01/2018 00:30,4.00,4.46,0.00,0.0,66.77,100.00,1800,Weekday,Monday,Light_Load,True,False,False
2,01/01/2018 00:45,3.24,3.28,0.00,0.0,70.28,100.00,2700,Weekday,Monday,Light_Load,True,False,False
3,01/01/2018 01:00,3.31,3.56,0.00,0.0,68.09,100.00,3600,Weekday,Monday,Light_Load,True,False,False
4,01/01/2018 01:15,3.82,4.50,0.00,0.0,64.72,100.00,4500,Weekday,Monday,Light_Load,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,31/12/2018 23:00,3.85,4.86,0.00,0.0,62.10,100.00,82800,Weekday,Monday,Light_Load,True,False,False
35036,31/12/2018 23:15,3.74,3.74,0.00,0.0,70.71,100.00,83700,Weekday,Monday,Light_Load,True,False,False
35037,31/12/2018 23:30,3.78,3.17,0.07,0.0,76.62,99.98,84600,Weekday,Monday,Light_Load,True,False,False
35038,31/12/2018 23:45,3.78,3.06,0.11,0.0,77.72,99.96,85500,Weekday,Monday,Light_Load,True,False,False
