In [88]:
# Path (including file name) of the CSV file to load
input_file_name = "Indeks Standar Pencemar Udara di Provinsi DKI Jakarta 2021.csv"

# Name of the target (label) column
input_target_class = "categori"

# Column to be removed
input_drop_col = "tanggal"

# Column data-type selection: "auto" when no column names are supplied per data type,
# "manual" to use the categorical/numerical lists below
input_datatype_selection = "auto"

# Categorical columns
input_cat_columns = ["critical", "location"]

# Numerical columns
input_num_columns = ["pm10", "pm25", "so2", "co", "o3", "no2", "max"]

# Encoding technique; listed options are
# 'LabelEncoder', 'OneHotEncoder', 'OrdinalEncoder' and 'FrequencyEncoder'
input_encoding = "OneHotEncoder"

# Missing-value handling; listed options are 'drop', 'inpute' and 'ignore'
# (option names reproduced exactly as this notebook spells them)
input_treat_missing_value = "drop"

# Machine learning algorithm; listed options are 'LogisiticRegression',
# 'DecisionTreeClassifier', 'RandomForestClassifier', 'XGBClassifier' and 'LGBMClassifier'
# (the logistic-regression option name is reproduced exactly as this notebook spells it)
input_ml_algo = "LogisiticRegression"

In [70]:
# Import libraries 

# Data Manipulation
import numpy as np 
import pandas as pd
from   pandas import DataFrame

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from   sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from   sklearn.impute import SimpleImputer
from   sklearn.model_selection import train_test_split, GridSearchCV
from   scikitplot.metrics import confusion_matrix , plot_roc_curve
from   sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from   sklearn.linear_model import LogisticRegression
from   sklearn.tree import DecisionTreeClassifier
from   sklearn.ensemble import RandomForestClassifier
from   xgboost import XGBClassifier
from   lightgbm import LGBMClassifier
from   imblearn.over_sampling import RandomOverSampler
import pickle
from category_encoders import OneHotEncoder

# Maths
import math

# Set the display options: let pandas show up to 800 rows and 500 columns
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
# Render matplotlib figures inline in the notebook output
%matplotlib inline

### Load the dataset

Load the dataset using pd.read_csv()

In [21]:
# Load the CSV file named in the configuration cell into a DataFrame
df = pd.read_csv(filepath_or_buffer=input_file_name)

# Preview the first 5 rows of the dataset
df.head(n=5)

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,1/1/2021,43,,58,29,35,65,65,O3,SEDANG,DKI2
1,1/2/2021,58,,86,38,64,80,86,PM25,SEDANG,DKI3
2,1/3/2021,64,,93,25,62,86,93,PM25,SEDANG,DKI3
3,1/4/2021,50,,67,24,31,77,77,O3,SEDANG,DKI2
4,1/5/2021,59,,89,24,35,77,89,PM25,SEDANG,DKI3


### Descriptive Statistics

As the name suggests, descriptive statistics describe the data. They give you information about:
- Mean, median, mode 
- Min, max
- Count etc

Let's understand the data we have

In [22]:
# Dimensions of the data as a (rows, columns) tuple
df.shape

(365, 11)

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2,max
count,365.0,334.0,365.0,365.0,365.0,365.0,365.0
mean,60.506849,94.694611,52.753425,15.391781,49.805479,34.115068,94.030137
std,15.155896,24.153839,11.193823,5.857975,12.23479,15.974152,24.408647
min,19.0,33.0,37.0,7.0,20.0,9.0,45.0
25%,53.0,78.25,45.0,11.0,41.0,24.0,77.0
50%,62.0,94.5,52.0,14.0,49.0,31.0,93.0
75%,68.0,108.75,55.0,18.0,57.0,39.0,108.0
max,179.0,174.0,126.0,47.0,151.0,134.0,179.0


In [24]:
# Number of missing values in every column
df.isna().sum()

tanggal      0
pm10         0
pm25        31
so2          0
co           0
o3           0
no2          0
max          0
critical     0
categori     0
location     0
dtype: int64

In [41]:
# Keep only the rows that have no missing value in any column
dfNew = df.dropna(axis=0, how='any')

In [61]:
# First 5 rows of dfNew (the data after rows with missing values were dropped)
dfNew.head()

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
31,2/1/2021,73,126.0,38,26,46,34,126,PM25,TIDAK SEHAT,DKI5
32,2/2/2021,53,70.0,40,14,55,25,70,PM25,SEDANG,DKI3
33,2/3/2021,32,53.0,40,11,42,19,53,PM25,SEDANG,DKI3
34,2/4/2021,36,59.0,40,14,47,24,59,PM25,SEDANG,DKI5
35,2/5/2021,29,51.0,40,14,45,35,51,PM25,SEDANG,DKI3


In [75]:
# One-hot encode the two categorical columns; every distinct value of
# 'critical' and 'location' becomes its own 0/1 indicator column.
X = (
    OneHotEncoder(cols=['critical', 'location'])
    .fit_transform(dfNew[['critical', 'location']])
)
X.head()

  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():


Unnamed: 0,critical_1,critical_2,critical_3,critical_4,critical_5,location_1,location_2,location_3,location_4,location_5
31,1,0,0,0,0,1,0,0,0,0
32,1,0,0,0,0,0,1,0,0,0
33,1,0,0,0,0,0,1,0,0,0
34,1,0,0,0,0,1,0,0,0,0
35,1,0,0,0,0,0,1,0,0,0


In [74]:
# Combine the first six columns of dfNew (tanggal, pm10, pm25, so2, co, o3)
# with the one-hot encoded columns held in X, side by side on the row index.
# NOTE(review): this slice leaves out the no2 and max columns — confirm that is intended.
dfMerge = pd.concat(
    [dfNew.iloc[:, :6], X],
    axis="columns",
)
dfMerge.head()

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,critical_1,critical_2,critical_3,critical_4,critical_5,location_1,location_2,location_3,location_4,location_5
31,2/1/2021,73,126.0,38,26,46,1,0,0,0,0,1,0,0,0,0
32,2/2/2021,53,70.0,40,14,55,1,0,0,0,0,0,1,0,0,0
33,2/3/2021,32,53.0,40,11,42,1,0,0,0,0,0,1,0,0,0
34,2/4/2021,36,59.0,40,14,47,1,0,0,0,0,1,0,0,0,0
35,2/5/2021,29,51.0,40,14,45,1,0,0,0,0,0,1,0,0,0


In [77]:
# Remove the date column, which is not used as a model feature.
# errors='ignore' keeps this cell re-runnable: because the result is assigned back
# to dfMerge, a second run would otherwise raise KeyError ('tanggal' already gone).
dfMerge = dfMerge.drop(columns=['tanggal'], errors='ignore')
dfMerge.head()

Unnamed: 0,pm10,pm25,so2,co,o3,critical_1,critical_2,critical_3,critical_4,critical_5,location_1,location_2,location_3,location_4,location_5
31,73,126.0,38,26,46,1,0,0,0,0,1,0,0,0,0
32,53,70.0,40,14,55,1,0,0,0,0,0,1,0,0,0
33,32,53.0,40,11,42,1,0,0,0,0,0,1,0,0,0
34,36,59.0,40,14,47,1,0,0,0,0,1,0,0,0,0
35,29,51.0,40,14,45,1,0,0,0,0,0,1,0,0,0


In [79]:
# Map each air-quality class label to an integer code, numbered in listing order
dictForClasses = {
    label: code
    for code, label in enumerate(['TIDAK SEHAT', 'SEDANG', 'BAIK'])
}
dictForClasses

{'TIDAK SEHAT': 0, 'SEDANG': 1, 'BAIK': 2}

In [82]:
# Replace the text labels in the 'categori' column with their integer codes.
# - Work on an explicit copy so the assignment never writes into a slice of `df`
#   (the row-by-row iloc assignment triggered pandas' SettingWithCopyWarning).
# - Map the whole column at once instead of looping over a hard-coded row count (334)
#   and a positional column index.
dfNew = dfNew.copy()
dfNew['categori'] = dfNew['categori'].map(
    # Values that are not keys of dictForClasses (e.g. the already-encoded integers
    # when this cell is re-run) pass through unchanged, so the cell is idempotent.
    # A genuinely unknown text label is left as-is and will fail at the later int64 cast.
    lambda label: dictForClasses.get(label, label)
)

In [83]:
# Preview the encoded frame; a bare `dfNew` would dump every row because
# display.max_rows is raised to 800 in the options cell.
dfNew.head(10)

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
31,2/1/2021,73,126.0,38,26,46,34,126,PM25,0,DKI5
32,2/2/2021,53,70.0,40,14,55,25,70,PM25,1,DKI3
33,2/3/2021,32,53.0,40,11,42,19,53,PM25,1,DKI3
34,2/4/2021,36,59.0,40,14,47,24,59,PM25,1,DKI5
35,2/5/2021,29,51.0,40,14,45,35,51,PM25,1,DKI3
36,2/6/2021,34,53.0,40,8,57,15,57,O3,1,DKI2
37,2/7/2021,33,55.0,40,10,57,13,57,O3,1,DKI2
38,2/8/2021,26,44.0,39,10,54,17,54,O3,1,DKI2
39,2/9/2021,33,57.0,40,13,47,22,57,PM25,1,DKI4
40,2/10/2021,50,64.0,40,13,49,16,64,PM25,1,DKI3


In [84]:
# Feature matrix: every column of dfMerge as a (copied) NumPy array
X = dfMerge.to_numpy(copy=True)
# Target vector: column 9 of dfNew ('categori', holding the integer class codes) as int64
y = dfNew.iloc[:, 9].to_numpy().astype('int64')

In [85]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [90]:
# Select the algorithm according to the input provided.
# The configuration cell spells the logistic-regression option 'LogisiticRegression';
# that spelling is honoured, and the correctly spelled name is accepted as well.
if input_ml_algo in ('LogisiticRegression', 'LogisticRegression'):
    model = LogisticRegression()
elif input_ml_algo == 'DecisionTreeClassifier':
    model = DecisionTreeClassifier()
elif input_ml_algo == 'RandomForestClassifier':
    model = RandomForestClassifier()
elif input_ml_algo == 'XGBClassifier':
    model = XGBClassifier()
elif input_ml_algo == 'LGBMClassifier':
    model = LGBMClassifier()
else:
    # Fail loudly instead of leaving `model` undefined (NameError at fit time)
    # or silently reusing a stale `model` left in the kernel by an earlier run.
    raise ValueError(f"Unsupported input_ml_algo: {input_ml_algo!r}")

In [91]:
# Train the selected model on the training split
model.fit(X_train, y_train)

model