In [5]:
from __future__ import print_function

import IPython
import numpy
import pandas
import sklearn

# Report the version of every library imported above so the outputs recorded
# in this notebook can be tied to a concrete environment.
print('IPython:', IPython.__version__)
print('numpy:', numpy.__version__)
print('pandas:', pandas.__version__)
print('scikit-learn:', sklearn.__version__)

IPython: 8.16.1
numpy: 1.26.1
pandas: 2.1.1
scikit-learn: 1.5.0


In [6]:
# Load the Iris dataset through scikit-learn's load_iris helper
from sklearn.datasets import load_iris

iris = load_iris()

# NumPy vs. pandas, as contrasted in the cells that follow:
# - a NumPy array holds only the values; it carries no feature (attribute) names
# - a pandas DataFrame keeps the feature (attribute) names alongside the values

In [7]:
# NumPy representation: feature values go in X, class labels go in y
X = iris.data
y = iris.target

# Preview the first five samples, then their five labels (one print, two lines)
print(X[:5], y[:5], sep="\n")

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]


In [8]:
# pandas representation: the same values, with columns labelled by feature name

import pandas as pd

# Build the feature columns and append the class label as a 'target' column
# in a single chained expression.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
# Build the pipelines

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC


def build_scaled_svc():
    """Return a new, unfitted pipeline: StandardScaler followed by LinearSVC."""
    return make_pipeline(StandardScaler(), LinearSVC())


# Two separate but identically configured pipelines:
# one for the NumPy arrays ...
pipe_numpy = build_scaled_svc()
# ... and one for the pandas DataFrame
pipe_pandas = build_scaled_svc()

In [10]:
# Split the data into training and test sets

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Settings shared by both splits: hold out 20% of the rows for testing and
# fix the random seed so the split is reproducible.
split_options = dict(test_size=0.2, random_state=46)

# NumPy: features and labels live in separate arrays, so both are split together
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# pandas: the label is a column of df, so the single frame is split
df_train, df_test = train_test_split(df, **split_options)

In [11]:
# Train both pipelines and evaluate each on its held-out test set

# --- NumPy ---
# Fit on the training arrays, then predict the test arrays for evaluation
y_pred = pipe_numpy.fit(X_train, y_train).predict(X_test)
print("Hasil Evaluasi Model Numpy")
print(classification_report(y_test, y_pred))

# --- pandas ---
# Separate the features from the label: the X_* frames have the 'target'
# column removed, while the y_* series keep only the 'target' column.
X_train_df = df_train.drop(columns='target')
y_train_df = df_train['target']
X_test_df = df_test.drop(columns='target')
y_test_df = df_test['target']

y_pred_df = pipe_pandas.fit(X_train_df, y_train_df).predict(X_test_df)
print("Hasil Evaluasi Model Pandas")
print(classification_report(y_test_df, y_pred_df))

Hasil Evaluasi Model Numpy
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.88      0.78      0.82         9
           2       0.80      0.89      0.84         9

    accuracy                           0.90        30
   macro avg       0.89      0.89      0.89        30
weighted avg       0.90      0.90      0.90        30

Hasil Evaluasi Model Pandas
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.88      0.78      0.82         9
           2       0.80      0.89      0.84         9

    accuracy                           0.90        30
   macro avg       0.89      0.89      0.89        30
weighted avg       0.90      0.90      0.90        30



In [12]:
# Persist both fitted pipelines to disk as pickle (.pkl) files
import pickle

# "wb" = write binary, the file mode pickle.dump needs
for pkl_path, fitted_pipeline in (
    ("pemodelan_numpy.pkl", pipe_numpy),    # pipeline fitted on the NumPy arrays
    ("pemodelan_pandas.pkl", pipe_pandas),  # pipeline fitted on the DataFrame
):
    with open(pkl_path, "wb") as model_file:
        pickle.dump(fitted_pipeline, model_file)

In [13]:
# Load the saved models: moving from model development to model deployment

In [14]:
# NumPy model: read the pickled pipeline back from disk
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open("pemodelan_numpy.pkl", "rb") as model_file:
    loaded_numpy_model = pickle.load(model_file)

# "rb" = read binary (the file mode used for loading)

In [15]:
# Predict with the model loaded from the NumPy pickle

# The double brackets make new_data a matrix: one row with four columns
new_data = [[1,1,1,1]]
loaded_numpy_model.predict(new_data)

array([1])

In [16]:
# What does the prediction array([1]) mean?
# The predicted number is a position in the list of target names:
iris.target_names
# 0 = setosa, 1 = versicolor, 2 = virginica

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [17]:
# pandas model: read the pickled pipeline back from disk
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open("pemodelan_pandas.pkl", "rb") as model_file:
    loaded_pandas_model = pickle.load(model_file)

In [18]:
# Predict with the model loaded from the pandas pickle

# Build the single sample directly as a one-row DataFrame whose columns carry
# the iris feature names.
new_data = pd.DataFrame([[1, 1, 1, 1]], columns=iris.feature_names)
loaded_pandas_model.predict(new_data)

array([1])

In [19]:
# Display new_data to inspect the sample and its column labels
new_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,1,1,1,1


In [20]:
# Display the feature names provided by the iris dataset object
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']