# Import Modules

In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve

# Initialize and load dataset

In [2]:
train = pd.read_csv('../dataset/multiclass_classification/train.csv')
test  = pd.read_csv('../dataset/multiclass_classification/test.csv')

# Show in table

## Train dataset

In [3]:
print(f'Number of row    : { train.shape[0] }')
print(f'Number of column : { train.shape[1] }')

Number of row    : 800
Number of column : 20


In [4]:
train.head()

Unnamed: 0,invoice_id,branch,city,customer_type,gender,unit_price,quantity,tax_5_percents,total,month,day,year,hours,minutes,payment,cogs,gross_margin_percentage,gross_income,rating,product_line
0,848-62-7243,0,2,0,1,24.89,9,11.2005,235.2105,3,15,2019,15,36,0,224.01,4.761905,11.2005,7.4,3
1,583-41-4548,2,1,0,1,16.67,7,5.8345,122.5245,2,7,2019,11,36,2,116.69,4.761905,5.8345,7.4,4
2,800-09-8606,0,2,1,0,87.37,5,21.8425,458.6925,1,29,2019,19,45,0,436.85,4.761905,21.8425,6.6,4
3,283-26-5248,2,1,1,0,98.52,10,49.26,1034.46,1,30,2019,20,23,2,985.2,4.761905,49.26,4.5,2
4,288-62-1085,0,2,1,1,38.54,5,9.635,202.335,1,9,2019,13,34,2,192.7,4.761905,9.635,5.6,1


## Test dataset

In [5]:
print(f'Number of row    : { test.shape[0] }')
print(f'Number of column : { test.shape[1] }')

Number of row    : 200
Number of column : 20


In [6]:
test.head()

Unnamed: 0,invoice_id,branch,city,customer_type,gender,unit_price,quantity,tax_5_percents,total,month,day,year,hours,minutes,payment,cogs,gross_margin_percentage,gross_income,rating,product_line
0,451-28-5717,2,1,1,0,83.17,6,24.951,523.971,3,20,2019,11,23,0,499.02,4.761905,24.951,7.3,4
1,137-63-5492,2,1,0,1,58.76,10,29.38,616.98,1,29,2019,14,26,2,587.6,4.761905,29.38,9.0,0
2,733-29-1227,2,1,0,1,55.61,7,19.4635,408.7335,3,23,2019,12,41,0,389.27,4.761905,19.4635,8.5,4
3,322-02-2271,1,0,0,0,42.97,3,6.4455,135.3555,2,3,2019,11,46,0,128.91,4.761905,6.4455,9.3,5
4,569-71-4390,1,0,0,1,21.87,2,2.187,45.927,1,25,2019,14,29,2,43.74,4.761905,2.187,6.9,5


# Split dataset into train and test set

## Train dataset

In [7]:
X_train = train.loc[:, ['branch', 'city', 'customer_type', 'gender', 'unit_price',
                        'quantity', 'tax_5_percents', 'total', 'month', 'day', 'year', 'hours',
                        'minutes', 'payment', 'cogs', 'gross_margin_percentage', 'gross_income',
                        'rating',]
                   ]

y_train = train.iloc[:, 19:20]

## Test dataset

In [8]:
X_test = test.loc[:, ['branch', 'city', 'customer_type', 'gender', 'unit_price',
                      'quantity', 'tax_5_percents', 'total', 'month', 'day', 'year', 'hours',
                      'minutes', 'payment', 'cogs', 'gross_margin_percentage', 'gross_income',
                      'rating',]
                 ]

y_test = test.iloc[:, 19:20]

# Create Model

## Train model

### Logistic Regression

In [9]:
# ovr_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, multi_class='ovr', solver='liblinear', random_state=42))
# ovr_clf.fit(X_train, y_train)

### Support Vector Classifier

In [10]:
ovr_clf = OneVsRestClassifier(SVC(gamma=0.001, random_state=42))
ovr_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=SVC(gamma=0.001, random_state=42))

## Test model
Hasil prediksi dari model tidak bagus

### Predicted

In [11]:
ovr_clf.predict(X_test.loc[:, :])

array([2, 0, 2, 2, 0, 0, 3, 1, 3, 5, 1, 1, 4, 0, 1, 4, 0, 0, 0, 3, 2, 4,
       1, 0, 5, 1, 4, 0, 5, 2, 3, 0, 5, 2, 0, 1, 0, 3, 2, 0, 0, 5, 0, 2,
       1, 1, 0, 2, 4, 3, 3, 4, 5, 3, 2, 4, 0, 0, 5, 2, 0, 0, 0, 5, 1, 5,
       5, 4, 2, 0, 1, 2, 0, 1, 1, 1, 2, 5, 0, 1, 1, 1, 3, 3, 5, 3, 0, 3,
       5, 1, 2, 2, 2, 3, 3, 0, 0, 1, 4, 0, 3, 2, 2, 1, 1, 2, 5, 5, 2, 5,
       5, 1, 5, 5, 1, 3, 0, 3, 5, 0, 4, 4, 2, 0, 2, 3, 2, 1, 3, 5, 2, 3,
       1, 1, 2, 1, 2, 1, 1, 0, 0, 3, 3, 2, 1, 4, 5, 4, 1, 5, 0, 2, 5, 3,
       3, 0, 1, 0, 2, 1, 0, 5, 3, 1, 4, 2, 2, 3, 2, 2, 4, 2, 0, 3, 0, 5,
       0, 1, 4, 1, 5, 0, 4, 2, 1, 5, 5, 5, 0, 1, 4, 2, 0, 3, 3, 5, 4, 0,
       0, 2], dtype=int64)

### Actual

In [12]:
np.ravel(y_test)

array([4, 0, 4, 5, 5, 3, 5, 0, 2, 0, 0, 1, 3, 4, 3, 2, 4, 4, 0, 3, 0, 2,
       1, 1, 0, 5, 1, 3, 3, 1, 0, 5, 2, 5, 0, 3, 4, 5, 4, 0, 3, 3, 2, 1,
       4, 1, 5, 1, 0, 5, 4, 0, 1, 3, 0, 1, 0, 1, 2, 0, 0, 4, 5, 0, 1, 3,
       1, 5, 2, 0, 5, 4, 3, 2, 0, 4, 4, 4, 3, 4, 2, 3, 1, 1, 3, 5, 4, 1,
       4, 2, 2, 0, 4, 1, 3, 5, 3, 5, 5, 3, 1, 0, 3, 0, 3, 0, 0, 2, 2, 1,
       2, 3, 1, 1, 1, 0, 5, 0, 4, 3, 1, 1, 3, 0, 1, 4, 1, 3, 3, 5, 4, 1,
       5, 3, 5, 0, 0, 4, 4, 2, 3, 1, 5, 4, 0, 2, 3, 4, 5, 3, 2, 3, 2, 4,
       1, 5, 1, 4, 5, 2, 3, 5, 0, 0, 3, 3, 4, 2, 0, 5, 2, 0, 5, 3, 1, 4,
       4, 0, 1, 2, 1, 1, 5, 1, 0, 4, 2, 1, 4, 3, 4, 0, 0, 2, 5, 4, 1, 1,
       5, 2], dtype=int64)

# Performance Measurement

## Confusion Matrix

In [13]:
y_train_pred = cross_val_predict(ovr_clf, X_train, np.ravel(y_train), cv=3)
print(f'True positive  : {confusion_matrix(np.ravel(y_train), y_train_pred)[1, 1]}')
print(f'True negative  : {confusion_matrix(np.ravel(y_train), y_train_pred)[0, 0]}')
print(f'False positive : {confusion_matrix(np.ravel(y_train), y_train_pred)[0, 1]}')
print(f'False negative : {confusion_matrix(np.ravel(y_train), y_train_pred)[1, 0]}')

True positive  : 43
True negative  : 15
False positive : 28
False negative : 23


## Accuracy Score

In [14]:
accuracy_score(y_test, ovr_clf.predict(X_test))

0.14

## Cross Validation Score

In [15]:
cross_val_score(ovr_clf, X_train, np.ravel(y_train), cv=3, scoring="accuracy")

array([0.16479401, 0.19101124, 0.15789474])

## Precision

In [16]:
y_train_pred = cross_val_predict(ovr_clf, X_train, np.ravel(y_train), cv=3)


- average = micro says the function to compute f1 by considering total true positives, false negatives and false positives (no matter of the prediction for each label in the dataset)
- average = macro says the function to compute f1 for each label, and returns the average without considering the proportion for each label in the dataset.
- average = weighted says the function to compute f1 for each label, and returns the average considering the proportion for each label in the dataset.
- average = samples says the function to compute f1 for each instance, and returns the average. Use it for multilabel classification.


In [17]:
precision_score(np.ravel(y_train), y_train_pred, average='weighted')

0.16711327031080614

## Recall

In [18]:
recall_score(np.ravel(y_train), y_train_pred, average='weighted')

0.17125

## F-1 Score

In [19]:
f1_score(np.ravel(y_train), y_train_pred, average='weighted')

0.1683891189088838

## Classification Report

In [20]:
print(classification_report(y_test, ovr_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.20      0.24      0.22        38
           1       0.16      0.16      0.16        38
           2       0.11      0.16      0.13        25
           3       0.17      0.14      0.16        35
           4       0.11      0.06      0.08        34
           5       0.06      0.07      0.07        30

    accuracy                           0.14       200
   macro avg       0.13      0.14      0.13       200
weighted avg       0.14      0.14      0.14       200



# Note
Kemungkinan penyebab akurasi, recall, presisi, dan F-1 score rendah, antara lain:
- Jumlah dataset kurang, sekarang menggunakan 1000 cases.
- Proporsi kelas pada train dan test tidak sama (Bukan penyebab, sudah dicek).
- Hubungan antar fitur tidak kuat (bisa lihat heatmap correlation).
- Srategi One versus Rest (OVR) menggunakan model ML SVC, tidak cocok dengan kasus dataset yang digunakan.