# Import Modules

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve, accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc

# Initialize and load dataset

In [54]:
test = pd.read_csv("dataset/multiclass_classification/test.csv")
train = pd.read_csv("dataset/multiclass_classification/train.csv")

# Show in table

In [55]:
train.head()

Unnamed: 0,invoice_id,branch,city,customer_type,gender,unit_price,quantity,tax_5_percents,total,month,day,year,hours,minutes,payment,cogs,gross_margin_percentage,gross_income,rating,product_line
0,848-62-7243,0,2,0,1,24.89,9,11.2005,235.2105,3,15,2019,15,36,0,224.01,4.761905,11.2005,7.4,3
1,583-41-4548,2,1,0,1,16.67,7,5.8345,122.5245,2,7,2019,11,36,2,116.69,4.761905,5.8345,7.4,4
2,800-09-8606,0,2,1,0,87.37,5,21.8425,458.6925,1,29,2019,19,45,0,436.85,4.761905,21.8425,6.6,4
3,283-26-5248,2,1,1,0,98.52,10,49.26,1034.46,1,30,2019,20,23,2,985.2,4.761905,49.26,4.5,2
4,288-62-1085,0,2,1,1,38.54,5,9.635,202.335,1,9,2019,13,34,2,192.7,4.761905,9.635,5.6,1


## Set feature and target to train and test dataset

### Train dataset

In [56]:
X_train = train.loc[:, ['branch', 'city', 'gender', 'customer_type', 'unit_price',
                        'quantity', 'tax_5_percents', 'total', 'payment', 'cogs',
                        'gross_margin_percentage', 'gross_income', 'rating', 'month',
                        'day', 'year', 'hours', 'minutes']
                   ]

y_train = train.iloc[:, 19:20]

In [57]:
X_train.head()

Unnamed: 0,branch,city,gender,customer_type,unit_price,quantity,tax_5_percents,total,payment,cogs,gross_margin_percentage,gross_income,rating,month,day,year,hours,minutes
0,0,2,1,0,24.89,9,11.2005,235.2105,0,224.01,4.761905,11.2005,7.4,3,15,2019,15,36
1,2,1,1,0,16.67,7,5.8345,122.5245,2,116.69,4.761905,5.8345,7.4,2,7,2019,11,36
2,0,2,0,1,87.37,5,21.8425,458.6925,0,436.85,4.761905,21.8425,6.6,1,29,2019,19,45
3,2,1,0,1,98.52,10,49.26,1034.46,2,985.2,4.761905,49.26,4.5,1,30,2019,20,23
4,0,2,1,1,38.54,5,9.635,202.335,2,192.7,4.761905,9.635,5.6,1,9,2019,13,34


In [58]:
y_train.head()

Unnamed: 0,product_line
0,3
1,4
2,4
3,2
4,1


### Test dataset

In [59]:
X_test = test.loc[:, ['branch', 'city', 'gender', 'customer_type', 'unit_price',
                      'quantity', 'tax_5_percents', 'total', 'payment', 'cogs',
                      'gross_margin_percentage', 'gross_income', 'rating', 'month',
                      'day', 'year', 'hours', 'minutes']
                 ]

y_test = test.iloc[:, 19:20]

In [60]:
X_test.shape

(200, 18)

# Multi Classification with OneVsRestClassifier

## Prepare Model

### Initialize parameter model

In [61]:
multiclass_classification = OneVsRestClassifier(SVC()).fit(X_test, y_test)


### Train Model

In [62]:
X_train.head()

Unnamed: 0,branch,city,gender,customer_type,unit_price,quantity,tax_5_percents,total,payment,cogs,gross_margin_percentage,gross_income,rating,month,day,year,hours,minutes
0,0,2,1,0,24.89,9,11.2005,235.2105,0,224.01,4.761905,11.2005,7.4,3,15,2019,15,36
1,2,1,1,0,16.67,7,5.8345,122.5245,2,116.69,4.761905,5.8345,7.4,2,7,2019,11,36
2,0,2,0,1,87.37,5,21.8425,458.6925,0,436.85,4.761905,21.8425,6.6,1,29,2019,19,45
3,2,1,0,1,98.52,10,49.26,1034.46,2,985.2,4.761905,49.26,4.5,1,30,2019,20,23
4,0,2,1,1,38.54,5,9.635,202.335,2,192.7,4.761905,9.635,5.6,1,9,2019,13,34


In [63]:
multiclass_classification.fit(X_train, np.ravel(y_train))

OneVsRestClassifier(estimator=SVC())

### Accuracy Score

In [64]:
accuracy_score(y_test, multiclass_classification.predict(X_test))

0.19

In [65]:
predicted = multiclass_classification.predict(X_test)

## Use Model for predict target class

### Prediction with Features

In [66]:
train.head()

Unnamed: 0,invoice_id,branch,city,customer_type,gender,unit_price,quantity,tax_5_percents,total,month,day,year,hours,minutes,payment,cogs,gross_margin_percentage,gross_income,rating,product_line
0,848-62-7243,0,2,0,1,24.89,9,11.2005,235.2105,3,15,2019,15,36,0,224.01,4.761905,11.2005,7.4,3
1,583-41-4548,2,1,0,1,16.67,7,5.8345,122.5245,2,7,2019,11,36,2,116.69,4.761905,5.8345,7.4,4
2,800-09-8606,0,2,1,0,87.37,5,21.8425,458.6925,1,29,2019,19,45,0,436.85,4.761905,21.8425,6.6,4
3,283-26-5248,2,1,1,0,98.52,10,49.26,1034.46,1,30,2019,20,23,2,985.2,4.761905,49.26,4.5,2
4,288-62-1085,0,2,1,1,38.54,5,9.635,202.335,1,9,2019,13,34,2,192.7,4.761905,9.635,5.6,1


In [67]:
branch                  = 0
city                    = 2
gender                  = 1
customer_type           = 3
unit_price              = 24.89
quantity                = 9
tax_5_percents          = 11.2005
total                   = 235.2105
payment                 = 0
cogs                    = 224.01
gross_margin_percentage = 4.761905
gross_income            = 11.2005
rating                  = 7.4
month                   = 3
day                     = 15
year                    = 2019
hours                   = 15
minutes                 = 36


multiclass_classification.predict([[branch, city, gender, customer_type, unit_price, quantity, tax_5_percents, total, payment, cogs, gross_margin_percentage, gross_income, rating, month, day, year, hours, minutes]])

array([0], dtype=int64)

### Train with 3-Folds Cross Validation

In [68]:
cross_val_score(multiclass_classification, X_train, np.ravel(y_train), cv=3, scoring="accuracy")

array([0.16104869, 0.16853933, 0.18045113])

# Evaluate with Precision, Recall, & F1-Measure

## Confusion Matrix

In [69]:
y_train_pred = cross_val_predict(multiclass_classification, X_train, np.ravel(y_train), cv=3)
print(f'True positive  : {confusion_matrix(np.ravel(y_train), y_train_pred)[1, 1]}')
print(f'True negative  : {confusion_matrix(np.ravel(y_train), y_train_pred)[0, 0]}')
print(f'False positive : {confusion_matrix(np.ravel(y_train), y_train_pred)[0, 1]}')
print(f'False negative : {confusion_matrix(np.ravel(y_train), y_train_pred)[1, 0]}')

True positive  : 34
True negative  : 28
False positive : 23
False negative : 24


## Precision

In [70]:
print(f'Precision: {round(precision_score(np.ravel(y_train), y_train_pred, average="weighted") * 100, 2)}%')

Precision: 17.22%


## Recall

In [71]:
print(f'Recall: {round(recall_score(np.ravel(y_train), y_train_pred, average="weighted") * 100, 2)}%')

Recall: 17.0%


## F-1 Score

In [72]:
print(f'F-1 Score: {round(f1_score(np.ravel(y_train), y_train_pred, average="weighted") * 100, 2)}%')

F-1 Score: 16.14%


## Classification Report

In [73]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.22      0.50      0.30        38
           1       0.33      0.08      0.13        38
           2       0.14      0.20      0.17        25
           3       0.17      0.11      0.14        35
           4       0.14      0.18      0.15        34
           5       0.50      0.03      0.06        30

    accuracy                           0.19       200
   macro avg       0.25      0.18      0.16       200
weighted avg       0.25      0.19      0.16       200



# Precision-Recall Tradeoff

## Precision Recall VS Threshold Plot

In [74]:
y_scores = cross_val_predict(multiclass_classification, X_train, np.ravel(y_train), cv = 3, method = "decision_function")

In [89]:
n_classes = y_train.shape[1]

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[i], y_train_pred[i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_train_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

KeyError: 0

In [77]:
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

KeyError: 2

Unnamed: 0,product_line
0,4
1,0
2,4
3,5
4,5
