In [2]:
##Introduction:
##This program aims to provide a simple demonstration on how to use Jupyter notebook to conduct the machine learning. 

##Objective:
##To classify a fruit into Barhee, Deglet Nour, Sukkary, Rotab Mozafati, Ruthana, Safawi, and Sagai
##based on 34 features including morphological features, shape, and color, were extracted from these images.
## i.e. a multi-class problem

#Sample data from: DATASET: https://www.muratkoklu.com/datasets/
#Reference: https://www.hindawi.com/journals/mpe/2021/4793293/
#KOKLU, M., KURSUN, R., TASPINAR, Y. S., and CINAR, I. (2021). Classification of Date Fruits into Genetic Varieties Using Image Analysis. Mathematical Problems in Engineering, Vol.2021

import os
import pandas as pd
from tkinter import *
import sys 
from matplotlib import pyplot as plt

In [3]:
#Load the data file from local storage
os.chdir("../dataset")
df_target = pd.read_csv('date_fruit_datasets_1.csv', encoding='utf-8')
#Quick overview on the sample data
df_target.head()

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,...,KurtosisRR,KurtosisRG,KurtosisRB,EntropyRR,EntropyRG,EntropyRB,ALLdaub4RR,ALLdaub4RG,ALLdaub4RB,Class
0,422163,2378.908,837.8484,645.6693,0.6373,733.1539,0.9947,424428,0.7831,1.2976,...,3.237,2.9574,4.2287,-59191260000.0,-50714214400,-39922372608,58.7255,54.9554,47.84,BERHI
1,338136,2085.144,723.8198,595.2073,0.569,656.1464,0.9974,339014,0.7795,1.2161,...,2.6228,2.635,3.1704,-34233070000.0,-37462601728,-31477794816,50.0259,52.8168,47.8315,BERHI
2,526843,2647.394,940.7379,715.3638,0.6494,819.0222,0.9962,528876,0.7657,1.315,...,3.7516,3.8611,4.7192,-93948350000.0,-74738221056,-60311207936,65.4772,59.286,51.9378,BERHI
3,416063,2351.21,827.9804,645.2988,0.6266,727.8378,0.9948,418255,0.7759,1.2831,...,5.0401,8.6136,8.2618,-32074310000.0,-32060925952,-29575010304,43.39,44.1259,41.1882,BERHI
4,347562,2160.354,763.9877,582.8359,0.6465,665.2291,0.9908,350797,0.7569,1.3108,...,2.7016,2.9761,4.4146,-39980970000.0,-35980042240,-25593278464,52.7743,50.908,42.6666,BERHI


In [4]:
#Check the basic info of data columns and the data type
from sklearn.metrics import mean_absolute_percentage_error
df_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AREA           898 non-null    int64  
 1   PERIMETER      898 non-null    float64
 2   MAJOR_AXIS     898 non-null    float64
 3   MINOR_AXIS     898 non-null    float64
 4   ECCENTRICITY   898 non-null    float64
 5   EQDIASQ        898 non-null    float64
 6   SOLIDITY       898 non-null    float64
 7   CONVEX_AREA    898 non-null    int64  
 8   EXTENT         898 non-null    float64
 9   ASPECT_RATIO   898 non-null    float64
 10  ROUNDNESS      898 non-null    float64
 11  COMPACTNESS    898 non-null    float64
 12  SHAPEFACTOR_1  898 non-null    float64
 13  SHAPEFACTOR_2  898 non-null    float64
 14  SHAPEFACTOR_3  898 non-null    float64
 15  SHAPEFACTOR_4  898 non-null    float64
 16  MeanRR         898 non-null    float64
 17  MeanRG         898 non-null    float64
 18  MeanRB    

In [5]:
#To remove dataframe with duplicate rows, only keep the first occurrence when necessary
df_target.drop_duplicates(keep = 'first' , inplace = True)
df_target.shape

(898, 35)

In [6]:
#Count the no. of classified labels, i.e. in this sample, 7 fruit types
df_target.Class.value_counts()

DOKOL     204
SAFAVI    199
ROTANA    166
DEGLET     98
SOGAY      94
IRAQI      72
BERHI      65
Name: Class, dtype: int64

In [7]:
#Start trining the classification model
from sklearn.model_selection import train_test_split
train, test = train_test_split( df_target, test_size= 0.2, random_state = 42, shuffle = True)
train.shape, test.shape

((718, 35), (180, 35))

In [8]:
train.isnull().sum().sort_values(ascending=False)

AREA             0
KurtosisRG       0
StdDevRG         0
StdDevRB         0
SkewRR           0
SkewRG           0
SkewRB           0
KurtosisRR       0
KurtosisRB       0
MeanRB           0
EntropyRR        0
EntropyRG        0
EntropyRB        0
ALLdaub4RR       0
ALLdaub4RG       0
ALLdaub4RB       0
StdDevRR         0
MeanRG           0
PERIMETER        0
EXTENT           0
MAJOR_AXIS       0
MINOR_AXIS       0
ECCENTRICITY     0
EQDIASQ          0
SOLIDITY         0
CONVEX_AREA      0
ASPECT_RATIO     0
MeanRR           0
ROUNDNESS        0
COMPACTNESS      0
SHAPEFACTOR_1    0
SHAPEFACTOR_2    0
SHAPEFACTOR_3    0
SHAPEFACTOR_4    0
Class            0
dtype: int64

In [9]:
pd.options.display.max_columns = None
train[train.isna().any(axis=1)]

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,ROUNDNESS,COMPACTNESS,SHAPEFACTOR_1,SHAPEFACTOR_2,SHAPEFACTOR_3,SHAPEFACTOR_4,MeanRR,MeanRG,MeanRB,StdDevRR,StdDevRG,StdDevRB,SkewRR,SkewRG,SkewRB,KurtosisRR,KurtosisRG,KurtosisRB,EntropyRR,EntropyRG,EntropyRB,ALLdaub4RR,ALLdaub4RG,ALLdaub4RB,Class


In [10]:
test[test.isna().any(axis=1)]

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,ROUNDNESS,COMPACTNESS,SHAPEFACTOR_1,SHAPEFACTOR_2,SHAPEFACTOR_3,SHAPEFACTOR_4,MeanRR,MeanRG,MeanRB,StdDevRR,StdDevRG,StdDevRB,SkewRR,SkewRG,SkewRB,KurtosisRR,KurtosisRG,KurtosisRB,EntropyRR,EntropyRG,EntropyRB,ALLdaub4RR,ALLdaub4RG,ALLdaub4RB,Class


In [11]:
from flaml import AutoML
automl = AutoML()
y = train.pop('Class')
X = train

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3, random_state = 42,shuffle = True)

In [13]:
automl.fit(X_train, y_train, task="classification",metric='log_loss',time_budget=300)

[flaml.automl: 08-24 14:55:47] {2444} INFO - task = classification
[flaml.automl: 08-24 14:55:47] {2446} INFO - Data split method: stratified
[flaml.automl: 08-24 14:55:47] {2449} INFO - Evaluation method: cv
[flaml.automl: 08-24 14:55:47] {2568} INFO - Minimizing error metric: log_loss
[flaml.automl: 08-24 14:55:47] {2708} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl: 08-24 14:55:47] {3010} INFO - iteration 0, current learner lgbm


In [None]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best log_loss on validation data: {0:.4g}'.format(automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

NameError: name 'automl' is not defined

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_train, automl.predict(X_train)))

              precision    recall  f1-score   support

       BERHI       1.00      0.94      0.97        36
      DEGLET       0.98      0.91      0.94        55
       DOKOL       0.96      1.00      0.98       114
       IRAQI       0.95      1.00      0.98        41
      ROTANA       1.00      0.99      0.99        93
      SAFAVI       1.00      1.00      1.00       111
       SOGAY       0.98      0.98      0.98        52

    accuracy                           0.98       502
   macro avg       0.98      0.97      0.98       502
weighted avg       0.98      0.98      0.98       502

