# Import libraries

In [1]:
import datetime
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

%matplotlib inline

# Get the Dataset

In [2]:
# SECURITY: credentials must not live in the notebook. The full SQLAlchemy URL
# (format: 'mysql+mysqldb://<user>:<password>@<host>:3306/Sustayn') is read from
# the SUSTAYN_DB_URL environment variable; a missing variable raises KeyError.
# NOTE(review): the password that was previously committed here should be rotated.
sql_con_str = os.environ['SUSTAYN_DB_URL']  # NST02 / PROD
ENGINE = sa.create_engine(sql_con_str, pool_recycle = 3600)

# Extract the data from the database:
#  - every row of the view whose audit_date is NULL, plus
#  - one synthetic row per (DeviceType, device_id) for rows with an audit_date,
#    carrying only max(package_date), unit 'KG' and audit_status 'A'.
SQL = """
SELECT * 
FROM sustayn.v_ml_baler_productor_history
WHERE audit_date IS NULL
UNION
SELECT DeviceType, Null, Null, max(package_date), device_id, Null, Null, Null, Null, NUll, Null, NUll, 'KG', 'A', Null, Null, Null, Null
FROM sustayn.v_ml_baler_productor_history
WHERE audit_date IS NOT NULL
GROUP BY DeviceType, device_id;
"""

# Read the query result into a DataFrame and keep a CSV snapshot of it
df = pd.read_sql_query(SQL, ENGINE)
df.to_csv('data_frame_src_06292020.csv', index=False)
df.shape

(243, 18)

# Analyse Dataset

In [3]:
#df = pd.read_csv('data_frame_src_06292020-bk.csv', index_col=False)
#df.shape

In [4]:
# Replace every missing value with 0 (later cells compare against 0)
df = df.fillna(0)

In [5]:
# Convert package_date to datetime so records can be ordered chronologically.
# The unit is spelled out because pandas >= 2.0 rejects the unit-less
# 'datetime64' dtype in astype().
df['package_date'] = df['package_date'].astype('datetime64[ns]')

In [6]:
# Order the records by device_id, then chronologically within each device
df = df.sort_values(by=['device_id', 'package_date'], ascending=[True, True])

In [7]:
# Renumber the rows 0..n-1 in the current order, discarding the old index
df = df.reset_index(drop=True)

In [8]:
# Inspect the records of a single device
df.loc[df['device_id'].eq('d0e3f2d5-e865-40f4-897e-9b517bd394a6')]

Unnamed: 0,DeviceType,id,img_url,package_date,device_id,DeviceCode,package_id,barcode,material_type,material_description_from_original,ir_original_class,net_weight,unit,audit_status,audit_date,audit_userid,ir_class,ir_confidence
186,Baler,0,0,2020-06-29 14:48:22,d0e3f2d5-e865-40f4-897e-9b517bd394a6,0,0.0,0,0,0,0,0.0,KG,A,0,0,0,0.0


# Create extra labels

## material_description_from_original  

In [9]:
# Collapse the raw material description into three categories
def material_description(material):
    """Map a material description to one of three bale categories.

    Cardboard keywords are checked first, so a description matching both a
    cardboard and a film keyword is classified as cardboard.

    Parameters
    ----------
    material : str
        Material description; callers are expected to upper-case it first,
        because the keyword matching is case-sensitive.

    Returns
    -------
    str
        'BALED CARDBOARD', 'BALED FILM' or, when nothing matches, 'BALED EMPTY'.
    """
    cardboard_keywords = ('OCC', 'CARTON', 'CARDBOARD')
    film_keywords = ('FILM', 'LDP', 'PLAYO')            # matched as substrings
    film_exact_names = ('PLMX', 'SHRINK WRAP', 'BOMA')  # must match the whole value

    if any(keyword in material for keyword in cardboard_keywords):
        return 'BALED CARDBOARD'

    if any(keyword in material for keyword in film_keywords) or material in film_exact_names:
        return 'BALED FILM'

    return 'BALED EMPTY'

In [10]:
# Upper-case the raw description (cast to str first) and bucket it
raw_material_upper = df['material_description_from_original'].astype(str).str.upper()
df['material_description_f'] = raw_material_upper.apply(material_description)

In [11]:
# Number of records per derived material category
material_counts = df['material_description_f'].value_counts()
material_counts

BALED EMPTY        185
BALED CARDBOARD     48
BALED FILM          10
Name: material_description_f, dtype: int64

## material_description_prev

In [12]:
# shift(-1) takes the category from the FOLLOWING row of the same device_id
# group; the last row of each device gets NaN.
material_by_device = df.groupby('device_id')['material_description_f']
df['material_description_prev'] = material_by_device.shift(-1)

In [13]:
# Where the shift left no value, fall back to the record's own category
shifted_material = df['material_description_prev']
df['material_description_prev'] = shifted_material.where(shifted_material.notna(),
                                                         df['material_description_f'])

## package_date

In [14]:
# New field: package_date truncated to the minute (seconds dropped).
# dt.floor avoids the string round-trip and the unit-less 'datetime64' cast,
# which pandas >= 2.0 rejects.
df['package_date_f'] = df['package_date'].dt.floor('min')

In [15]:
# Create a time-interval delta
def diff_func(df):
    """Return the absolute gap, in minutes, between consecutive timestamps.

    Parameters
    ----------
    df : pandas.Series
        Datetime values (despite the parameter name, a Series is expected).

    Returns
    -------
    pandas.Series
        Float minutes between each element and the one before it; the first
        element has no predecessor and is NaN.
    """
    gap_minutes = df.diff().dt.total_seconds() / 60
    return gap_minutes.abs()

# Minutes between each record and the preceding record of the same device.
# group_keys=False keeps the result on df's own row index; without it,
# pandas >= 2.0 prepends device_id to the index and the assignment misaligns.
time_delta = df.groupby(['device_id'], group_keys=False)['package_date'].apply(diff_func)

# The first record of each device has no predecessor (NaN): count it as 0,
# then truncate the minutes to whole integers.
df['time_delta_f'] = time_delta.fillna(0).astype(int)

## standard_weight

In [16]:
# Express every weight in kilograms
def standard_weight(row):
    """Return the row's net weight in kilograms.

    Parameters
    ----------
    row : mapping
        Must provide 'unit' and 'net_weight'.

    Returns
    -------
    number
        ``net_weight`` converted from pounds and rounded to 0 decimals when
        ``unit`` is exactly 'LB'; otherwise ``net_weight`` unchanged.
    """
    weight = row['net_weight']
    if row['unit'] != 'LB':
        return weight
    return round(weight * 0.453592, 0)
    
# Apply the conversion row by row and store it as whole kilograms
weights_kg = df.apply(standard_weight, axis=1)
df['standard_weight_f'] = weights_kg.astype(int)

## barcode to label

In [17]:
# Additional field: the barcode, with the literal string 'null' replaced by 0
barcode_is_null_text = df['barcode'] == 'null'
df['barcode_f'] = np.where(barcode_is_null_text, 0, df['barcode'])

In [18]:
# 1 when the bale has a barcode (barcode_f is not 0), otherwise 0
df['label_f'] = np.where(df['barcode_f'] != 0, 1, 0)

## img_url to image

In [19]:
# 1 when the record has an image URL (img_url is not 0), otherwise 0
df['image_f'] = np.where(df['img_url'] != 0, 1, 0)

In [20]:
# Remove the columns listed below from the working frame
columns_to_remove = [
    'DeviceType',
    'DeviceCode',
    'package_id',
    'material_type',
    'barcode',
    'audit_date',
    'audit_userid',
    'ir_confidence',
]
df.drop(columns=columns_to_remove, inplace=True)

## duplicates in manual audit

In [21]:
# time_delta_f of the FOLLOWING record of the same device (60 when there is none)
minutes_to_next = df.groupby(by=['device_id'])['time_delta_f'].shift(-1, fill_value=60)

# Flag is 0 when the following record is at most 10 minutes away, otherwise 1
df['audit_duplicates_f'] = np.where(minutes_to_next <= 10, 0, 1)

In [22]:
# Same device as before, now with the derived columns, in chronological order
df.loc[df['device_id'].eq('d0e3f2d5-e865-40f4-897e-9b517bd394a6')].sort_values(by='package_date')

Unnamed: 0,id,img_url,package_date,device_id,material_description_from_original,ir_original_class,net_weight,unit,audit_status,ir_class,material_description_f,material_description_prev,package_date_f,time_delta_f,standard_weight_f,barcode_f,label_f,image_f,audit_duplicates_f
186,0,0,2020-06-29 14:48:22,d0e3f2d5-e865-40f4-897e-9b517bd394a6,0,0,0.0,KG,A,0,BALED EMPTY,BALED EMPTY,2020-06-29 14:48:00,0,0,0,0,0,1


## Eliminate audited records

In [23]:
# Remove every record whose audit_status is 'A', then report the new size
audited_index = df.index[df['audit_status'] == 'A']
df.drop(audited_index, inplace=True)
df.shape

(66, 19)

## Distinguish groups of similar items

In [24]:
# Force the flag to 1 where the shifted material category differs from the
# record's own; otherwise keep audit_duplicates_f.
material_differs = df['material_description_prev'] != df['material_description_f']
df['audit_duplicates_groups'] = np.where(material_differs, 1, df['audit_duplicates_f'])

In [25]:
# Own vs. shifted material category for one device
df.loc[df['device_id'].eq('461bfe45-a57f-4713-bde0-6388586f1507'),
       ['material_description_f', 'material_description_prev']]

Unnamed: 0,material_description_f,material_description_prev
61,BALED CARDBOARD,BALED CARDBOARD
62,BALED CARDBOARD,BALED CARDBOARD


In [26]:
# Own vs. shifted material category for another device
df.loc[df['device_id'].eq('9fb44f7b-c0db-495d-b99a-77850eb4d385'),
       ['material_description_f', 'material_description_prev']]

Unnamed: 0,material_description_f,material_description_prev


In [27]:
# Only the last expression of a cell is rendered automatically, so each
# device's transposed records are shown explicitly with display()
# (available as a builtin inside IPython/Jupyter).
for inspected_device in ('461bfe45-a57f-4713-bde0-6388586f1507',
                         'd0e3f2d5-e865-40f4-897e-9b517bd394a6'):
    display(df[df['device_id'] == inspected_device].T)

id
img_url
package_date
device_id
material_description_from_original
ir_original_class
net_weight
unit
audit_status
ir_class
material_description_f


In [28]:
# Transposed view (one column per record) of a single device
df.loc[df['device_id'].eq('9fb44f7b-c0db-495d-b99a-77850eb4d385')].T

id
img_url
package_date
device_id
material_description_from_original
ir_original_class
net_weight
unit
audit_status
ir_class
material_description_f


# Load Model

In [29]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a model file that comes from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open('baler_classifier.pkl', 'rb') as pickle_in:
    classifier = pickle.load(pickle_in)

# Dataset to predict

In [30]:
# Identifier columns plus the derived feature columns
model_source_columns = [
    'device_id',
    'id',
    'label_f',
    'image_f',
    'time_delta_f',
    'standard_weight_f',
    'material_description_f',
]
df_src = df[model_source_columns]

In [31]:
# One-hot encode the material category. dtype=int keeps the indicator columns
# as 0/1 integers on every pandas version (pandas >= 2.0 defaults to bool).
df_model = pd.get_dummies(df_src,
                          columns=["material_description_f"],
                          drop_first=False,
                          dtype=int)

# BALED EMPTY is dropped, so it is represented by both remaining indicators
# being 0. errors='ignore' keeps a batch without any BALED EMPTY record from
# raising KeyError.
df_model = df_model.drop(columns=['material_description_f_BALED EMPTY'],
                         errors='ignore')

# A batch may contain no record of a given category; add the missing indicator
# as an all-zero column so the feature selection below always finds it.
for indicator_column in ('material_description_f_BALED CARDBOARD',
                         'material_description_f_BALED FILM'):
    if indicator_column not in df_model.columns:
        df_model[indicator_column] = 0

df_model.head(2)

Unnamed: 0,device_id,id,label_f,image_f,time_delta_f,standard_weight_f,material_description_f_BALED CARDBOARD,material_description_f_BALED FILM
3,04152770-fbc7-41a8-9b61-b19eaf55dc8f,57073249-2b8d-45e3-a331-52d4d3d34c89,1,1,1471,444,1,0
9,0dae3c1a-8ebb-49ef-a0a3-8440a804a6fd,a0f7f522-423f-4d1b-aad7-f7f240d51327,1,1,3928,497,0,1


In [32]:
# Feature matrix X passed to the classifier (no target is defined here)
feature_columns = [
    'label_f',
    'image_f',
    'time_delta_f',
    'standard_weight_f',
    'material_description_f_BALED CARDBOARD',
    'material_description_f_BALED FILM',
]
X = df_model[feature_columns]

In [33]:
# Predict a status for every row of X and store it next to the records
pred_y = classifier.predict(X)
df['audit_status_pred'] = pred_y

In [34]:
# Transposed view of one device's records, including the prediction column
df.loc[df['device_id'].eq('461bfe45-a57f-4713-bde0-6388586f1507')].T

Unnamed: 0,61,62
id,c4b8224e-90d7-44b9-a3d1-25b09015e7b1,9e033e83-ecee-4629-9724-f8a31eebba7f
img_url,c4b8224e-90d7-44b9-a3d1-25b09015e7b1,9e033e83-ecee-4629-9724-f8a31eebba7f
package_date,2020-06-30 14:36:11,2020-06-30 15:06:03
device_id,461bfe45-a57f-4713-bde0-6388586f1507,461bfe45-a57f-4713-bde0-6388586f1507
material_description_from_original,CARTON,CARTON
ir_original_class,BALED CARDBOARD,BALED CARDBOARD
net_weight,373,348
unit,KG,KG
audit_status,0,0
ir_class,BALED EMPTY,BALED CARDBOARD


In [35]:
# (rows, columns) of the working frame after the prediction column was added
df.shape

(66, 21)

In [36]:
# 'A' when ir_class equals the derived material category, otherwise 'R'
ir_class_agrees = df['ir_class'] == df['material_description_f']
df['audit_status_imgs'] = np.where(ir_class_agrees, 'A', 'R')

In [37]:
# Per-column count of records whose ir_class differs from the derived category
ir_class_mismatch = df['ir_class'] != df['material_description_f']
df.loc[ir_class_mismatch].count()

id                                    18
img_url                               18
package_date                          18
device_id                             18
material_description_from_original    18
ir_original_class                     18
net_weight                            18
unit                                  18
audit_status                          18
ir_class                              18
material_description_f                18
material_description_prev             18
package_date_f                        18
time_delta_f                          18
standard_weight_f                     18
barcode_f                             18
label_f                               18
image_f                               18
audit_duplicates_f                    18
audit_duplicates_groups               18
audit_status_pred                     18
audit_status_imgs                     18
dtype: int64

## audit result of the group of duplicates

In [38]:
def find_duplicates_in_groups(temp_df):
    """Derive a per-record ``audit_status_group`` for one batch of records.

    Every record starts as 'A' only when both ``audit_status_pred`` and
    ``audit_status_imgs`` are 'A', otherwise 'R'. The rows are then scanned in
    the order given: a run of rows whose ``audit_duplicates_groups`` is 0,
    together with the first non-zero row that follows it, forms one group:

    * no 'A' among the group's predictions, or no 'A' among its image
      statuses: every record of the group becomes 'R';
    * more than one predicted 'A': records take their ``audit_status_pred``
      value up to and including the first 'A'; all later records of the
      group become 'R';
    * exactly one predicted 'A': the initial statuses are kept.

    Progress is reported with ``print``.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Must contain the columns ``device_id``, ``id``, ``audit_status_pred``,
        ``audit_status_imgs``, ``audit_duplicates_groups``, ``package_date_f``
        and ``time_delta_f``. Presumably the rows of a single device, as
        handed over by ``groupby('device_id').apply`` -- confirm with caller.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the ``audit_status_group`` column set.
        The input frame itself is not modified.
    """
    # Work on a copy so the frame owned by the caller is never mutated.
    temp_df = temp_df.copy()

    both_accepted = (temp_df['audit_status_pred'] == 'A') & (temp_df['audit_status_imgs'] == 'A')
    temp_df['audit_status_group'] = np.where(both_accepted, 'A', 'R')

    if len(temp_df) == 1:
        return temp_df

    print(temp_df['device_id'].unique(), ' -- ', len(temp_df))

    # Resolve column positions by name: a positional -1 is only correct when
    # 'audit_status_group' happens to be the last column of the frame.
    group_col = temp_df.columns.get_loc('audit_status_group')
    pred_col = temp_df.columns.get_loc('audit_status_pred')

    group_flag = False   # True while inside a run of 0 flags
    lower_pointer = 0    # position of the first row of the current run

    for i in range(len(temp_df)):

        if temp_df['audit_duplicates_groups'].iloc[i] == 0:
            if not group_flag:
                group_flag = True
                lower_pointer = i
            continue

        if not group_flag:
            # Non-zero flag outside of any run: nothing to resolve.
            continue

        # First non-zero flag after a run of zeros closes the group.
        group_flag = False
        upper_pointer = i
        group_rows = slice(lower_pointer, upper_pointer + 1)

        print('lower_pointer', lower_pointer, ': upper_pointer', upper_pointer)
        print(temp_df.iloc[group_rows][['id',
                                        'audit_status_pred',
                                        'audit_status_imgs',
                                        'package_date_f',
                                        'time_delta_f']])

        audit_pre = list(temp_df.iloc[group_rows]['audit_status_pred'])
        audit_img = list(temp_df.iloc[group_rows]['audit_status_imgs'])

        if 'A' not in audit_pre or 'A' not in audit_img:
            print('Reject all the records..')
            temp_df.iloc[group_rows, group_col] = 'R'
            continue

        if audit_pre.count('A') <= 1:
            print('\n>>> The record is valid <<<\n')
            # TODO: think through the logic for this scenario
            # (see device 1f3a5d6e-13d2-45a9-866a-bbe337fa6114).
            continue

        print('\n --- Update the record ---')

        for j in range(lower_pointer, upper_pointer + 1):
            predicted = temp_df.iloc[j, pred_col]
            temp_df.iloc[j, group_col] = predicted

            if predicted == 'A':
                # Keep the first predicted 'A'; reject the rest of the group.
                print('Just accept one record...')
                temp_df.iloc[j + 1:upper_pointer + 1, group_col] = 'R'
                break

        print(temp_df.iloc[group_rows][['id', 'audit_status_pred',
                                        'audit_status_group']])

    # NOTE(review): a run of 0 flags that reaches the last row without a
    # closing non-zero row is left with its initial statuses -- confirm that
    # this is intended.
    return temp_df

In [39]:
#df = df[df['device_id'] == 'd0e3f2d5-e865-40f4-897e-9b517bd394a6']

In [40]:
# Resolve the duplicate groups device by device.
# group_keys=False keeps the original row index on the result; on
# pandas >= 2.0 the default would prepend device_id as an extra index level.
df_ = df.groupby(by=['device_id'], group_keys=False).apply(find_duplicates_in_groups)

['19c04220-530e-4391-9033-aaeba9c18c6a']  --  2
['1f3a5d6e-13d2-45a9-866a-bbe337fa6114']  --  2
['36464b5e-ae31-4de9-a74d-36d6c3b117f1']  --  2
['3e9a4460-7218-40f8-9572-e20008795361']  --  2
['461bfe45-a57f-4713-bde0-6388586f1507']  --  2
['7849428d-2323-408b-81a0-9048d573841e']  --  2
['7b111958-f1d3-4b15-a169-be6b138bcba0']  --  2
['817631f2-66c6-47c5-82a7-90ab8e023dae']  --  4
['84fde9c4-43fc-4e65-9ed4-52f7d50d8e4c']  --  2
['9f76a584-b71c-4e26-a2fc-1938618ea34c']  --  2
['a6fe2bcc-c031-439b-a89a-604b3e979aa8']  --  4
['ae37f11a-3b99-4f63-9f83-e90b7980d244']  --  2
['bfd582e7-322b-465f-83c8-3e56b0085e31']  --  2
['d1e3e4e0-00dc-4e59-9c50-d0cdb3dee2e0']  --  5
['d6a92243-6e95-4678-ac07-15b51ab71546']  --  5
['f9165f5d-335c-40d6-8ee3-9e6ef7cc4bbf']  --  2


In [45]:
# Transposed view of one device's records after the group resolution
df_.loc[df_['device_id'].eq('a6fe2bcc-c031-439b-a89a-604b3e979aa8')].T

Unnamed: 0,146,147,148,149
id,46e4e988-50ee-407c-a388-58cafdc2a7c7,529fbdd3-52ad-41b9-8a96-996b97afae43,59c02d31-2d05-4861-b81d-1ea6cdb70a81,603e53ac-f377-485b-aab8-d7c4550a68b4
img_url,46e4e988-50ee-407c-a388-58cafdc2a7c7,529fbdd3-52ad-41b9-8a96-996b97afae43,59c02d31-2d05-4861-b81d-1ea6cdb70a81,603e53ac-f377-485b-aab8-d7c4550a68b4
package_date,2020-06-30 10:51:18,2020-06-30 14:34:16,2020-06-30 16:16:45,2020-06-30 16:58:54
device_id,a6fe2bcc-c031-439b-a89a-604b3e979aa8,a6fe2bcc-c031-439b-a89a-604b3e979aa8,a6fe2bcc-c031-439b-a89a-604b3e979aa8,a6fe2bcc-c031-439b-a89a-604b3e979aa8
material_description_from_original,CARTON,CARTON,CARTON,PLAYO
ir_original_class,BALED CARDBOARD,BALED CARDBOARD,BALED CARDBOARD,BALED FILM
net_weight,376,373,390,280
unit,KG,KG,KG,KG
audit_status,0,0,0,0
ir_class,BALED EMPTY,BALED CARDBOARD,BALED CARDBOARD,BALED FILM


In [46]:
# Number of records per final group status
group_status_counts = df_['audit_status_group'].value_counts()
group_status_counts

A    43
R    23
Name: audit_status_group, dtype: int64

In [47]:
# All records whose final group status is 'R'
df_.loc[df_['audit_status_group'].eq('R')]

Unnamed: 0,id,img_url,package_date,device_id,material_description_from_original,ir_original_class,net_weight,unit,audit_status,ir_class,...,time_delta_f,standard_weight_f,barcode_f,label_f,image_f,audit_duplicates_f,audit_duplicates_groups,audit_status_pred,audit_status_imgs,audit_status_group
9,a0f7f522-423f-4d1b-aad7-f7f240d51327,a0f7f522-423f-4d1b-aad7-f7f240d51327,2020-06-30 14:35:43,0dae3c1a-8ebb-49ef-a0a3-8440a804a6fd,SHRINK WRAP,BALED SHRINK,1095.0,LB,0,BALED CARDBOARD,...,3928,497,KROH10022006300B109510950165,1,1,1,1,A,R,R
27,23afd9e4-e92a-4ba2-bc7b-fbd7f235ad18,23afd9e4-e92a-4ba2-bc7b-fbd7f235ad18,2020-06-30 08:07:39,1f3a5d6e-13d2-45a9-866a-bbe337fa6114,CARTON,BALED CARDBOARD,460.0,KG,0,BALED EMPTY,...,1001,460,SORM10012006300A046004600101,1,1,1,1,A,R,R
42,de455474-d0c9-45f0-9198-ac25aac5c447,de455474-d0c9-45f0-9198-ac25aac5c447,2020-06-30 13:21:26,36464b5e-ae31-4de9-a74d-36d6c3b117f1,CARTON,BALED CARDBOARD,438.0,KG,0,BALED EMPTY,...,1196,438,WALM10162006300A043804381220,1,1,1,1,A,R,R
43,4e0fdc3c-2165-428b-8996-76726a27077b,4e0fdc3c-2165-428b-8996-76726a27077b,2020-06-30 14:10:02,36464b5e-ae31-4de9-a74d-36d6c3b117f1,PLAYO,BALED FILM,189.0,KG,0,BALED CARDBOARD,...,48,189,WALM10162006300B018901891221,1,1,1,1,A,R,R
61,c4b8224e-90d7-44b9-a3d1-25b09015e7b1,c4b8224e-90d7-44b9-a3d1-25b09015e7b1,2020-06-30 14:36:11,461bfe45-a57f-4713-bde0-6388586f1507,CARTON,BALED CARDBOARD,373.0,KG,0,BALED EMPTY,...,449,373,SORM10032006300A037303730136,1,1,1,1,A,R,R
76,ac231700-8b76-47fe-b5dd-e0f096a6951d,ac231700-8b76-47fe-b5dd-e0f096a6951d,2020-06-30 13:03:21,590f6785-9d36-47d6-81ea-fabb801cf0f5,0,0,450.0,KG,0,BALED EMPTY,...,1385,450,SORM10062006300A045004500022,1,1,1,1,R,A,R
102,61370301-c58a-4a58-af36-c31a85631f99,0,2020-06-30 10:20:00,7849428d-2323-408b-81a0-9048d573841e,FILM,BALED FILM,439.0,LB,0,0,...,1057,199,HFCC10012006300A043904390450,1,0,1,1,R,R,R
103,dba58c5b-7131-42aa-adbc-e46f388add3c,0,2020-06-30 11:44:54,7849428d-2323-408b-81a0-9048d573841e,FILM,BALED FILM,397.0,LB,0,0,...,84,180,HFCC10012006300A039703970451,1,0,1,1,R,R,R
125,0c069cf7-e1bb-479c-9d66-72f8dfef23c2,0c069cf7-e1bb-479c-9d66-72f8dfef23c2,2020-06-30 15:39:18,8a1bf975-8106-4833-bde4-1d1fea9d2897,CARTON,BALED CARDBOARD,442.0,KG,0,BALED EMPTY,...,435,442,HEMM10032006300A044204422165,1,1,1,1,A,R,R
134,7bd97ad7-1122-43b9-b081-8423025f89ae,7bd97ad7-1122-43b9-b081-8423025f89ae,2020-06-30 12:11:50,9f76a584-b71c-4e26-a2fc-1938618ea34c,0,0,443.0,KG,0,BALED EMPTY,...,5924,443,SORM10082006300A044304430013,1,1,1,1,R,A,R


In [57]:
# Per-column count of records that ir_class labels as BALED EMPTY and that end
# up rejected, although the model predicted 'A' and the derived material
# category is cardboard or film.
ir_says_empty = df_['ir_class'] == 'BALED EMPTY'
group_rejected = df_['audit_status_group'] == 'R'
model_accepted = df_['audit_status_pred'] == 'A'
has_material = ((df_['material_description_f'] == 'BALED CARDBOARD')
                | (df_['material_description_f'] == 'BALED FILM'))

df_[ir_says_empty & group_rejected & model_accepted & has_material].count()

id                                    10
img_url                               10
package_date                          10
device_id                             10
material_description_from_original    10
ir_original_class                     10
net_weight                            10
unit                                  10
audit_status                          10
ir_class                              10
material_description_f                10
material_description_prev             10
package_date_f                        10
time_delta_f                          10
standard_weight_f                     10
barcode_f                             10
label_f                               10
image_f                               10
audit_duplicates_f                    10
audit_duplicates_groups               10
audit_status_pred                     10
audit_status_imgs                     10
audit_status_group                    10
dtype: int64