# Import libraries

In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy as sa

%matplotlib inline

# Get the Dataset

In [2]:
sql_con_str = 'mysql+mysqldb://mercenary:Flxi8571@40.69.142.165:3306/Sustayn'  # NST02 / PROD
ENGINE = sa.create_engine(sql_con_str, pool_recycle = 3600)

# Stract the data from the Database
SQL = """
SELECT * 
FROM sustayn.v_ml_baler_productor_history
WHERE audit_date IS NULL
UNION
SELECT DeviceType, Null, Null, max(package_date), device_id, Null, Null, Null, Null, NUll, Null, NUll, 'KG', 'A', Null, Null
FROM sustayn.v_ml_baler_productor_history
WHERE audit_date IS NOT NULL
GROUP BY DeviceType, device_id;
"""

# read data from db to DataFrame
df = pd.read_sql_query(SQL, ENGINE)
#df.to_csv('data_frame-predict.csv')
df.shape

(262, 16)

# Analyse Dataset

In [52]:
df = pd.read_csv('data_frame-predict.csv')

In [53]:
# filling the missing values
df.fillna(0, inplace=True)

In [54]:
df.head(3).T

Unnamed: 0,0,1,2
Unnamed: 0,0,1,2
DeviceType,Baler,Baler,Baler
id,014dac81-12ab-499f-9c69-adaa43409e44,06689642-d633-4d30-a7a0-6e7711190362,09002e5e-6596-4dc4-9e3b-9f86d155f4cd
img_url,014dac81-12ab-499f-9c69-adaa43409e44,06689642-d633-4d30-a7a0-6e7711190362,09002e5e-6596-4dc4-9e3b-9f86d155f4cd
package_date,6/19/2020 20:53,6/26/2020 15:39,6/26/2020 14:50
device_id,b9e48ef9-1a7b-41c2-bad4-3c70ac2b14e9,175ded97-27a5-4c71-b084-2dccaa759b8c,fc7dacf9-8960-4a88-b4a9-02c36b6df9e0
DeviceCode,LEYM1010,SYSC1002,HEMM1008
package_id,1154,86,997
barcode,0,SYSC10022006260A064006400086,HEMM10082006260A023102310997
material_type,A,A,A


In [55]:
# sort dataframe by device_code and package_date
df.sort_values(by=['device_id','package_date'], ascending=[True, True], inplace=True, )

In [56]:
# reset index with the new order
df.reset_index(inplace=True, drop=True)

In [57]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,DeviceType,id,img_url,package_date,device_id,DeviceCode,package_id,barcode,material_type,material_description_from_original,ir_original_class,net_weight,unit,audit_status,audit_date,audit_userid
0,85,Baler,0,0,6/15/2019 9:25,02467072-ffd6-432e-ab02-97e477ee3f94,0,0.0,0,0,0,0,0.0,KG,A,0.0,0.0
1,86,Baler,0,0,5/11/2020 18:18,039309c9-b9e6-4ca9-ada9-f52937087a79,0,0.0,0,0,0,0,0.0,KG,A,0.0,0.0
2,87,Baler,0,0,6/25/2020 13:33,04152770-fbc7-41a8-9b61-b19eaf55dc8f,0,0.0,0,0,0,0,0.0,KG,A,0.0,0.0


# Create extra labels

## material_description_from_original  

In [58]:
df['material_description_from_original'].value_counts()

0            178
CARTON        54
OCC BALED     10
FILM           9
PLAYO          5
OCC            4
LLD FILM       2
Name: material_description_from_original, dtype: int64

In [59]:
df['material_description_from_original'].unique()

array([0, 'PLAYO', 'CARTON', 'OCC', 'OCC BALED', 'FILM', 'LLD FILM'],
      dtype=object)

In [60]:
# Redefine MaterialDescription to group in 3 categories fewer categories
def material_description(material):
    
    if 'OCC' in material:
        return 'BALED CARDBOARD'
    elif 'CARTON' in material:
        return 'BALED CARDBOARD'
    elif 'CARDBOARD' in material:
        return 'BALED CARDBOARD'
        
    elif 'FILM' in material:
        return 'BALED FILM'
    elif 'LDP' in material:
        return 'BALED FILM'
    elif 'PLAYO' in material:
        return 'BALED FILM'
    elif material == 'PLMX':
        return 'BALED FILM'
    elif material == 'SHRINK WRAP':
        return 'BALED FILM'
    elif material == 'BOMA':
        return 'BALED FILM'
    
    else:
        return 'BALED EMPTY'

In [61]:
df['material_description_f'] = df['material_description_from_original'].astype(str).str.upper().apply(material_description)

In [62]:
df['material_description_f'].value_counts()

BALED EMPTY        178
BALED CARDBOARD     68
BALED FILM          16
Name: material_description_f, dtype: int64

## package_date

In [63]:
# create a new field from the package_date but truncating the secs
df['package_date']   = df['package_date'].astype('datetime64')
df['package_date_f'] = df['package_date'].dt.strftime("%m/%d/%Y %H:%M").astype('datetime64')

In [64]:
# Creata a time interval delta
def diff_func(df):
    return abs(df.diff().dt.total_seconds() / 60)

# Now call the function using .apply
df['time_delta_f'] = df.groupby(['device_id'])['package_date'].apply(diff_func)

# Fill in any NaN values
df['time_delta_f'].fillna('0', inplace=True)

# Convert the output into a float
df['time_delta_f'] = pd.to_numeric(df['time_delta_f']).astype(int)

## standard_weight

In [65]:
df['unit'].value_counts()

KG    237
LB     25
Name: unit, dtype: int64

In [66]:
# Standardize all the weights by making everything KGs
def standard_weight(row):
    if row['unit'] == 'LB':
        return round(row['net_weight'] * 0.453592 , 0)
    else:
        return row['net_weight']
    
df['standard_weight_f'] = df.apply(standard_weight, axis =1).astype(int)

In [67]:
df.loc[df['unit'] == 'LB' , ['unit','net_weight','standard_weight_f']].head(3)

Unnamed: 0,unit,net_weight,standard_weight_f
15,LB,1035.0,469
17,LB,642.0,291
18,LB,628.0,285


In [68]:
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0,85,86,87,28,83,88,23,89,90,91
DeviceType,Baler,Baler,Baler,Baler,Baler,Baler,Baler,Baler,Baler,Baler
id,0,0,0,56016431-4754-4155-a88e-34025a996b0a,fe5cddc7-303e-4a04-a909-337cc23e9f23,0,48fa2e14-0fde-4b83-99c6-592287781f1b,0,0,0
img_url,0,0,0,56016431-4754-4155-a88e-34025a996b0a,fe5cddc7-303e-4a04-a909-337cc23e9f23,0,48fa2e14-0fde-4b83-99c6-592287781f1b,0,0,0
package_date,2019-06-15 09:25:00,2020-05-11 18:18:00,2020-06-25 13:33:00,2020-06-26 10:39:00,2020-06-26 09:50:00,2020-06-25 09:28:00,2020-06-26 11:15:00,2019-05-31 15:34:00,2020-04-28 17:52:00,2020-06-24 10:17:00
device_id,02467072-ffd6-432e-ab02-97e477ee3f94,039309c9-b9e6-4ca9-ada9-f52937087a79,04152770-fbc7-41a8-9b61-b19eaf55dc8f,04152770-fbc7-41a8-9b61-b19eaf55dc8f,04152770-fbc7-41a8-9b61-b19eaf55dc8f,09d09144-5db2-4461-8706-2f74b06e52b8,09d09144-5db2-4461-8706-2f74b06e52b8,0ae4190d-6acb-463d-857d-aab765f4554b,0cf08389-c413-46b8-b260-2b2d2d94778d,0d4af338-80b8-49eb-a0e4-dc074deebc3b
DeviceCode,0,0,0,WALM1018,WALM1018,0,LEYM1008,0,0,0
package_id,0,0,0,604,603,0,539,0,0,0
barcode,0,0,0,WALM10182006260B011201120604,WALM10182006260A035903590603,0,0,0,0,0
material_type,0,0,0,B,A,0,A,0,0,0


## barcode to label

In [69]:
# How many barcode are duplicated?

df['barcode'].value_counts()[0:5]

0                               203
WALM10182006260A035903590603      1
WALM10042006260A041204120991      1
SYSC10022006260A063706370082      1
AMZS10172006260A083008300017      1
Name: barcode, dtype: int64

In [70]:
# Create an aditional field that contains if the bale has a barcode or not.
df['barcode_f'] = np.where(df['barcode'] == 'null', 0, df['barcode'])

In [71]:
df['label_f'] = np.where(df['barcode_f'] == 0, 0, 1)

In [72]:
df[df['barcode'] == 'null']['label_f'].unique()

array([], dtype=int32)

In [73]:
df['label_f'].value_counts()

0    203
1     59
Name: label_f, dtype: int64

## img_url to image

In [74]:
# create an aditional field that contains if the bale has a image or not.
df['img_url'].value_counts()

0                                       187
34676e99-912b-4397-a176-802d63cb2961      1
283793d6-223e-462b-9561-d84ef1eb48c7      1
8d401286-34c5-4370-894a-fdacfcf6c7d8      1
c1a5109b-18a7-49db-844a-c26ac257ebcd      1
                                       ... 
8f2392f9-898d-4f56-ade6-8a2cb79f7146      1
e05fb96c-fbc9-4bbb-ac50-6d37cd61d57b      1
d7a61856-787e-47d9-b161-ceca6b188750      1
c84b91ad-8529-443a-b165-a86161bfb163      1
7e45024a-b0c7-4d10-a78c-a77f4ac21159      1
Name: img_url, Length: 76, dtype: int64

In [75]:
df['image_f'] = np.where(df['img_url'] == 0, 0, 1)

In [76]:
df['image_f'].value_counts()

0    187
1     75
Name: image_f, dtype: int64

## duplicates in manual audit

In [77]:
# Duplicates are marked from zero to one
df['audit_duplicates_f'] = df.groupby(by=['device_id'])['time_delta_f'].shift(-1, fill_value=60)
df['audit_duplicates_f'] = np.where(df['audit_duplicates_f'] <= 5, 0, 1)

In [78]:
df[df['audit_duplicates_f'] == 0].head(10).T

Unnamed: 0,17,19,20,21,25,90,99,111,112,113
Unnamed: 0,71,20,8,64,62,34,76,13,70,36
DeviceType,Baler,Baler,Baler,Baler,Baler,Baler,Baler,Baler,Baler,Baler
id,c84b91ad-8529-443a-b165-a86161bfb163,3dd68889-d79b-4081-b1f5-95096982efac,1c817bdb-3433-4d74-b317-fb726cefdfe5,b8c19c00-a748-45db-88f9-8be71b39dff0,b4721650-8585-4200-b22b-1741f2b81035,6b83edb2-d5f3-4409-8286-681e2eaf437a,e942fbed-bbcf-4484-8f2e-5c4e4c51eb27,2dd67cc1-9323-4dc1-af7a-29cda1a8b148,c6f106eb-7546-4021-9a6c-555aafc16941,7318b1cc-139d-48b0-884d-d0522d98b439
img_url,c84b91ad-8529-443a-b165-a86161bfb163,3dd68889-d79b-4081-b1f5-95096982efac,1c817bdb-3433-4d74-b317-fb726cefdfe5,b8c19c00-a748-45db-88f9-8be71b39dff0,b4721650-8585-4200-b22b-1741f2b81035,6b83edb2-d5f3-4409-8286-681e2eaf437a,e942fbed-bbcf-4484-8f2e-5c4e4c51eb27,0,0,0
package_date,2020-06-26 15:01:00,2020-06-26 15:20:00,2020-06-26 15:24:00,2020-06-26 15:24:00,2020-06-26 15:48:00,2020-06-26 10:28:00,2020-06-26 09:38:00,2020-06-26 09:52:00,2020-06-26 09:53:00,2020-06-26 09:54:00
device_id,175ded97-27a5-4c71-b084-2dccaa759b8c,175ded97-27a5-4c71-b084-2dccaa759b8c,175ded97-27a5-4c71-b084-2dccaa759b8c,175ded97-27a5-4c71-b084-2dccaa759b8c,175ded97-27a5-4c71-b084-2dccaa759b8c,668c00d2-b89d-49a7-b8f6-685f089ee329,6ae02536-9609-4668-8eca-7fd9b98135ca,7849428d-2323-408b-81a0-9048d573841e,7849428d-2323-408b-81a0-9048d573841e,7849428d-2323-408b-81a0-9048d573841e
DeviceCode,SYSC1002,SYSC1002,SYSC1002,SYSC1002,SYSC1002,FETX1003,LEYM1003,HFCC1001,HFCC1001,HFCC1001
package_id,79,81,82,83,87,222,2542,438,439,441
barcode,SYSC10022006260A064206420079,SYSC10022006260A062306230081,SYSC10022006260A063706370082,SYSC10022006260A062206220083,SYSC10022006260A061806180087,FETX10032006260A077007700222,0,HFCC10012006260A025002500438,HFCC10012006260A025002500439,HFCC10012006260A025002500441
material_type,A,A,A,A,A,A,A,A,A,A


In [79]:
df[df['device_id'] == '668c00d2-b89d-49a7-b8f6-685f089ee329'].T

Unnamed: 0,89,90,91
Unnamed: 0,153,34,73
DeviceType,Baler,Baler,Baler
id,0,6b83edb2-d5f3-4409-8286-681e2eaf437a,d7a61856-787e-47d9-b161-ceca6b188750
img_url,0,6b83edb2-d5f3-4409-8286-681e2eaf437a,d7a61856-787e-47d9-b161-ceca6b188750
package_date,2020-06-24 21:54:00,2020-06-26 10:28:00,2020-06-26 10:28:00
device_id,668c00d2-b89d-49a7-b8f6-685f089ee329,668c00d2-b89d-49a7-b8f6-685f089ee329,668c00d2-b89d-49a7-b8f6-685f089ee329
DeviceCode,0,FETX1003,FETX1003
package_id,0,222,223
barcode,0,FETX10032006260A077007700222,FETX10032006260A077507750223
material_type,0,A,A


## mark group of duplicates

In [80]:
df['duplicates'] = df.groupby(by=['device_id'])['audit_duplicates_f'].diff(1).fillna(0)
df['duplicates'].unique()

array([ 0., -1.,  1.])

In [81]:
df[df['device_id'] == '668c00d2-b89d-49a7-b8f6-685f089ee329'].T

Unnamed: 0,89,90,91
Unnamed: 0,153,34,73
DeviceType,Baler,Baler,Baler
id,0,6b83edb2-d5f3-4409-8286-681e2eaf437a,d7a61856-787e-47d9-b161-ceca6b188750
img_url,0,6b83edb2-d5f3-4409-8286-681e2eaf437a,d7a61856-787e-47d9-b161-ceca6b188750
package_date,2020-06-24 21:54:00,2020-06-26 10:28:00,2020-06-26 10:28:00
device_id,668c00d2-b89d-49a7-b8f6-685f089ee329,668c00d2-b89d-49a7-b8f6-685f089ee329,668c00d2-b89d-49a7-b8f6-685f089ee329
DeviceCode,0,FETX1003,FETX1003
package_id,0,222,223
barcode,0,FETX10032006260A077007700222,FETX10032006260A077507750223
material_type,0,A,A


In [82]:
df['duplicates'] = np.where(df['audit_duplicates_f'] == 0, 1, df['duplicates']).astype(int)
df['duplicates'].unique()

array([0, 1])

In [83]:
df[df['device_id'] == '668c00d2-b89d-49a7-b8f6-685f089ee329'].T

Unnamed: 0,89,90,91
Unnamed: 0,153,34,73
DeviceType,Baler,Baler,Baler
id,0,6b83edb2-d5f3-4409-8286-681e2eaf437a,d7a61856-787e-47d9-b161-ceca6b188750
img_url,0,6b83edb2-d5f3-4409-8286-681e2eaf437a,d7a61856-787e-47d9-b161-ceca6b188750
package_date,2020-06-24 21:54:00,2020-06-26 10:28:00,2020-06-26 10:28:00
device_id,668c00d2-b89d-49a7-b8f6-685f089ee329,668c00d2-b89d-49a7-b8f6-685f089ee329,668c00d2-b89d-49a7-b8f6-685f089ee329
DeviceCode,0,FETX1003,FETX1003
package_id,0,222,223
barcode,0,FETX10032006260A077007700222,FETX10032006260A077507750223
material_type,0,A,A


In [84]:
df.drop(df[df['audit_status'] == 'A'].index, inplace=True)

In [85]:
df.shape

(85, 26)

## audit result of the group of duplicates

In [86]:
def find_reperead_groups(temp_df):
    print(temp_df['device_id'].unique())
    
    group_flag = False

    lower_pointer = 0
    upper_pointer = 0

    for i in range(len(temp_df)):

        if temp_df['audit_duplicates_f'].iloc[i] == 0:

            if group_flag == False:
                group_flag = True
                lower_pointer = i

        else:
            if group_flag == True:
                group_flag = False
                upper_pointer = i
                
                audit_lst = list(temp_df[lower_pointer:upper_pointer+1]['audit_status'])

                print('lower_pointer', lower_pointer, ': upper_pointer', upper_pointer)
                print(temp_df[lower_pointer:upper_pointer+1][['id','audit_status','package_date_f','time_delta_f']])
                        
    return temp_df

In [87]:
df = df.groupby(by=['device_id']).apply(find_reperead_groups)

['04152770-fbc7-41a8-9b61-b19eaf55dc8f']
['09d09144-5db2-4461-8706-2f74b06e52b8']
['15da27b8-81d2-4944-8479-b1a2445c0d2a']
['175ded97-27a5-4c71-b084-2dccaa759b8c']
lower_pointer 0 : upper_pointer 1
                                      id audit_status      package_date_f  \
17  c84b91ad-8529-443a-b165-a86161bfb163            0 2020-06-26 15:01:00   
18  4963a19c-3968-43b2-a19e-bc4055a7cca5            0 2020-06-26 15:06:00   

    time_delta_f  
17          2183  
18             5  
lower_pointer 2 : upper_pointer 5
                                      id audit_status      package_date_f  \
19  3dd68889-d79b-4081-b1f5-95096982efac            0 2020-06-26 15:20:00   
20  1c817bdb-3433-4d74-b317-fb726cefdfe5            0 2020-06-26 15:24:00   
21  b8c19c00-a748-45db-88f9-8be71b39dff0            0 2020-06-26 15:24:00   
22  b1926e91-ea27-4281-9aac-3f3c6324fdc8            0 2020-06-26 15:26:00   

    time_delta_f  
19            14  
20             4  
21             0  
22             2 

In [88]:
df.head(3).T

Unnamed: 0,3,4,6
Unnamed: 0,28,83,23
DeviceType,Baler,Baler,Baler
id,56016431-4754-4155-a88e-34025a996b0a,fe5cddc7-303e-4a04-a909-337cc23e9f23,48fa2e14-0fde-4b83-99c6-592287781f1b
img_url,56016431-4754-4155-a88e-34025a996b0a,fe5cddc7-303e-4a04-a909-337cc23e9f23,48fa2e14-0fde-4b83-99c6-592287781f1b
package_date,2020-06-26 10:39:00,2020-06-26 09:50:00,2020-06-26 11:15:00
device_id,04152770-fbc7-41a8-9b61-b19eaf55dc8f,04152770-fbc7-41a8-9b61-b19eaf55dc8f,09d09144-5db2-4461-8706-2f74b06e52b8
DeviceCode,WALM1018,WALM1018,LEYM1008
package_id,604,603,539
barcode,WALM10182006260B011201120604,WALM10182006260A035903590603,0
material_type,B,A,A


In [92]:
df['ir_original_class'] = np.where(df['ir_original_class'] == 'None', 'BALED EMPTY', df['ir_original_class'])
df['ir_original_class'] = np.where(df['ir_original_class'] == 0, 'BALED EMPTY', df['ir_original_class'])
df['ir_original_class'] = np.where(df['ir_original_class'] == 'BALED PLASTIC', 'BALED EMPTY', df['ir_original_class'])
df['ir_original_class'] = np.where(df['ir_original_class'] == 'BALED FOAM', 'BALED EMPTY', df['ir_original_class'])
df['ir_original_class'] = np.where(df['ir_original_class'] == 'BALED PET', 'BALED EMPTY', df['ir_original_class'])
df['ir_original_class'] = np.where(df['ir_original_class'] == 'BALED SHRINK', 'BALED EMPTY', df['ir_original_class'])
df['ir_original_class'] = np.where(df['ir_original_class'] == 'BALED ALUMINUM', 'BALED EMPTY', df['ir_original_class'])

## Feature selection

In [93]:
df_src = df[['device_id',
             'id',
             'label_f',
             'image_f',
             'time_delta_f',
             'standard_weight_f',
             'material_description_f',
             'ir_original_class',
             'duplicates']]

In [94]:
df_model = pd.get_dummies(df_src, columns=["material_description_f","ir_original_class"], drop_first=False)

In [95]:
df_model.columns

Index(['device_id', 'id', 'label_f', 'image_f', 'time_delta_f',
       'standard_weight_f', 'duplicates',
       'material_description_f_BALED CARDBOARD',
       'material_description_f_BALED EMPTY',
       'material_description_f_BALED FILM',
       'ir_original_class_BALED CARDBOARD', 'ir_original_class_BALED EMPTY',
       'ir_original_class_BALED FILM'],
      dtype='object')

## One Hot Encoding

In [96]:
#df_model.drop(columns=['material_description_f_BALED EMPTY'], inplace=True)
df_model.drop(columns=['material_description_f_BALED EMPTY','ir_original_class_BALED EMPTY'], inplace=True)

In [97]:
#df_model[(df_model['audit_status_f'] == 'A') & (df_model['material_description_f_BALED CARDBOARD'] == 1)].head(21).T

In [100]:
df_model.to_csv('data_frame_to_model.csv', index_label=False)

# Split data in Training and Validation Sets

In [98]:
# Define the X and Y variables
X = df_model[['label_f',
              'image_f',
              'time_delta_w',
              'standard_weight_w',
              'material_description_f_BALED CARDBOARD',
              'material_description_f_BALED FILM',
              'ir_original_class_BALED CARDBOARD',
              'ir_original_class_BALED FILM'
             #'duplicates'
             ]]

y = df_model['audit_status_f'] # binary output

KeyError: "['standard_weight_w', 'time_delta_w'] not in index"

In [None]:
import pickle

In [None]:
pickle_out = open('baler_classifier.pkl','wb')
pickle.dump(rfc2, pickle_out)
pickle_out.close()