In [1]:
import re
import json
import pprint

In [45]:
crowley_details = {"order_number": None,
                   "trip_number": None,
                   "division": None,
                   "load_rate_confirmation": {
                       "dispatch_phone_number": None,
                       "booking_number": None,
                       "equipment_type": None,
                       "weight": None,
                       "pieces_pallets": None,
                       "commodity": None,
                       "mileage": None
                   },
                   "sent_by_contact": {
                       "sent_by": list(),
                       "contact_info": list()
                   },
                   "payment_processing": {
                       "total": None,
                       "line_haul": None,
                       "status_line": None
                   },
                   "live_load": list(),
                   "live_unload": list()
}

In [46]:
def data_clean(text):
    try:
        strip_po = [i.strip() for i in text.split('\n')]
        remove_space = [i for i in strip_po if i not in [' ', '']]
        remove_space_between = [i.split('  ') for i in remove_space]
        cleaned_data = list()
        count = 0
        for i in remove_space_between:
            cleaned_data.append([])
            for j in i:
                if j not in ['']:
                    cleaned_data[count].append(j.strip())
            count += 1
        
        return cleaned_data

    except Exception as e:
        print("kvt data clean exception")
        print(e)
        return None

In [47]:
f = open("crowley2.txt", "r")
tmp = f.read()

In [48]:
tmp

'                                                                                                                                                                                      Sent By:\n                                                                            CROWLEY LOGISTICS, INC.                                                                                Ryan Malloy\n                                                                              US Transportation and Distribution\n                                                                                                                                                                       Ryan.Malloy@crowley.com\n                                                                       9487 Regency Square Blvd, Jacksonville, FL 32225\n                                                                                                                                                                            11/03/2020 10:46

In [49]:
tmp2 = data_clean(tmp)

In [50]:
pprint.pprint(tmp2)

[['Sent By:'],
 ['CROWLEY LOGISTICS, INC.', 'Ryan Malloy'],
 ['US Transportation and Distribution'],
 ['Ryan.Malloy@crowley.com'],
 ['9487 Regency Square Blvd, Jacksonville, FL 32225'],
 ['11/03/2020 10:46 AM'],
 ['Order Number 2897521', 'Division:', 'CDTS'],
 ['LOAD RATE CONFIRMATION', 'Trip Number 4309299'],
 ['Please call 310-732-6550 for dispatch and reference order number 2897521.'],
 ['Carrier:', 'TEMP', 'Truck Pay Total:', '$1,934.00'],
 ['Booking:', '808210080', 'Line Haul Rate:', '$1,934.00'],
 ['Voyage:', 'Fuel Surcharge:', '$0.00'],
 ['Accessorial(s):', '$0.00'],
 ['Payment Processing:'],
 ['Equipment Type:', '53AIR'],
 ['PLEASE DO NOT SUBMIT INVOICES FOR PAYMENT'],
 ['Weight:',
  '5000',
  'Payments will be automatically processed via ERS based on Truck Pay Total '
  '(see above), standard/negotiated payment'],
 ['Pieces/Pallets:',
  '2.00',
  'terms, and receipt and processing of documentation requirements.'],
 ['-Once Trip Segment is closed, payment terms (e.g., Net 30 da

In [51]:
pprint.pprint(crowley_details)

{'division': None,
 'live_load': [],
 'live_unload': [],
 'load_rate_confirmation': {'booking_number': None,
                            'commodity': None,
                            'dispatch_phone_number': None,
                            'equipment_type': None,
                            'mileage': None,
                            'pieces_pallets': None,
                            'weight': None},
 'order_number': None,
 'payment_processing': {'line_haul': None, 'status_line': None, 'total': None},
 'sent_by_contact': {'contact_info': [], 'sent_by': []},
 'trip_number': None}


In [52]:
regex_dv = r'^[A-Z]{4}'
compiled_re = re.compile(regex_dv)

In [53]:
dv_good_1 = 'ABCD'
compiled_re.findall(dv_good_1)

['ABCD']

In [54]:
dv_bad_1 = 'abcd'
compiled_re.findall(dv_bad_1)

[]

In [55]:
dv_bad_2 = 'ABCDEFSDS'
compiled_re.findall(dv_bad_2)

['ABCD']

In [56]:
regex_on = r'\d{7}'
compiled_re = re.compile(regex_on)

In [57]:
on_good_1 = '1234567'
compiled_re.findall(on_good_1)

['1234567']

In [58]:
on_bad_1 = '123'
compiled_re.findall(on_bad_1)

[]

In [59]:
def get_lines_between(origtmp, str1, str2):
    start_index = 0
    stop_index = 0
    for ele_nos, ele in enumerate(origtmp):
        for stchar in ele:
            if str1 in stchar.lower():
                start_index = ele_nos

            if str2 in stchar.lower():
                stop_index = ele_nos
                return origtmp[start_index:stop_index+1]
    return([])

In [60]:
get_lines_between(tmp2, "order number", "division")

[['Order Number 2897521', 'Division:', 'CDTS']]

In [61]:
def get_order_number(origtmp):
    '''
    Derive Order Number
    '''
    
    #Get The Order Code
    regex_on = r'\d{7}'
    compiled_re = re.compile(regex_on)
    
    #Get the relevant lines for search
    get_lines = get_lines_between(tmp2, "order number", "division")
    
    #Loop through the subset of lines and apply regex
    for line in get_lines:
        for wordstr in line:
            on = compiled_re.findall(wordstr)
            if len(on) > 0:
                return(on[0])
    return('')

In [62]:
get_order_number(tmp2)

'2897521'

In [63]:
def get_division(origtmp):
    '''
    Derive Division
    '''
    
    #Get The Division Code or whatever
    regex_dv = r'^[A-Z]{4}'
    compiled_re = re.compile(regex_dv)
    
    #Get the relevant lines for search
    get_lines = get_lines_between(tmp2, "order number", "division")
    
    #Loop through the subset of lines and apply regex
    for line in get_lines:
        for wordstr in line:
            dv = compiled_re.findall(wordstr)
            if len(dv) > 0:
                return(dv[0])
    return('')

In [64]:
get_division(tmp2)

'CDTS'

In [65]:
print(get_division.__doc__)


    Derive Division
    


In [66]:
order_number_only = get_order_number(tmp2)
print(order_number_only)

2897521


In [67]:
division_only = get_division(tmp2)
print(division_only)

CDTS


In [68]:
def get_order_division(origtmp):
    order_number_only = get_order_number(tmp2)
    division_only = get_division(tmp2)
    
    crowley_details["order_number"] = order_number_only
    crowley_details["division"] = division_only

In [69]:
get_order_division(tmp2)

In [70]:
pprint.pprint(crowley_details)

{'division': 'CDTS',
 'live_load': [],
 'live_unload': [],
 'load_rate_confirmation': {'booking_number': None,
                            'commodity': None,
                            'dispatch_phone_number': None,
                            'equipment_type': None,
                            'mileage': None,
                            'pieces_pallets': None,
                            'weight': None},
 'order_number': '2897521',
 'payment_processing': {'line_haul': None, 'status_line': None, 'total': None},
 'sent_by_contact': {'contact_info': [], 'sent_by': []},
 'trip_number': None}


In [71]:
get_lines_between(tmp2, "load rate confirmation", "mileage")

[['LOAD RATE CONFIRMATION', 'Trip Number 4309299'],
 ['Please call 310-732-6550 for dispatch and reference order number 2897521.'],
 ['Carrier:', 'TEMP', 'Truck Pay Total:', '$1,934.00'],
 ['Booking:', '808210080', 'Line Haul Rate:', '$1,934.00'],
 ['Voyage:', 'Fuel Surcharge:', '$0.00'],
 ['Accessorial(s):', '$0.00'],
 ['Payment Processing:'],
 ['Equipment Type:', '53AIR'],
 ['PLEASE DO NOT SUBMIT INVOICES FOR PAYMENT'],
 ['Weight:',
  '5000',
  'Payments will be automatically processed via ERS based on Truck Pay Total (see above), standard/negotiated payment'],
 ['Pieces/Pallets:',
  '2.00',
  'terms, and receipt and processing of documentation requirements.'],
 ['-Once Trip Segment is closed, payment terms (e.g., Net 30 day) will start from the date required documentation is'],
 ['Commodity:', 'U-Box', 'processed.'],
 ['-Detention must be requested and approved at the time of the occurrence. Detention in out times must be noted on'],
 ['Mileage:', '2292', 'BOL/POD for payment to be 

In [72]:
regex_tn = r'^\d{7}$'
compiled_re = re.compile(regex_tn)

In [73]:
tn_good_1 = '1234567'
compiled_re.findall(tn_good_1)

['1234567']

In [74]:
tn_bad_1 = '234'
compiled_re.findall(tn_bad_1)

[]

In [75]:
get_lines_between(tmp2, "load rate confirmation", "booking")

[['LOAD RATE CONFIRMATION', 'Trip Number 4309299'],
 ['Please call 310-732-6550 for dispatch and reference order number 2897521.'],
 ['Carrier:', 'TEMP', 'Truck Pay Total:', '$1,934.00'],
 ['Booking:', '808210080', 'Line Haul Rate:', '$1,934.00']]

In [76]:
def get_trip_number(origtmp):
    '''
    Derive Trip Number
    '''
    
    #Get The Trip Number
    regex_tn = r'\d{7}'
    compiled_re = re.compile(regex_tn)
    
    #Get the relevant lines for search
    get_lines = get_lines_between(tmp2, "load rate confirmation", "booking")
    
    #Loop through the subset of lines and apply regex
    for line in get_lines:
        for wordstr in line:
            tn = compiled_re.findall(wordstr)
            if len(tn) > 0:
                return(tn[0])
    return('')

In [77]:
get_trip_number(tmp2)

'4309299'

In [78]:
regex_ca = r'^[A-Z]{4}$'
compiled_re = re.compile(regex_ca)

In [79]:
ca_good_1 = 'TEPO'
compiled_re.findall(ca_good_1)

['TEPO']

In [80]:
ca_bad_1 = 'TEPGO'
compiled_re.findall(ca_bad_1)

[]

In [81]:
def get_carrier_info(origtmp):
    '''
    Derive Carrier Info
    '''
    
    #Get The Division Code or whatever
    regex_ca = r'^[A-Z]{4}$'
    compiled_re = re.compile(regex_ca)
    
    #Get the relevant lines for search
    get_lines = get_lines_between(tmp2, "load rate confirmation", "booking")
    
    #Loop through the subset of lines and apply regex
    for line in get_lines:
        for wordstr in line:
            ca = compiled_re.findall(wordstr)
            if len(ca) > 0:
                return(ca[0])
    return('')

In [82]:
get_carrier_info(tmp2)

'TEMP'

In [83]:
def get_trip_carrier(origtmp):
    trip_number_only = get_trip_number(tmp2)
    carrier_type = get_carrier_info(tmp2)
    
    crowley_details["trip_number"] = trip_number_only
    crowley_details["carrier"] = carrier_type

In [84]:
get_trip_carrier(tmp2)

In [85]:
pprint.pprint(crowley_details)

{'carrier': 'TEMP',
 'division': 'CDTS',
 'live_load': [],
 'live_unload': [],
 'load_rate_confirmation': {'booking_number': None,
                            'commodity': None,
                            'dispatch_phone_number': None,
                            'equipment_type': None,
                            'mileage': None,
                            'pieces_pallets': None,
                            'weight': None},
 'order_number': '2897521',
 'payment_processing': {'line_haul': None, 'status_line': None, 'total': None},
 'sent_by_contact': {'contact_info': [], 'sent_by': []},
 'trip_number': '4309299'}


In [87]:
get_lines_between(tmp2, "booking", "weight")

[['Booking:', '808210080', 'Line Haul Rate:', '$1,934.00'],
 ['Voyage:', 'Fuel Surcharge:', '$0.00'],
 ['Accessorial(s):', '$0.00'],
 ['Payment Processing:'],
 ['Equipment Type:', '53AIR'],
 ['PLEASE DO NOT SUBMIT INVOICES FOR PAYMENT'],
 ['Weight:',
  '5000',
  'Payments will be automatically processed via ERS based on Truck Pay Total (see above), standard/negotiated payment']]

In [92]:
regex_bn = r'^\d{9}$'
compiled_re = re.compile(regex_bn)

In [93]:
bn_good_1 = '123456789'
compiled_re.findall(bn_good_1)

['123456789']

In [94]:
bn_bad_1 = '1234567232389'
compiled_re.findall(bn_bad_1)

[]

In [95]:
bn_bad_2 = '123'
compiled_re.findall(bn_bad_2)

[]

In [96]:
regex_mi = r'^\d{4}$'
compiled_re = re.compile(regex_mi)

In [97]:
mi_good_1 = '1234'
compiled_re.findall(mi_good_1)

['1234']

In [98]:
mi_bad_1 = '123423232'
compiled_re.findall(mi_bad_1)

[]

In [99]:
mi_bad_2 = '12'
compiled_re.findall(mi_bad_2)

[]

In [108]:
regex_dispatch_phone = r'(\d{3}(\-|\.)\d{3}(\-|\.)\d{4})'
compiled_re = re.compile(regex_dispatch_phone)

In [109]:
dp_good_1 = '214-342-5423'
compiled_re.findall(dp_good_1)

[('214-342-5423', '-', '-')]

In [110]:
dp_good_1 = '214.342.5423'
compiled_re.findall(dp_good_1)

[('214.342.5423', '.', '.')]

In [111]:
dp_bad_1 = '214.342'
compiled_re.findall(dp_bad_1)

[]

In [112]:
dp_bad_2 = '214.3432'
compiled_re.findall(dp_bad_2)

[]

In [113]:
dp_bad_3 = '214.343.dad2'
compiled_re.findall(dp_bad_3)

[]

In [118]:
regex_pieces = r'\d{1}\.\d{2}'
compiled_re = re.compile(regex_pieces)

In [119]:
p_good_1 = '4.42'
compiled_re.findall(p_good_1)

['4.42']

In [121]:
p_bad_1 = '41.42'
compiled_re.findall(p_bad_1)

[]

In [122]:
p_bad_2 = '1.424'
compiled_re.findall(p_bad_2)

[]

In [None]:
def get_load_rate_confirmation(origtmp):
    '''
    Derive Load Rate
    '''