In [521]:
import pandas as pd
import numpy as np

In [522]:
def load_data(df_name):
    
    df = pd.read_csv(df_name, header = None)
    
    if df_name == 'df_2.csv':
        
        df = df.loc[0,:].tolist()
        
        return df
    
    elif df_name == 'df_3.csv':
        
        df = df.values.tolist()
        
        return df
    
    else:      
    
        df.columns = ['data']
        df = df.data.tolist()
    
        return df   

In [523]:
#Load data

In [524]:
#Part 1

In [525]:
def parse_valid_numbers_list(valid_fields):
    
    """
    Function takes as input the list of valid fields, and returns 
    a (parsed) list of values that are valid.
    
    """
    
    valid_field_list = []
    
    for values in valid_fields:
        
        data = values.split(':',)[1].strip()
        first_range = data.split('or')[0].strip()
        second_range = data.split('or')[1].strip()
        
        range_1 = (int(first_range.split('-')[0]), int(first_range.split('-')[1]))
        range_2 = (int(second_range.split('-')[0]), int(second_range.split('-')[1]))
        
        valid_field_list.append(range_1)
        valid_field_list.append(range_2)      

    
    
    return valid_field_list        

In [526]:
def check_if_number_is_invalid(input_number, valid_fields):
    
    """ 
    Function takes as input a number, and check if it exists in 
    valid_fields. If it does, the function returns 0. Otherwise,
    the function returns 1.
    
    """
    
    for value_ranges in valid_fields:
        
        mask1 = input_number >= value_ranges[0]
        mask2 = input_number <= value_ranges[1]
        mask3 = (mask1 & mask2)

        if mask3:
            
            return 0
        
        else:
            
            pass
        
    return 1  

In [527]:
df_1 = load_data('df_1.csv')

In [528]:
df_2 = load_data('df_2.csv')

In [529]:
df_3 = load_data('df_3.csv')

In [530]:
df_1 = parse_valid_numbers_list(df_1)

In [531]:
invalid_numbers = []

for lists in df_3:
    
    for values in lists:
        
        invalid_number = check_if_number_is_invalid(values, df_1)
        
        if invalid_number == 1:
            
            invalid_numbers.append(values)
        
        else:
            
            pass

In [532]:
print("The ticket scanning error rate is: {}".format(np.sum(invalid_numbers)))

The ticket scanning error rate is: 25788


In [533]:
#Part 2

In [534]:
#Remove tickets that contain invalid fields!

In [535]:
print("There are {} tickets in total".format(len(df_3)))

There are 237 tickets in total


In [536]:
valid_tickets = []

for ticket in df_3:
    
    invalid_numbers_in_ticket = []
    
    for number in ticket:
        
        invalid_numbers_in_ticket.append(check_if_number_is_invalid(number, df_1))
        
    if np.sum(invalid_numbers_in_ticket) > 0:
        
        pass
    
    else:
        
        valid_tickets.append(ticket)       

In [537]:
print("There are {} valid tickets".format(len(valid_tickets)))

There are 190 valid tickets


In [538]:
field_names = [

'departure location', 
'departure station',
'departure platform', 
'departure track', 
'departure date', 
'departure time', 
'arrival location',
'arrival station', 
'arrival platform', 
'arrival track', 
'class',
'duration', 
'price', 
'route',
'row', 
'seat', 
'train', 
'type', 
'wagon', 
'zone']

In [539]:
my_dict = {}
i = 0

for field in field_names:
    associated_fields = []
    associated_fields.append(df_1[i])
    associated_fields.append(df_1[i + 1])
    my_dict[field] = associated_fields
    i += 2

In [540]:
def check_if_field_is_valid(tickets, relevant_range, guess):
    
    """
    Function takes as input data with tickets, the range associated with the field we are examining, and
    a guess about which column corresponds to the field of interest. If no inconsistencys are found, funcion
    returns 1, else 0.
    """
    
    numbers = []
    
    for ticket in tickets:
        
        value_in_col = ticket[guess]
        
        for i, ranges in enumerate(relevant_range):
            
            mask1 = value_in_col >= ranges[0]
            mask2 = value_in_col <= ranges[1]
            mask3 = (mask1 & mask2)
            
            if mask3:
                
                numbers.append(1)
                break
                
            elif i == 1:
                
                numbers.append(0)
                break
                
            else:
                
                pass       
  
    
    check = np.min(numbers)    
    
    if check > 0:
        
        return 1
    
    else:
        
        return 0

In [541]:
#Create dictionary 

In [542]:
res_dict = {}
cols = [x for x in range(20)]

In [543]:
for field_name in field_names:
    
    relevant_ranges = my_dict[field_name]
    res_list = []
    
    for col in cols:
        
        res_list.append(check_if_field_is_valid(valid_tickets, relevant_ranges, col))
        
    
    res_dict[field_name] = res_list  

In [544]:
df = pd.DataFrame(res_dict)

In [545]:
degrees_of_freedom = df.sum().sort_values().to_frame('degrees_of_freedom').reset_index()

In [546]:
degrees_of_freedom = degrees_of_freedom.rename(columns = {'index': 'field'})

In [547]:
degrees_of_freedom

Unnamed: 0,field,degrees_of_freedom
0,arrival track,1
1,train,2
2,arrival platform,3
3,arrival location,4
4,class,5
5,price,6
6,type,7
7,arrival station,8
8,seat,9
9,departure track,10


Algo:

1) Pick top-field in degrees_of_freedom (starts at "arrival track").

2) Return index where arrival track column = 1.

3) Set this row to zero for all columns.

4) Continue to next value in degrees_of_freedom.

In [548]:
final_dict = {}

In [549]:
search_order = degrees_of_freedom.field.tolist()

In [550]:
for field in search_order:
    
    col_position = int(df[df[field] == 1].index.values)
    final_dict[field] = col_position
    
    mask = df.index == col_position

    for var in df.columns:
    
        df.loc[mask,var] = 0

In [551]:
final_dict

{'arrival track': 0,
 'train': 3,
 'arrival platform': 8,
 'arrival location': 1,
 'class': 12,
 'price': 14,
 'type': 6,
 'arrival station': 5,
 'seat': 9,
 'departure track': 17,
 'departure location': 13,
 'departure time': 16,
 'departure date': 19,
 'departure station': 10,
 'departure platform': 7,
 'duration': 4,
 'row': 2,
 'wagon': 15,
 'route': 11,
 'zone': 18}

In [552]:
fields_to_sum = [17,13,16,19,10,7]

In [553]:
fields_to_sum

[17, 13, 16, 19, 10, 7]

In [554]:
my_ticket = [163,151,149,67,71,79,109,61,83,137,89,59,53,179,73,157,139,173,131,167]

In [560]:
factors = []

for j in fields_to_sum:
    
    factors.append(my_ticket[j])

In [569]:
np.prod(factors)

-1559356505

In [567]:
factors

[173, 179, 139, 167, 89, 61]