In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns


RAW_DATA = '../data/raw/'

In [2]:
!ls ../data/raw

'Dataset for Multi-Channel Contacts Problem (3).zip'   contacts.json
'Sample Submissions.csv'


In [3]:
import json

In [4]:
# Read as dict
data = []
with open("../data/raw/contacts.json") as f:
    data = json.load(f)

len(data)

500000

In [5]:
# Create Reverse Index Email
reverse_index_email = {}
for row in data:
    if row["Email"] != '':
        if row["Email"] not in reverse_index_email:
            reverse_index_email[row["Email"]] = []
        reverse_index_email[row["Email"]].append(row["Id"])
        
len(reverse_index_email)

249156

In [6]:
# Create Reverse Index Phone
reverse_index_phone = {}
for row in data:
    if row["Phone"] != '':
        if row["Phone"] not in reverse_index_phone:
            reverse_index_phone[row["Phone"]] = []
        reverse_index_phone[row["Phone"]].append(row["Id"])
        
len(reverse_index_phone)

190677

In [7]:
# Create Reverse Index OrderId
reverse_index_order = {}
for row in data:
    if row["OrderId"] != '':
        if row["OrderId"] not in reverse_index_order:
            reverse_index_order[row["OrderId"]] = []
        reverse_index_order[row["OrderId"]].append(row["Id"])
        
len(reverse_index_order)

189302

In [8]:
# Create array of id
id_bucket = [row["Id"] for row in data]

rank = [0 for i in id_bucket]
len(id_bucket)

500000

In [9]:
# Create methods for UFDS from CP3
def find_set(idx):
    if id_bucket[idx] == idx:
        return idx
    id_bucket[idx] = find_set(id_bucket[idx])
    return id_bucket[idx]

def is_same_set(i, j):
    return find_set(i) == find_set(j)

def union_set(i,j):
    if not is_same_set(i,j):
        x,y = find_set(i), find_set(j)
        if rank[x] > rank[y]:
            id_bucket[y] = x
        else:
            id_bucket[x] = y
            if rank[x] == rank[y]:
                rank[y] += 1

In [10]:
from tqdm import tqdm

In [11]:
# Run the algorithm
for i in tqdm(range(len(data)), total=len(data)):
    row = data[i]
    # Iterate Email
    if row["Email"] != "":
        for id_data in reverse_index_email[row["Email"]]:
            if id_data != i:
                union_set(i, id_data)
    # Iterate Phone
    if row["Phone"] != "":
        for id_data in reverse_index_phone[row["Phone"]]:
            if id_data != i:
                union_set(i, id_data)
    # Iterate orderId
    if row["OrderId"] != "":
        for id_data in reverse_index_order[row["OrderId"]]:
            if id_data != i:
                union_set(i, id_data)


100%|██████████| 500000/500000 [00:01<00:00, 440716.20it/s]


In [12]:
from collections import OrderedDict 

result = OrderedDict()
for i in range(len(id_bucket)):
    res = id_bucket[i]
    parent = find_set(res)
    
    if parent not in result:
        result[parent] = {}
        result[parent]["contacts"] = 0
        result[parent]["ids"] = []
    
    result[parent]["contacts"] += data[i]["Contacts"]
    result[parent]["ids"].append(i)

In [13]:
result

OrderedDict([(0, {'contacts': 1, 'ids': [0]}),
             (2458,
              {'contacts': 12,
               'ids': [1, 2458, 98519, 115061, 140081, 165605, 476346]}),
             (348955, {'contacts': 4, 'ids': [2, 159312, 322639, 348955]}),
             (3, {'contacts': 0, 'ids': [3]}),
             (4, {'contacts': 2, 'ids': [4]}),
             (50,
              {'contacts': 15,
               'ids': [5,
                50,
                212533,
                215197,
                226720,
                383605,
                404324,
                458692,
                482810]}),
             (38, {'contacts': 13, 'ids': [6, 38, 32871, 142067, 236367]}),
             (7, {'contacts': 1, 'ids': [7]}),
             (183160, {'contacts': 5, 'ids': [8, 183160, 406623]}),
             (468927,
              {'contacts': 8,
               'ids': [9, 13, 16708, 33415, 343161, 417916, 468927, 484896]}),
             (93270, {'contacts': 7, 'ids': [10, 93270]}),
           

In [14]:
for key, value in result.items():
    result[key]["string"] = "-".join(str(idx) for idx in sorted(value["ids"]))

In [15]:
sub_df = {
    'ticket_id':[],
    'ticket_trace':[],
    'contact':[]
}
# # Generate CSV file
# with open("result.csv", "w") as f:  
#     f.write("ticket_id,")
for i in tqdm(range(len(data)), total=len(data)):
    sub_df['ticket_id'].append(i)
    information = result[find_set(i)]
    sub_df['ticket_trace'].append(information["string"])
    sub_df['contact'].append(information["contacts"])

100%|██████████| 500000/500000 [00:00<00:00, 910155.07it/s]


In [16]:
sub_df = pd.DataFrame(sub_df)

In [17]:
sub_df.shape

(500000, 3)

In [18]:
sub_df.drop_duplicates().shape

(500000, 3)

In [19]:
df = pd.DataFrame(data)

In [20]:
assert (
    sub_df.groupby('ticket_trace')['contact'].min().sum() == 
    sub_df.groupby('ticket_trace')['contact'].max().sum() == 
    sub_df.groupby('ticket_trace')['contact'].mean().sum() == 
    df['Contacts'].sum()
)

In [21]:
sub_df['ticket_count'] = sub_df['ticket_trace'].apply(lambda x: len(x.split('-')))

In [22]:
sub_df.groupby('ticket_trace')['ticket_count'].max().sum()

500000

In [23]:
sub_df.sort_values(['ticket_trace', 'ticket_id']).head(10)

Unnamed: 0,ticket_id,ticket_trace,contact,ticket_count
0,0,0,1,1
1,1,1-2458-98519-115061-140081-165605-476346,12,7
2458,2458,1-2458-98519-115061-140081-165605-476346,12,7
98519,98519,1-2458-98519-115061-140081-165605-476346,12,7
115061,115061,1-2458-98519-115061-140081-165605-476346,12,7
140081,140081,1-2458-98519-115061-140081-165605-476346,12,7
165605,165605,1-2458-98519-115061-140081-165605-476346,12,7
476346,476346,1-2458-98519-115061-140081-165605-476346,12,7
10,10,10-93270,7,2
93270,93270,10-93270,7,2


In [24]:
sample_submission = pd.read_csv('../data/raw/Sample Submissions.csv')

In [25]:
sample_submission.columns

Index(['ticket_id', 'ticket_trace/contact'], dtype='object')

In [26]:
sample_submission

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1, 2"
2,2,"2-150, 6"
3,3,"3-100, 3"


In [27]:
sub_df['ticket_trace/contact'] = sub_df['ticket_trace'].apply(str) + ', ' + sub_df['contact'].apply(str)

In [28]:
sub_df[sample_submission.columns].to_csv('../data/submission/solution.csv', index=False)

In [29]:
sample_submission

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1, 2"
2,2,"2-150, 6"
3,3,"3-100, 3"


In [30]:
sub_df[sample_submission.columns].head(4)

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1-2458-98519-115061-140081-165605-476346, 12"
2,2,"2-159312-322639-348955, 4"
3,3,"3, 0"
