In [1]:
from google.colab import drive
import os
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import json
with open('gdrive/MyDrive/contacts.json') as f:
  data = json.loads(f.read())
for x in data:
  x['Id'] = str(x['Id'])

In [3]:
data[0:3]

[{'Contacts': 1,
  'Email': 'gkzAbIy@qq.com',
  'Id': '0',
  'OrderId': '',
  'Phone': ''},
 {'Contacts': 4,
  'Email': '',
  'Id': '1',
  'OrderId': 'vDDJJcxfLtSfkooPhbYnJdxov',
  'Phone': '329442681752'},
 {'Contacts': 0, 'Email': '', 'Id': '2', 'OrderId': '', 'Phone': '9125983679'}]

In [4]:
import pandas as pd
df = pd.json_normalize(data)

In [5]:
df.head()

Unnamed: 0,Id,Email,Phone,Contacts,OrderId
0,0,gkzAbIy@qq.com,,1,
1,1,,329442681752.0,4,vDDJJcxfLtSfkooPhbYnJdxov
2,2,,9125983679.0,0,
3,3,mdllpYmE@gmail.com,,0,bHquEnCbbsGLqllwryxPsNOxa
4,4,,300364407.0,2,


In [6]:
print(len(df[(df.Email != "") | (df.Phone != "") | (df.OrderId != "")]))
print(df.Email.nunique())
print(df.Phone.nunique())
print(df.OrderId.nunique())

500000
249157
190678
189303


In [7]:
# singly linked list
class Node:
  def __init__(self, value=None):
    self.next = None
    self.value = set([value]) if value is not None else set()

def get_deepest_node(node):
  while node.next is not None:
    node = node.next
  return node

def gen_new_node(node_list):
  # connect each node in the node_list to a new_node
  # (all the nodes in the node_list share the same new_node object)
  # the value of new_node is the union of all its previous nodes 
  new_node = Node()
  for nd in node_list:
    nd.next = new_node
    new_node.value = new_node.value.union(nd.value)

In [8]:
# initialization，give each Id a brand new node
res = dict() # Id -> Node
for x in data:
  res[x['Id']] = Node(x['Id'])

In [9]:
print(res['0'])
print(get_deepest_node(res['0']))
print(id(res['0']))
print(id(get_deepest_node(res['0'])))
print(res['0'].next)
print(res['0'].value)

<__main__.Node object at 0x7f0975c70d90>
<__main__.Node object at 0x7f0975c70d90>
139678607412624
139678607412624
None
{'0'}


In [10]:
for feat in ['OrderId', 'Phone', 'Email']:
  val_cnt = df[feat].value_counts()
  candidate = {x for x in val_cnt[(val_cnt > 1)].index if x != ""}
  for _, grp in df[df[feat].isin(candidate)].groupby(feat):
    group_ids = set(grp.Id)
    uniq_nodes = {
        id(get_deepest_node(res[id_])): get_deepest_node(res[id_]) 
        for id_ in group_ids
        } # Node_address -> Node_object
    gen_new_node([nd for nd in uniq_nodes.values()])
    # print(group_ids)
    # break

In [11]:
# sanity check
for i in {'5', '50'}:
  print(i)
  print('this')
  print(res[i])
  print(get_deepest_node(res[i]))
  print(id(res[i]))
  print(id(get_deepest_node(res[i])))
  print(res[i].next)
  print(res[i].value)
  print('deepest value')
  print(get_deepest_node(res[i]).value)
  print()

5
this
<__main__.Node object at 0x7f0975c70e50>
<__main__.Node object at 0x7f0969edc590>
139678607412816
139678408623504
<__main__.Node object at 0x7f096c95bbd0>
{'5'}
deepest value
{'5', '50', '482810', '404324', '458692', '226720', '383605', '215197', '212533'}

50
this
<__main__.Node object at 0x7f0975c72950>
<__main__.Node object at 0x7f0969edc590>
139678607419728
139678408623504
<__main__.Node object at 0x7f096e5298d0>
{'50'}
deepest value
{'5', '50', '482810', '404324', '458692', '226720', '383605', '215197', '212533'}



In [12]:
# id -> aggregated new id
remap = {k: '-'.join(sorted(get_deepest_node(v).value, key=int)) for k, v in res.items()}

In [13]:
# sanity check
for i in sorted({'215197', '50', '404324', '5', '212533', '226720', '458692', '383605', '482810'}, key=int):
  print(f'{i:>10}:    {remap[i]}')

         5:    5-50-212533-215197-226720-383605-404324-458692-482810
        50:    5-50-212533-215197-226720-383605-404324-458692-482810
    212533:    5-50-212533-215197-226720-383605-404324-458692-482810
    215197:    5-50-212533-215197-226720-383605-404324-458692-482810
    226720:    5-50-212533-215197-226720-383605-404324-458692-482810
    383605:    5-50-212533-215197-226720-383605-404324-458692-482810
    404324:    5-50-212533-215197-226720-383605-404324-458692-482810
    458692:    5-50-212533-215197-226720-383605-404324-458692-482810
    482810:    5-50-212533-215197-226720-383605-404324-458692-482810


In [14]:
df['newId'] = [remap[id_] for id_ in df.Id]
df.head(5)

Unnamed: 0,Id,Email,Phone,Contacts,OrderId,newId
0,0,gkzAbIy@qq.com,,1,,0
1,1,,329442681752.0,4,vDDJJcxfLtSfkooPhbYnJdxov,1-2458-98519-115061-140081-165605-476346
2,2,,9125983679.0,0,,2-159312-322639-348955
3,3,mdllpYmE@gmail.com,,0,bHquEnCbbsGLqllwryxPsNOxa,3
4,4,,300364407.0,2,,4


In [15]:
df = df.assign(ContactsSum=df.groupby('newId')['Contacts'].transform('sum'))
df.head(5)

Unnamed: 0,Id,Email,Phone,Contacts,OrderId,newId,ContactsSum
0,0,gkzAbIy@qq.com,,1,,0,1
1,1,,329442681752.0,4,vDDJJcxfLtSfkooPhbYnJdxov,1-2458-98519-115061-140081-165605-476346,12
2,2,,9125983679.0,0,,2-159312-322639-348955,4
3,3,mdllpYmE@gmail.com,,0,bHquEnCbbsGLqllwryxPsNOxa,3,0
4,4,,300364407.0,2,,4,2


In [16]:
# sanity check
print(df[df.Id.isin({'215197', '50', '404324', '5', '212533', '226720', '458692', '383605', '482810'})].Contacts.sum())
df[df.Id.isin({'215197', '50', '404324', '5', '212533', '226720', '458692', '383605', '482810'})]

15


Unnamed: 0,Id,Email,Phone,Contacts,OrderId,newId,ContactsSum
5,5,,840113148.0,0,,5-50-212533-215197-226720-383605-404324-458692...,15
50,50,TIMejlVYhfqjy@gmail.com,840113148.0,4,XBkNrwcWWslDvSQdhYPfCpQJo,5-50-212533-215197-226720-383605-404324-458692...,15
212533,212533,izafiOnTIVibbz@hotmail.com,4962058234.0,0,,5-50-212533-215197-226720-383605-404324-458692...,15
215197,215197,izafiOnTIVibbz@hotmail.com,2529522887.0,0,sEgewZTtnDMAADcwLSjxsHiSK,5-50-212533-215197-226720-383605-404324-458692...,15
226720,226720,,2529522887.0,4,XBkNrwcWWslDvSQdhYPfCpQJo,5-50-212533-215197-226720-383605-404324-458692...,15
383605,383605,izafiOnTIVibbz@hotmail.com,,0,,5-50-212533-215197-226720-383605-404324-458692...,15
404324,404324,yFcCGIfYcFKfmqpNk@yahoo.com,,3,,5-50-212533-215197-226720-383605-404324-458692...,15
458692,458692,yFcCGIfYcFKfmqpNk@yahoo.com,,0,XBkNrwcWWslDvSQdhYPfCpQJo,5-50-212533-215197-226720-383605-404324-458692...,15
482810,482810,yFcCGIfYcFKfmqpNk@yahoo.com,,4,,5-50-212533-215197-226720-383605-404324-458692...,15


In [17]:
# submission
sub_df = df[['Id', 'newId']].copy()
sub_df['newId'] = sub_df['newId'] + ", " + df['ContactsSum'].astype(str)
sub_df.columns = ['ticket_id', 'ticket_trace/contact']
sub_df = sub_df.sort_values('ticket_id').reset_index(drop=True)
sub_df.head()

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1-2458-98519-115061-140081-165605-476346, 12"
2,10,"10-93270, 7"
3,100,100-822-1157-79530-95287-109959-129043-146402-...
4,1000,"1000-86254, 3"


In [18]:
sub_df.to_csv("answer.csv", index=False)