### Cleaning the data to make sure there are no duplicates

In [73]:
from collections import Counter
import pandas as pd

import gzip
import json

In [5]:
def read_comp_json(FileName: str) -> None:
    with gzip.open(FileName, 'r') as fin:
        data = json.loads(fin.read().decode('utf-8'))
    return data

In [4]:
data = read_comp_json(FileName='db.json')

In [31]:
titles = []
for k,v in data.items():
    # k: n + 1
    if k == 'COUNT':
        print(v)
        continue
    titles.append(v['title'])

489


In [34]:
freq = dict(Counter(titles))

{'PGL Wallachia S3': 1,
 'BLAST Slam #3': 1,
 'ESL One Raleigh 2025': 1,
 'PGL Wallachia S4': 1,
 'Dota Duo': 1,
 'Dota, The Q4': 1,
 'Dota & Band': 1,
 'Dota - \\Springbrunnen\\ - Tour 2025': 1,
 'Dota': 1,
 'Esports - Weekly Open Play @ Sunnyslope': 1,
 'Metro Esports - Game Jam!': 1,
 'Esports Lounge Gaming Experience': 2,
 '2025 Wisconsin Esports Summit': 1,
 'Battle for Nevada High School Esports Tournament': 1,
 'Marvel Rivals In-Person Stream Viewing Esports': 1,
 'Esports - Weekly Open Play @ PVCC': 1,
 'Esports': 1,
 'Metro Esports - After School Esports Leagues: Try 1 Month!': 1,
 'Teen Anime Club': 1,
 'Anime & Manga Madness': 1,
 'Anime Art Club': 1,
 'Character Creation: Anime Manga - Young Rembrandts': 1,
 'Anime & Manga: Styles of Classic Mangaka': 1,
 'Anime Milwaukee 2025': 2,
 'ANIME CLUB ages 12 to 18': 1,
 'Anime Club': 3,
 'Anime Afternoon': 1,
 'Trip to Anime Milwaukee': 1,
 'DO NOT DISTURB: An IMPULSE Magazine Benefit Exhibition — IMPULSE Magazine': 1,
 'Impulse 

In [36]:
s_title = set()
for k,v in freq.items():
    if v > 1:
        s_title.add(k)

In [39]:
for k,v in data.items():
    if k == 'COUNT':
        continue
    if v['title'] in s_title:
        print(v)

{'title': 'Esports Lounge Gaming Experience', 'date': {'start_date': 'Mar 2', 'when': 'Sun, Mar 2, 10 PM – Mon, Mar 3, 1 AM'}, 'address': ['', ''], 'link': 'https://allevents.in/palm%20harbor/esports-lounge-gaming-experience/80002614770415?slot=2025-03-02', 'description': 'The new Esports Lounge of palm Harbor is finally here! Palm Harbor Parks and Recreation unveiled its exciting new Esports Lounge to the public on June 18th. Excited gamers community leaders and...', 'ticket_info': [{'source': 'AllEvents', 'link': 'https://allevents.in/palm%20harbor/esports-lounge-gaming-experience/80002614770415?slot=2025-03-02', 'link_type': 'more info'}], 'thumbnail': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTgR8jPQZRns0wbjSopqw_lZUNIkUhr5yNp-grCInIaMNwDJLSbyVYHrms&s', 'image': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ33WtibUT9mLgzzLIBqdswFaQPrcUW3AhzVMSkfBSOvUbvDNTRHrFmv8Trsg&s=10'}
{'title': 'Esports Lounge Gaming Experience', 'date': {'start_date': 'Mar 6', 'when': '

### Duplicate Names result in different events
---

In [None]:
sample = list(data.items())[1]
k,v = sample

## Inventory of Events (Final Format)

### Event Information
- Name       : str       -> The name of the Events.
- Description: str       -> A small description about the "event" in this case it is the video game.
- Price      : float     -> Current in ($USD)
- Likes      : ints      -> Number of current likes

### Geographical
- address1   : str       -> Main address
- address2   : str       -> appt or suit nunber
- city       : str       -> 
- state      : str       -> 
- postalCode : str       -> zipcode
- lat        : float     -> Lattitude
- lng        : float     -> Longitutde. 

### Temporal
- Start time : datetime  -> The time the event will start.
- End time   : datetime  -> The time the event will end. 

In [54]:
del data['COUNT']

In [61]:
unique_keys = set()

for k,v in data.items():
    title     = v['title']
    desc      = v['description'] if 'description' in v.keys() else ''
    main_page = v['link']
    address1, address2 = v['address']
    thumbnail = v['thumbnail']
    image     = v['image'] if 'image' in v.keys() else ''

    start_date = v['date']['start_date']
    when_date  = v['date']['when']
    unique_keys.add(tuple(v.keys()))


In [67]:
unique_keys = [set(i) for i in unique_keys]

In [69]:
global_set = set()
for s in unique_keys:
    global_set = global_set | s
global_set

{'address',
 'date',
 'description',
 'event_location_map',
 'image',
 'link',
 'thumbnail',
 'ticket_info',
 'title',
 'venue'}

The following sets include all the unique combinations of missing attributes for all the events.

In [71]:
for s in unique_keys:
    print(global_set - s)

{'venue'}
{'event_location_map', 'venue'}
{'venue', 'image'}
{'venue', 'event_location_map', 'image', 'description'}
{'image', 'description'}
{'venue', 'event_location_map', 'image'}
{'description'}
set()
{'image'}
{'venue', 'description'}
{'event_location_map', 'venue', 'description'}


### Transform data in pd Dataframe

In [128]:
df = pd.DataFrame(
    columns = ['Title','Desc','Start Date','When','Main Page','Address1', 'Address2','thumb','image'])

In [129]:
for k,v in data.items():
    df = pd.concat(
        [df, pd.Series({
        'Title'      : v['title'],
        'Desc'       : v['description'] if 'description' in v.keys() else '',
        'Start Date' : v['date']['start_date'],
        'When'       : v['date']['when'],
        'Main Page'  : v['link'],
        'Address1'   : v['address'][0],
        'Address2'   : v['address'][1],
        'image'      : v['image'] if 'image' in v.keys() else '',
        'thumb'      : v['thumbnail']
     }).to_frame().transpose()], axis=0
     )

In [130]:
df.reset_index(inplace=True)

In [131]:
df.head(n=2)

Unnamed: 0,index,Title,Desc,Start Date,When,Main Page,Address1,Address2,thumb,image
0,0,PGL Wallachia S3,Full information about PGL Wallachia S3 Dota 2...,Mar 8,"Sat, Mar 8",https://ggscore.com/en/dota-2/pgl-wallachia-se...,"PGL ESPORTS, Bulevardul Dimitrie Pompeiu 9-9A","Bucharest, Romania",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
1,0,BLAST Slam #3,Full information about BLAST Slam #3 Dota 2. M...,May 5,"Mon, May 5",https://ggscore.com/en/dota-2/blast-slam-3,,,https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...


In [132]:
df.drop(columns=['index'], inplace=True)

In [133]:
df.head(n = 5)

Unnamed: 0,Title,Desc,Start Date,When,Main Page,Address1,Address2,thumb,image
0,PGL Wallachia S3,Full information about PGL Wallachia S3 Dota 2...,Mar 8,"Sat, Mar 8",https://ggscore.com/en/dota-2/pgl-wallachia-se...,"PGL ESPORTS, Bulevardul Dimitrie Pompeiu 9-9A","Bucharest, Romania",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
1,BLAST Slam #3,Full information about BLAST Slam #3 Dota 2. M...,May 5,"Mon, May 5",https://ggscore.com/en/dota-2/blast-slam-3,,,https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
2,ESL One Raleigh 2025,Full information about ESL One Raleigh 2025 Do...,Apr 6,"Sun, Apr 6, 8 PM – Sat, Apr 12, 8 PM EDT",https://ggscore.com/en/dota-2/esl-one-raleigh-...,"Raleigh Convention Center, 500 S Salisbury St","Raleigh, NC",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
3,PGL Wallachia S4,Full information about PGL Wallachia S4 Dota 2...,Apr 19,"Sat, Apr 19",https://ggscore.com/en/dota-2/pgl-wallachia-se...,"PGL ESPORTS, Bulevardul Dimitrie Pompeiu 9-9A","Bucharest, Romania",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
4,Dota Duo,Find tickets for Dota at Weingut Karl Sonntag ...,Jul 27,"Sun, Jul 27, 7 – 8 PM GMT+2",https://open.spotify.com/concert/2HiJlly0G2JdB...,"Weingut Karl Sonntag, Kirchenweg 22","Nittel, Germany",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...


In [136]:
df.to_csv('Events.csv', index = False)

In [135]:
d = pd.read_csv('Events.csv',compression = 'gzip')
d.head()

Unnamed: 0,Title,Desc,Start Date,When,Main Page,Address1,Address2,thumb,image
0,PGL Wallachia S3,Full information about PGL Wallachia S3 Dota 2...,Mar 8,"Sat, Mar 8",https://ggscore.com/en/dota-2/pgl-wallachia-se...,"PGL ESPORTS, Bulevardul Dimitrie Pompeiu 9-9A","Bucharest, Romania",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
1,BLAST Slam #3,Full information about BLAST Slam #3 Dota 2. M...,May 5,"Mon, May 5",https://ggscore.com/en/dota-2/blast-slam-3,,,https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
2,ESL One Raleigh 2025,Full information about ESL One Raleigh 2025 Do...,Apr 6,"Sun, Apr 6, 8 PM – Sat, Apr 12, 8 PM EDT",https://ggscore.com/en/dota-2/esl-one-raleigh-...,"Raleigh Convention Center, 500 S Salisbury St","Raleigh, NC",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
3,PGL Wallachia S4,Full information about PGL Wallachia S4 Dota 2...,Apr 19,"Sat, Apr 19",https://ggscore.com/en/dota-2/pgl-wallachia-se...,"PGL ESPORTS, Bulevardul Dimitrie Pompeiu 9-9A","Bucharest, Romania",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
4,Dota Duo,Find tickets for Dota at Weingut Karl Sonntag ...,Jul 27,"Sun, Jul 27, 7 – 8 PM GMT+2",https://open.spotify.com/concert/2HiJlly0G2JdB...,"Weingut Karl Sonntag, Kirchenweg 22","Nittel, Germany",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
