In [1]:
# Getting dataset for event management system
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Simulate a reproducible dataset of 50 events with random attributes and
# persist it to event_dataset.csv for the extraction cells to read.
SEED = 42  # fixed seed so Restart & Run All regenerates the same random draws
num_events = 50
random.seed(SEED)

# Capture the clock once so every generated timestamp is offset from the same
# reference instant (previously datetime.now() was re-evaluated per row).
# Absolute dates still depend on when this cell runs; only the offsets are seeded.
now = datetime.now()

data = {
    'event_id': [f'event_{i+1}' for i in range(num_events)],
    'event_type': [random.choice(['concert', 'conference', 'workshop', 'webinar']) for _ in range(num_events)],
    # Events are scheduled 1-365 days after the reference instant.
    'event_date': [now + timedelta(days=random.randint(1, 365)) for _ in range(num_events)],
    'location': [random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']) for _ in range(num_events)],
    'attendees': [random.randint(50, 500) for _ in range(num_events)],
    'duration_hours': [random.randint(1, 8) for _ in range(num_events)],
    'cost_per_attendee': [round(random.uniform(10.0, 500.0), 2) for _ in range(num_events)],
    # NOTE(review): the literal string 'None' is among read_csv's default NA
    # markers in recent pandas, so it likely reloads as NaN -- confirm, and
    # rename the label if it must survive the CSV round trip.
    'sponsor': [random.choice(['Company A', 'Company B', 'Company C', 'None']) for _ in range(num_events)],
    'feedback_score': [random.uniform(1.0, 5.0) for _ in range(num_events)],
    # Each row was last touched 0-30 days before the reference instant; the
    # incremental extraction filters on this column.
    'last_extraction': [now - timedelta(days=random.randint(0, 30)) for _ in range(num_events)],
}
df = pd.DataFrame(data)
df.to_csv('event_dataset.csv', index=False)
df.head()


Unnamed: 0,event_id,event_type,event_date,location,attendees,duration_hours,cost_per_attendee,sponsor,feedback_score,last_extraction
0,event_1,concert,2025-07-31 16:45:24.921576,Los Angeles,303,8,487.97,Company C,4.21842,2025-05-24 16:45:24.921576
1,event_2,webinar,2025-06-21 16:45:24.921576,Chicago,237,6,36.84,Company B,2.608547,2025-06-08 16:45:24.921576
2,event_3,workshop,2025-10-19 16:45:24.921576,Houston,182,3,205.22,,3.628327,2025-06-10 16:45:24.921576
3,event_4,workshop,2026-02-12 16:45:24.921576,Los Angeles,280,4,348.48,Company B,4.464241,2025-05-31 16:45:24.921576
4,event_5,concert,2026-05-10 16:45:24.921576,Phoenix,94,5,287.62,Company C,4.018718,2025-05-20 16:45:24.921576


# FULL Extraction

In [2]:
# Full extraction: load every row of the CSV regardless of any checkpoint.
# Both timestamp columns are parsed so they come back as datetimes; the CSV
# stores them as text, and previously event_date was left as plain strings.
# NOTE(review): the preview shows a blank sponsor for some rows -- the literal
# 'None' label appears to be read back as missing; pass keep_default_na=False
# if that string needs to be preserved.
df_full = pd.read_csv(
    "event_dataset.csv",
    parse_dates=["event_date", "last_extraction"],
)
print(f"Extracted {len(df_full)} rows fully.")
df_full.head()

Extracted 50 rows fully.


Unnamed: 0,event_id,event_type,event_date,location,attendees,duration_hours,cost_per_attendee,sponsor,feedback_score,last_extraction
0,event_1,conference,2026-05-22 17:37:31.804996,Chicago,405,2,412.83,Company C,4.280598,2025-05-12 17:37:31.805987
1,event_2,concert,2026-05-23 17:37:31.804996,Houston,432,8,157.29,Company C,3.468362,2025-06-08 17:37:31.805987
2,event_3,conference,2025-08-12 17:37:31.804996,Houston,440,2,268.59,Company A,3.405829,2025-05-16 17:37:31.805987
3,event_4,webinar,2026-02-26 17:37:31.804996,New York,70,6,67.94,Company C,2.127185,2025-05-26 17:37:31.805987
4,event_5,conference,2026-01-24 17:37:31.804996,Los Angeles,165,2,116.43,,2.129281,2025-05-28 17:37:31.805987


# INCREMENTAL Extraction

In [3]:
# Seed the checkpoint file with a starting watermark (e.g., a point partway
# through the generated last_extraction range) so the incremental run has
# something to compare against.
initial_checkpoint = "2025-06-04 12:00:00"
with open("last_extraction.txt", "w") as checkpoint_file:
    checkpoint_file.write(initial_checkpoint)

In [4]:
# Incremental extraction: read the stored watermark, then keep only the rows
# whose last_extraction timestamp is strictly later than it.
with open("last_extraction.txt", "r") as f:
    last_extraction = f.read().strip()
last_extraction_time = pd.to_datetime(last_extraction)

# last_extraction must be parsed as datetimes for the comparison below.
df = pd.read_csv("event_dataset.csv", parse_dates=["last_extraction"])
is_newer = df['last_extraction'].gt(last_extraction_time)
df_incremental = df.loc[is_newer]

print(f"Extracted {df_incremental.shape[0]}  rows since last check.")
df_incremental.head()

Extracted 5  rows since last check.


Unnamed: 0,event_id,event_type,event_date,location,attendees,duration_hours,cost_per_attendee,sponsor,feedback_score,last_extraction
1,event_2,concert,2026-05-23 17:37:31.804996,Houston,432,8,157.29,Company C,3.468362,2025-06-08 17:37:31.805987
6,event_7,conference,2026-01-24 17:37:31.804996,Chicago,448,1,177.73,Company B,4.952686,2025-06-04 17:37:31.805987
24,event_25,webinar,2025-12-19 17:37:31.804996,Houston,177,6,197.43,Company A,4.809862,2025-06-07 17:37:31.805987
25,event_26,concert,2025-12-20 17:37:31.804996,Houston,86,2,46.62,Company A,4.31197,2025-06-06 17:37:31.805987
35,event_36,workshop,2025-10-10 17:37:31.804996,Houston,495,3,391.44,Company C,2.393547,2025-06-05 17:37:31.805987


In [5]:
# Advance the checkpoint after a successful extraction.
# The candidate watermark is the most recent last_extraction value in the data
# loaded by the incremental-extraction cell.
new_checkpoint = df['last_extraction'].max()

# Guard the write: an empty or all-missing column yields NaT, and persisting
# "NaT" would make every later `> checkpoint` comparison False, so nothing
# would ever be extracted again. Also never move the watermark backwards when
# no newer rows were seen.
if pd.isna(new_checkpoint) or new_checkpoint <= last_extraction_time:
    print(f"No newer rows; last_extraction.txt left at {last_extraction_time}")
else:
    # Save it (ISO 8601 text; pd.to_datetime reads it back on the next run)
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-06-08 17:37:31.805987
