In [32]:
# Getting dataset for event management system
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Simulate data of 50 events with random attributes
num_events = 50
data = {
    'event_id': [f'event_{i+1}' for i in range(num_events)],
    'event_type': [random.choice(['concert', 'conference', 'workshop', 'webinar']) for _ in range(num_events)],
    'event_date': [datetime.now() + timedelta(days=random.randint(1, 365)) for _ in range(num_events)],
    'location': [random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']) for _ in range(num_events)],
    'attendees': [random.randint(50, 500) for _ in range(num_events)],
    'duration_hours': [random.randint(1, 8) for _ in range(num_events)],
    'cost_per_attendee': [round(random.uniform(10.0, 500.0), 2) for _ in range(num_events)],
    'sponsor': [random.choice(['Company A', 'Company B', 'Company C', 'None']) for _ in range(num_events)],
    'feedback_score': [random.uniform(1.0, 5.0) for _ in range(num_events)],
    'last_extraction': [datetime.now() - timedelta(days=random.randint(0, 30)) for _ in range(num_events)],
}
df = pd.DataFrame(data)
df.to_csv('event_dataset.csv', index=False)
df.head()   


Unnamed: 0,event_id,event_type,event_date,location,attendees,duration_hours,cost_per_attendee,sponsor,feedback_score,last_extraction
0,event_1,concert,2026-05-23 14:27:19.434752,New York,148,5,328.8,,2.661034,2025-05-13 14:27:19.434752
1,event_2,workshop,2026-05-06 14:27:19.434752,Houston,498,2,376.13,Company B,2.904821,2025-06-03 14:27:19.434752
2,event_3,webinar,2025-06-11 14:27:19.434752,Los Angeles,375,2,218.69,Company B,2.908508,2025-05-27 14:27:19.434752
3,event_4,webinar,2025-12-29 14:27:19.434752,Houston,382,1,140.38,Company A,3.420219,2025-05-23 14:27:19.434752
4,event_5,concert,2026-01-10 14:27:19.434752,New York,346,3,318.18,Company B,3.258302,2025-05-24 14:27:19.434752


# FULL Extraction

In [33]:
# Full Extraction
df_full = pd.read_csv("event_dataset.csv", parse_dates=["last_extraction"])
print(f"Extracted {len(df_full)} rows fully.")
df_full.head()

Extracted 50 rows fully.


Unnamed: 0,event_id,event_type,event_date,location,attendees,duration_hours,cost_per_attendee,sponsor,feedback_score,last_extraction
0,event_1,concert,2026-05-23 14:27:19.434752,New York,148,5,328.8,,2.661034,2025-05-13 14:27:19.434752
1,event_2,workshop,2026-05-06 14:27:19.434752,Houston,498,2,376.13,Company B,2.904821,2025-06-03 14:27:19.434752
2,event_3,webinar,2025-06-11 14:27:19.434752,Los Angeles,375,2,218.69,Company B,2.908508,2025-05-27 14:27:19.434752
3,event_4,webinar,2025-12-29 14:27:19.434752,Houston,382,1,140.38,Company A,3.420219,2025-05-23 14:27:19.434752
4,event_5,concert,2026-01-10 14:27:19.434752,New York,346,3,318.18,Company B,3.258302,2025-05-24 14:27:19.434752


# INCREMENTAL Extraction

In [45]:
# Set initial last extraction time (e.g., halfway through the data range)
with open("last_extraction.txt", "w") as f:
    f.write("2025-06-04 12:00:00") 

In [46]:
# INCREMENTAL EXTRACTION
with open("last_extraction.txt", "r") as f:
    last_extraction = f.read().strip()
df = pd.read_csv("event_dataset.csv", parse_dates=["last_extraction"])
last_extraction_time = pd.to_datetime(last_extraction)
df_incremental = df[df['last_extraction'] > last_extraction_time]
print(f"Extracted {len(df_incremental)}  rows since last check.")
df_incremental.head()

Extracted 4  rows since last check.


Unnamed: 0,event_id,event_type,event_date,location,attendees,duration_hours,cost_per_attendee,sponsor,feedback_score,last_extraction
12,event_13,webinar,2025-12-18 14:27:19.434752,New York,89,5,483.29,Company C,1.803054,2025-06-06 14:27:19.434752
17,event_18,workshop,2026-05-05 14:27:19.434752,Houston,152,3,40.92,,2.135554,2025-06-06 14:27:19.434752
32,event_33,workshop,2026-02-03 14:27:19.434752,Phoenix,174,2,354.49,Company B,2.927879,2025-06-06 14:27:19.434752
40,event_41,conference,2026-03-07 14:27:19.434752,Los Angeles,148,4,124.51,,3.593551,2025-06-04 14:27:19.434752


In [48]:
# Successful extraction
# Get the most recent update
new_checkpoint = df['last_extraction'].max()
# Save it
with open("last_extraction.txt", "w") as f:
    f.write(new_checkpoint.isoformat())
print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-06-06 14:27:19.434752
