In [24]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Configuration
num_records = 50
seed = 42  # fixed seed so a fresh kernel regenerates the exact same dataset
users = [f"user_{i}" for i in range(1, 21)]
devices = ['desktop', 'mobile', 'tablet']
short_words = ['LOG', 'ACT', 'WEB', 'APP', 'USR', 'NET', 'DAT', 'SFX']
start_date = datetime(2025, 5, 1)

# session_id is "<3 digits>-<short word>", so only 1000 * len(short_words)
# distinct ids exist; fail loudly instead of looping forever below.
max_unique_ids = 1000 * len(short_words)
if num_records > max_unique_ids:
    raise ValueError(
        f"num_records={num_records} exceeds the {max_unique_ids} unique session ids available"
    )

# A dedicated generator keeps the run reproducible without reseeding the
# global `random` state that other cells may rely on.
rng = random.Random(seed)

# Generate synthetic session log data with custom session_id
data = []
seen_session_ids = set()
for _ in range(num_records):
    # Start somewhere in the 31 days beginning at start_date, minute resolution.
    session_start = start_date + timedelta(
        days=rng.randint(0, 30),
        hours=rng.randint(0, 23),
        minutes=rng.randint(0, 59)
    )
    # Sessions last between 5 and 120 minutes (inclusive).
    session_length = timedelta(minutes=rng.randint(5, 120))
    session_end = session_start + session_length

    # Redraw on collision so session_id can serve as a unique key.
    session_id = f"{rng.randint(0, 999):03d}-{rng.choice(short_words)}"
    while session_id in seen_session_ids:
        session_id = f"{rng.randint(0, 999):03d}-{rng.choice(short_words)}"
    seen_session_ids.add(session_id)

    data.append({
        'session_id': session_id,
        'user_id': rng.choice(users),
        'device_type': rng.choice(devices),
        'start_time': session_start.isoformat(),
        'end_time': session_end.isoformat(),
    })

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv('user_sessions.csv', index=False)

# Preview
print(df.head())


  session_id user_id device_type           start_time             end_time
0    410-LOG  user_8      tablet  2025-05-30T02:16:00  2025-05-30T02:24:00
1    953-SFX  user_3      mobile  2025-05-29T03:51:00  2025-05-29T04:16:00
2    920-APP  user_2     desktop  2025-05-01T06:31:00  2025-05-01T07:45:00
3    264-NET  user_3     desktop  2025-05-31T12:24:00  2025-05-31T14:10:00
4    530-DAT  user_4      mobile  2025-05-16T00:25:00  2025-05-16T00:31:00


In [25]:
# SECTION 2: Full Extraction
# Load every row of the session log in one pass, parsing both timestamp
# columns into datetimes as the file is read.
timestamp_columns = ["start_time", "end_time"]
df_full = pd.read_csv("user_sessions.csv", parse_dates=timestamp_columns)

full_row_count = len(df_full)
print(f"Extracted {full_row_count} rows fully.")
print(df_full.head())


Extracted 50 rows fully.
  session_id user_id device_type          start_time            end_time
0    410-LOG  user_8      tablet 2025-05-30 02:16:00 2025-05-30 02:24:00
1    953-SFX  user_3      mobile 2025-05-29 03:51:00 2025-05-29 04:16:00
2    920-APP  user_2     desktop 2025-05-01 06:31:00 2025-05-01 07:45:00
3    264-NET  user_3     desktop 2025-05-31 12:24:00 2025-05-31 14:10:00
4    530-DAT  user_4      mobile 2025-05-16 00:25:00 2025-05-16 00:31:00


In [26]:
# SECTION 3: Simulate Initial Last Extraction Time
# Seed the checkpoint file with a fixed timestamp; any existing content is
# overwritten because the file is opened in "w" mode.
initial_checkpoint = "2025-04-15 12:00:00"
with open("last_extraction.txt", "w") as checkpoint_file:
    checkpoint_file.write(initial_checkpoint)



In [27]:
# SECTION 4: Incremental Extraction
# Read back the stored checkpoint (surrounding whitespace stripped).
with open("last_extraction.txt", "r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()

# Reload the session log with parsed timestamps, then keep only the rows
# whose start_time is strictly later than the checkpoint.
df = pd.read_csv("user_sessions.csv", parse_dates=["start_time", "end_time"])
last_extraction_time = pd.to_datetime(last_extraction)
started_after_checkpoint = df['start_time'] > last_extraction_time
df_incremental = df[started_after_checkpoint]

incremental_row_count = len(df_incremental)
print(f"Extracted {incremental_row_count} rows incrementally since {last_extraction}.")
print(df_incremental.head())


Extracted 50 rows incrementally since 2025-04-15 12:00:00.
  session_id user_id device_type          start_time            end_time
0    410-LOG  user_8      tablet 2025-05-30 02:16:00 2025-05-30 02:24:00
1    953-SFX  user_3      mobile 2025-05-29 03:51:00 2025-05-29 04:16:00
2    920-APP  user_2     desktop 2025-05-01 06:31:00 2025-05-01 07:45:00
3    264-NET  user_3     desktop 2025-05-31 12:24:00 2025-05-31 14:10:00
4    530-DAT  user_4      mobile 2025-05-16 00:25:00 2025-05-16 00:31:00


In [28]:
# SECTION 5: Save New Timestamp
# Advance the checkpoint to the newest start_time among the rows that were
# actually extracted in this run. When nothing new was extracted, .max() on
# the empty selection is NaT; persisting that would leave an unusable
# checkpoint, so the file is left untouched in that case.
new_checkpoint = df_incremental['start_time'].max()

if pd.isna(new_checkpoint):
    print("No new rows extracted; last_extraction.txt left unchanged.")
else:
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-31 12:24:00
