In [21]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [22]:

# Build a small synthetic "source system" table to extract from.
# Both RNGs are seeded so every run regenerates the same rows.
np.random.seed(42)
random.seed(42)
num_records = 100

# Signup day: a random offset of 0..364 days from 2022-01-01.
day_offsets = np.random.randint(0, 365, size=num_records)
signup_dates = pd.to_datetime('2022-01-01') + pd.to_timedelta(day_offsets, unit='D')

# Draw the remaining random columns up front, in a fixed order, so the
# seeded numpy stream is consumed identically on every run.
ages = np.random.randint(18, 60, size=num_records)
countries = np.random.choice(['Kenya', 'Cameroon', 'USA', 'UK'], size=num_records)
active_flags = np.random.choice([True, False], size=num_records)

user_data = pd.DataFrame({
    'user_id': np.arange(1, num_records + 1),
    'age': ages,
    'signup_date': signup_dates,
    'country': countries,
    'is_active': active_flags,
})


def _random_last_updated(signup):
    """Return `signup` plus a random 0-23 h / 0-59 min offset as an ISO-8601 string."""
    # Hours are drawn before minutes; the draw order fixes the seeded output.
    extra_hours = random.randint(0, 23)
    extra_minutes = random.randint(0, 59)
    return (signup + timedelta(hours=extra_hours, minutes=extra_minutes)).isoformat()


# last_updated falls on the signup day itself, stored as text.
user_data['last_updated'] = [_random_last_updated(day) for day in signup_dates]

print("Extracted User data (first 5 rows):")
print(user_data.head())


Extracted User data (first 5 rows):
   user_id  age signup_date   country  is_active         last_updated
0        1   25  2022-04-13  Cameroon      False  2022-04-13T20:07:00
1        2   41  2022-12-15       USA      False  2022-12-15T00:47:00
2        3   28  2022-09-28     Kenya      False  2022-09-28T08:15:00
3        4   34  2022-04-17        UK       True  2022-04-17T07:08:00
4        5   25  2022-03-13     Kenya      False  2022-03-13T23:06:00


In [23]:
# Sanity check on the generated table: (rows, columns).
user_data.shape

(100, 6)

In [24]:
# Write the simulated table to disk as CSV; index=False keeps the
# DataFrame's row index out of the file.
user_data.to_csv('user_data_large.csv', index=False)



In [25]:
# FULL EXTRACTION
# Load every row of the source CSV in one pass, parsing `last_updated`
# into datetimes at read time.
df_full = pd.read_csv(
    "user_data_large.csv",
    parse_dates=["last_updated"],
)
print(f"Pulled {len(df_full)} rows via full extraction.")

# Preview the first rows as the cell's displayed result.
df_full.head()

Pulled 100 rows via full extraction.


Unnamed: 0,user_id,age,signup_date,country,is_active,last_updated
0,1,25,2022-04-13,Cameroon,False,2022-04-13 20:07:00
1,2,41,2022-12-15,USA,False,2022-12-15 00:47:00
2,3,28,2022-09-28,Kenya,False,2022-09-28 08:15:00
3,4,34,2022-04-17,UK,True,2022-04-17 07:08:00
4,5,25,2022-03-13,Kenya,False,2022-03-13 23:06:00


In [31]:
# Seed the checkpoint file with a timestamp in the middle of the data's
# `last_updated` range.
last_updated_col = df_full['last_updated']
min_time, max_time = last_updated_col.min(), last_updated_col.max()

# Midpoint = earliest timestamp plus half of the total span.
half_span = (max_time - min_time) / 2
initial_checkpoint = min_time + half_span

# Persist the checkpoint as an ISO-8601 string.
with open("last_extraction.txt", "w") as checkpoint_file:
    checkpoint_file.write(initial_checkpoint.isoformat())

print(f"Initial extraction checkpoint set to: {initial_checkpoint}")


Initial extraction checkpoint set to: 2022-07-02 21:47:00


In [32]:
# INCREMENTAL EXTRACTION
# Pull only rows whose `last_updated` is strictly later than the timestamp
# saved in last_extraction.txt.
#
# If there is no usable checkpoint (file missing, empty, or holding "NaT"),
# fall back to pulling every row: any comparison against NaT is False, so the
# filter below would otherwise silently return zero rows.
try:
    with open("last_extraction.txt", "r") as f:
        last_extraction = f.read().strip()
except FileNotFoundError:
    last_extraction = ""

df = pd.read_csv("user_data_large.csv", parse_dates=["last_updated"])

# An empty checkpoint is treated as "never extracted". Content that is not a
# timestamp at all still raises, so a corrupted checkpoint is not hidden.
last_extraction_time = pd.to_datetime(last_extraction) if last_extraction else pd.NaT

if pd.isna(last_extraction_time):
    df_incremental = df.copy()
    print(f"No valid checkpoint found; pulled {len(df_incremental)} rows via full extraction.")
else:
    df_incremental = df[df['last_updated'] > last_extraction_time]
    print(f"Pulled {len(df_incremental)} new/updated rows since {last_extraction}.")

df_incremental.head()

Pulled 58 new/updated rows since 2022-07-02T21:47:00.


Unnamed: 0,user_id,age,signup_date,country,is_active,last_updated
1,2,41,2022-12-15,USA,False,2022-12-15 00:47:00
2,3,28,2022-09-28,Kenya,False,2022-09-28 08:15:00
5,6,52,2022-07-08,Kenya,False,2022-07-08 21:47:00
9,10,59,2022-08-03,Kenya,False,2022-08-03 02:13:00
10,11,56,2022-11-27,UK,False,2022-11-27 07:32:00


In [33]:
# Advance the checkpoint to the newest `last_updated` among the rows that
# were just extracted.
#
# The checkpoint comes from `df_incremental`, not the whole table, and is
# written only when rows were pulled. An empty batch would give a max of NaT;
# writing "NaT" to the file makes every later `>` filter return nothing.
# Skipping the write also stops the checkpoint from moving backwards.
if df_incremental.empty:
    # Nothing new: keep the checkpoint that was used for this run.
    new_checkpoint = last_extraction_time
    print(f"No new rows extracted; checkpoint left at {new_checkpoint}")
else:
    new_checkpoint = df_incremental['last_updated'].max()
    # Save it as an ISO-8601 string for the next incremental run.
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2022-12-30 21:14:00
