In [11]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Configuration for the synthetic sales dataset.
num_records = 50
customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']
start_date = datetime(2025, 4, 1)

# A private, seeded generator makes "Restart Kernel -> Run All" reproduce the
# exact same rows, and leaves the global `random` module state untouched.
SEED = 42
rng = random.Random(SEED)

# Draw the ids without replacement so `id` is unique per row. Independent
# randint() draws could hand the same id to two different records.
# NOTE: only 9000 four-digit ids exist, so sample() raises ValueError if
# num_records is ever set above that.
record_ids = rng.sample(range(1000, 10000), num_records)

# Generate synthetic sales data: one dict per record, turned into a DataFrame
# in a single step below.
data = []
for record_id in record_ids:
    # Sale day falls within the 60 days starting at start_date.
    date = start_date + timedelta(days=rng.randint(0, 59))
    # last_updated is a random time of day on that same sale day.
    last_updated = date + timedelta(
        hours=rng.randint(0, 23),
        minutes=rng.randint(0, 59),
    )
    data.append({
        'id': record_id,
        'customer': rng.choice(customers),
        'date': date.date().isoformat(),
        'amount': rng.randint(100, 2000),
        'last_updated': last_updated.isoformat(),
    })

# Create DataFrame and save to CSV (index omitted so the file holds only the
# five data columns).
df = pd.DataFrame(data)
df.to_csv('custom_data.csv', index=False)

# Preview the data
df.head()



Unnamed: 0,id,customer,date,amount,last_updated
0,6884,Target,2025-04-22,161,2025-04-22T23:22:00
1,9633,Walmart,2025-05-04,613,2025-05-04T15:52:00
2,3387,eBay,2025-04-03,971,2025-04-03T03:40:00
3,7734,eBay,2025-04-12,1228,2025-04-12T13:48:00
4,6888,Target,2025-05-03,1919,2025-05-03T05:52:00


In [12]:
# SECTION 2: Full Extraction
# Load every row of the CSV in one pass. Only `last_updated` is listed in
# parse_dates, so it is the only column converted to datetimes on read.
source_csv = "custom_data.csv"
df_full = pd.read_csv(
    source_csv,
    parse_dates=["last_updated"],
)
print(f"Extracted {len(df_full)} rows fully.")

# Show the first rows as the cell's output.
df_full.head()

Extracted 50 rows fully.


Unnamed: 0,id,customer,date,amount,last_updated
0,6884,Target,2025-04-22,161,2025-04-22 23:22:00
1,9633,Walmart,2025-05-04,613,2025-05-04 15:52:00
2,3387,eBay,2025-04-03,971,2025-04-03 03:40:00
3,7734,eBay,2025-04-12,1228,2025-04-12 13:48:00
4,6888,Target,2025-05-03,1919,2025-05-03 05:52:00


In [13]:
# SECTION 3: Simulate Initial Last Extraction Time
# Write a fixed timestamp to the checkpoint file, replacing whatever it held.
# Opening with mode "w" truncates the file first.
checkpoint_file = open("last_extraction.txt", "w")
try:
    checkpoint_file.write("2025-04-15 12:00:00")
finally:
    # Always close so the text is flushed to disk before later cells read it.
    checkpoint_file.close()


In [14]:
# SECTION 4: Incremental Extraction
# Read the stored checkpoint text (surrounding whitespace stripped) and turn it
# into a timestamp that can be compared against the `last_updated` column.
with open("last_extraction.txt", "r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()
last_extraction_time = pd.to_datetime(last_extraction)

# Reload the source with `last_updated` parsed as datetimes, then keep only
# the rows strictly newer than the checkpoint.
df = pd.read_csv(
    "custom_data.csv",
    parse_dates=["last_updated"],
)
updated_after_checkpoint = df['last_updated'] > last_extraction_time
df_incremental = df[updated_after_checkpoint]

print(f"Extracted {len(df_incremental)} rows incrementally since {last_extraction}.")
df_incremental.head()


Extracted 42 rows incrementally since 2025-04-15 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
0,6884,Target,2025-04-22,161,2025-04-22 23:22:00
1,9633,Walmart,2025-05-04,613,2025-05-04 15:52:00
4,6888,Target,2025-05-03,1919,2025-05-03 05:52:00
5,5094,Walmart,2025-05-27,1728,2025-05-27 14:03:00
6,4754,Amazon,2025-05-07,787,2025-05-07 01:09:00


In [15]:
# SECTION 5: Save New Timestamp
# The newest `last_updated` value in the loaded data becomes the next
# checkpoint.
new_checkpoint = df['last_updated'].max()

if pd.isna(new_checkpoint):
    # .max() is NaT when the frame has no rows (or no valid timestamps).
    # NaT.isoformat() yields the text "NaT"; storing that would corrupt the
    # checkpoint, because every later `last_updated > NaT` comparison is False
    # and no rows would ever be extracted again. Keep the old checkpoint.
    print("No valid last_updated values found; last_extraction.txt left unchanged.")
else:
    # Persist in ISO 8601 form, which pd.to_datetime parses back on read.
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())

    print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-05-30 16:28:00
