In [7]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta

# Initialize Faker
fake = Faker()

# Seed both random sources so that re-running this cell (e.g. after
# Restart Kernel -> Run All) regenerates exactly the same dataset.
# `random` drives dates, quantities, prices and the product category;
# Faker drives the product adjective and the customer name.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Generate 100 synthetic transaction records.
# Each record gets a sequential id, a date 0-90 days after start_date,
# a product name, a quantity in [1, 10] and a price in [100, 2000].
data = []
start_date = datetime(2025, 1, 1)
for i in range(100):
    record = {
        'transaction_id': i + 1,
        'date': start_date + timedelta(days=random.randint(0, 90)),
        'product': fake.word().capitalize() + " " + random.choice(['Laptop', 'Phone', 'Tablet']),
        'quantity': random.randint(1, 10),
        'price': round(random.uniform(100, 2000), 2),
        'customer_name': fake.name()
    }
    data.append(record)

# Create DataFrame from the list of record dicts and save to CSV
# (index=False keeps the positional index out of the file).
df = pd.DataFrame(data)
df.to_csv('custom_data.csv', index=False)
print("Dataset generated and saved as custom_data.csv")

Dataset generated and saved as custom_data.csv


In [10]:
# Rename the 'date' column to 'last_updated'.
# rename() returns a new frame and ignores missing labels, so re-running
# this cell after the column is already renamed is a harmless no-op.
df = df.rename(columns={'date': 'last_updated'})

# Save the updated dataset back to the CSV file.
# The path is spelled out here because no `dataset_path` variable is defined
# anywhere in this notebook; 'custom_data.csv' is the file the dataset was
# originally written to and the one the extraction cells read from.
df.to_csv('custom_data.csv', index=False)
print(df.columns)

Index(['transaction_id', 'last_updated', 'product', 'quantity', 'price',
       'customer_name'],
      dtype='object')


In [11]:
# FULL EXTRACTION
# Load every row of the CSV in a single read, parsing the
# 'last_updated' column into datetimes as it is loaded.
df_full = pd.read_csv(
    "custom_data.csv",
    parse_dates=["last_updated"],
)
print(f"Pulled {len(df_full)} rows via full extraction.")

# Preview the first rows as the cell's displayed result.
df_full.head()

Pulled 100 rows via full extraction.


Unnamed: 0,transaction_id,last_updated,product,quantity,price,customer_name
0,1,2025-01-08,Population Laptop,3,892.87,Leah Mcfarland
1,2,2025-03-13,Face Phone,3,1374.01,Christopher Frazier
2,3,2025-02-23,Whatever Phone,3,169.83,David Jackson Jr.
3,4,2025-03-31,Blue Laptop,3,1567.63,Lisa Massey
4,5,2025-03-01,Card Tablet,1,216.97,Jeremiah Poole


In [20]:
# Create (or overwrite) the checkpoint file with an initial "last extraction"
# timestamp that falls inside the generated date range, so the incremental
# extraction below has a cut-off to compare against.
with open("last_extraction.txt", "w") as checkpoint_file:
    checkpoint_file.write("2025-02-28 10:03:00")

In [21]:
# INCREMENTAL EXTRACTION
# Read the stored checkpoint (timestamp of the previous extraction) as text.
with open("last_extraction.txt", "r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()

# Reload the dataset with 'last_updated' parsed as datetimes, then convert
# the checkpoint text to a timestamp so the two can be compared.
df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])
last_extraction_time = pd.to_datetime(last_extraction)

# Keep only rows stamped strictly after the checkpoint.
is_newer = df['last_updated'] > last_extraction_time
df_incremental = df[is_newer]
print(f"Pulled {len(df_incremental)} new/updated rows since {last_extraction}.")

# Preview the first rows as the cell's displayed result.
df_incremental.head()

Pulled 34 new/updated rows since 2025-02-28 10:03:00.


Unnamed: 0,transaction_id,last_updated,product,quantity,price,customer_name
1,2,2025-03-13,Face Phone,3,1374.01,Christopher Frazier
3,4,2025-03-31,Blue Laptop,3,1567.63,Lisa Massey
4,5,2025-03-01,Card Tablet,1,216.97,Jeremiah Poole
8,9,2025-03-27,Five Phone,5,1502.16,Mary Williams
13,14,2025-03-31,Protect Tablet,10,1720.48,Natalie Webster


In [22]:
# Get the most recent update timestamp present in the loaded data.
new_checkpoint = df['last_updated'].max()

if pd.isna(new_checkpoint):
    # max() yields a missing value (NaT/NaN) when there are no non-null
    # timestamps. Persisting that would replace the checkpoint with a
    # non-timestamp value (or fail on .isoformat()), so the existing
    # checkpoint file is left untouched instead.
    print("No timestamps found; last_extraction.txt left unchanged")
else:
    # Save it in ISO 8601 form; pd.to_datetime parses this back on the next run.
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-04-01 00:00:00
