In [14]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Generate 60 records of simulated product reviews and persist them to
# custom_data.csv, the source file read by the extraction cells below.

# Seed the generator so that Restart & Run All reproduces the same CSV (and
# therefore the same row counts downstream) on every run.
random.seed(42)

products = ['Laptop', 'Phone', 'Tablet', 'Monitor', 'Keyboard']
data = []
start_date = datetime(2025, 4, 1)

for i in range(60):
    # One review per calendar day, starting at start_date.
    date = start_date + timedelta(days=i)
    data.append({
        'review_id': 1000 + i,
        'product': random.choice(products),
        'rating': random.randint(1, 5),
        'review_date': date.date().isoformat(),
        # last_updated falls on the review day at a random hour and minute.
        'last_updated': (date + timedelta(hours=random.randint(0, 23), minutes=random.randint(0, 59))).isoformat()
    })

df = pd.DataFrame(data)
df.to_csv('custom_data.csv', index=False)
df.head()

Unnamed: 0,review_id,product,rating,review_date,last_updated
0,1000,Keyboard,2,2025-04-01,2025-04-01T11:18:00
1,1001,Laptop,2,2025-04-02,2025-04-02T20:06:00
2,1002,Laptop,1,2025-04-03,2025-04-03T09:22:00
3,1003,Keyboard,2,2025-04-04,2025-04-04T06:32:00
4,1004,Laptop,3,2025-04-05,2025-04-05T08:23:00


# Section 1: Full Extraction

In [15]:


# Full extraction: read every row of the CSV, parsing `last_updated` as a
# datetime column.
df_full = pd.read_csv(
    "custom_data.csv",
    parse_dates=["last_updated"],
)

# Report how much was loaded, then preview the first rows.
print(f"Extracted {len(df_full)} rows fully.")
print(f"Columns: {list(df_full.columns)}")
df_full.head()


Extracted 60 rows fully.
Columns: ['review_id', 'product', 'rating', 'review_date', 'last_updated']


Unnamed: 0,review_id,product,rating,review_date,last_updated
0,1000,Keyboard,2,2025-04-01,2025-04-01 11:18:00
1,1001,Laptop,2,2025-04-02,2025-04-02 20:06:00
2,1002,Laptop,1,2025-04-03,2025-04-03 09:22:00
3,1003,Keyboard,2,2025-04-04,2025-04-04 06:32:00
4,1004,Laptop,3,2025-04-05,2025-04-05 08:23:00


# Section 2: Incremental Extraction


In [16]:
# Stand in for a previous run by writing a fixed checkpoint timestamp.
with open("last_extraction.txt", "w") as checkpoint_out:
    checkpoint_out.write("2025-04-20 12:00:00")

# Read the checkpoint back, trimming any surrounding whitespace.
with open("last_extraction.txt", "r") as checkpoint_in:
    last_time = checkpoint_in.read().strip()

# Reload the source data with `last_updated` parsed as datetimes, then keep
# only the rows modified strictly after the checkpoint.
df = pd.read_csv(
    "custom_data.csv",
    parse_dates=["last_updated"],
)
last_extraction_time = pd.to_datetime(last_time)
is_newer = df["last_updated"] > last_extraction_time
df_incremental = df[is_newer]

print(f"Extracted {len(df_incremental)} rows incrementally since {last_time}.")
df_incremental.head()


Extracted 40 rows incrementally since 2025-04-20 12:00:00.


Unnamed: 0,review_id,product,rating,review_date,last_updated
20,1020,Phone,5,2025-04-21,2025-04-21 05:46:00
21,1021,Laptop,1,2025-04-22,2025-04-22 20:36:00
22,1022,Keyboard,1,2025-04-23,2025-04-23 07:03:00
23,1023,Keyboard,2,2025-04-24,2025-04-24 02:05:00
24,1024,Tablet,2,2025-04-25,2025-04-25 17:12:00


# Section 3: Update Last Extraction Time

In [17]:
# Get the latest timestamp present in the data; it becomes the next checkpoint.
new_checkpoint = df['last_updated'].max()

if pd.isna(new_checkpoint):
    # An empty frame, or one whose timestamps are all missing, yields NaT.
    # Writing "NaT" to the file would make every later
    # `last_updated > checkpoint` comparison False, so the existing
    # checkpoint is left as it is.
    print("No valid last_updated values found; last_extraction.txt left unchanged")
else:
    # Save it back
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())

    print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-30 10:19:00
