In [13]:
from pytrends.request import TrendReq
import pandas as pd
import time
import os

# Google Trends client shared by every request in this script.
pytrends = TrendReq(tz=0, hl="en-US")

# All CSV output goes here; create the folder up front so later writes succeed.
os.makedirs("data/google_trends", exist_ok=True)

# Keyword batches, keyed by the name used for each batch's CSV file
# (max 5 keywords each). "NVIDIA" is repeated in every batch so the
# batches share one common keyword.
batches = {
    "batch1_finance": ["NVIDIA", "NVIDIA stock", "NVDA stock", "AI", "GPU"],
    "batch2_company": ["NVIDIA", "AMD", "Intel", "TSMC", "OpenAI"],
}

# Query settings applied to every batch:
#   timeframe - start and end date of the requested window
#   geo       - empty string means global
#   gprop     - empty string means web search
timeframe, geo, gprop = "2024-01-01 2025-06-30", "", ""

# Raw result frame of each batch, filled in by the fetch loop (batch name -> DataFrame).
all_data = {}

# Step 1: Run all batches
# For every batch: query Google Trends, clean the result, keep it in
# `all_data` and write it to data/google_trends/<batch name>_trends.csv.
for name, kw_list in batches.items():
    print(f"\nRunning {name} with: {kw_list}")
    pytrends.build_payload(kw_list=kw_list, timeframe=timeframe, geo=geo, gprop=gprop)
    df = pytrends.interest_over_time()

    # An empty frame has no keyword columns at all; saving it would only
    # postpone the failure to the normalization step, so stop here with a
    # message that names the batch.
    if df.empty:
        raise RuntimeError(
            f"Google Trends returned no data for {name} "
            f"(keywords: {kw_list}, timeframe: {timeframe!r})"
        )

    # Remove incomplete (partial-period) rows, then the flag column itself.
    # The flag is compared as text so that both boolean and string
    # ("True"/"False") representations are handled.
    if "isPartial" in df.columns:
        partial_rows = df["isPartial"].astype(str).str.lower() == "true"
        df = df.loc[~partial_rows].drop(columns=["isPartial"])

    all_data[name] = df
    df.to_csv(f"data/google_trends/{name}_trends.csv")

    print(f"Saved {name}_trends.csv ({len(df)} rows, columns: {list(df.columns)})")
    time.sleep(2)  # Rate limit protection

# Step 2: Normalize across batches using NVIDIA as the anchor
# The anchor keyword is present in every batch but peaks at a different
# value in each one. Rescaling each batch by the ratio of anchor peaks puts
# all batches on the scale of the first batch.
# NOTE: the factor is derived from a single value (the anchor's peak) per batch.
anchor = "NVIDIA"

if not all_data:
    raise ValueError("No batches were fetched; nothing to normalize")

# Fail early, naming the offending batches, if the anchor column is absent.
missing_anchor = [name for name, df in all_data.items() if anchor not in df.columns]
if missing_anchor:
    raise KeyError(f"Anchor keyword {anchor!r} is missing from batch(es): {missing_anchor}")

# Find the peak value of NVIDIA in each batch
scales = {name: df[anchor].max() for name, df in all_data.items()}
print("\nAnchor scales:", scales)

# A zero or NaN peak would turn the scale factor into inf/NaN and silently
# corrupt every column of the batch, so reject it explicitly.
# (`not peak > 0` is also true for NaN.)
unusable = [name for name, peak in scales.items() if not peak > 0]
if unusable:
    raise ValueError(
        f"Anchor keyword {anchor!r} has no positive peak in batch(es): {unusable}"
    )

# Use the first batch as baseline
base_batch = next(iter(scales))
base_scale = scales[base_batch]

# Normalize all batches so NVIDIA aligns
normalized_data = {}
for name, df in all_data.items():
    # Multiplying by this factor makes the batch's anchor peak equal base_scale.
    factor = base_scale / scales[name]
    df_scaled = df * factor
    normalized_data[name] = df_scaled
    print(f"Normalized {name} (scale factor = {factor:.3f})")

# Step 3: Merge normalized batches on date
# Place the rescaled batches side by side, aligned on their shared index.
scaled_frames = list(normalized_data.values())
merged = pd.concat(scaled_frames, axis=1)

# Keywords that occur in several batches produce repeated column names;
# keep only the first occurrence of each name.
first_occurrence = ~merged.columns.duplicated()
merged = merged.loc[:, first_occurrence]

merged.to_csv("data/google_trends/all_batches_merged_normalized.csv")

print("\nMerged dataset saved as: data/google_trends/all_batches_merged_normalized.csv")
# Show the last five rows as a quick sanity check.
print(merged.tail(5))



Running batch1_finance with: ['NVIDIA', 'NVIDIA stock', 'NVDA stock', 'AI', 'GPU']
Saved batch1_finance_trends.csv (79 rows, columns: ['NVIDIA', 'NVIDIA stock', 'NVDA stock', 'AI', 'GPU'])

Running batch2_company with: ['NVIDIA', 'AMD', 'Intel', 'TSMC', 'OpenAI']
Saved batch2_company_trends.csv (79 rows, columns: ['NVIDIA', 'AMD', 'Intel', 'TSMC', 'OpenAI'])

Anchor scales: {'batch1_finance': np.int64(24), 'batch2_company': np.int64(100)}
Normalized batch1_finance (scale factor = 1.000)
Normalized batch2_company (scale factor = 0.240)

Merged dataset saved as: data/google_trends/all_batches_merged_normalized.csv
            NVIDIA  NVIDIA stock  NVDA stock     AI  GPU   AMD  Intel  TSMC  \
date                                                                          
2025-06-01     7.0           2.0         1.0   89.0  3.0  6.24   5.04  0.24   
2025-06-08     7.0           1.0         1.0   93.0  3.0  6.24   4.80  0.24   
2025-06-15     6.0           1.0         1.0   91.0  3.0  6.48 