In [1]:
# Install the Google Trends client into the environment of the running kernel.
# Pinned to the version this notebook was executed with so re-runs are reproducible.
%pip install pytrends==4.9.2

Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from pytrends.request import TrendReq
import logging

# Root-logger setup: INFO and above, each record prefixed with a timestamp and its level.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

def get_investor_sentiment_index(
    keywords: list[str],
    timeframe: str = '2021-01-01 2024-12-31',
    geo: str = 'IN',
    cat: int = 7,  # Category for Finance (investing)
    hl: str = 'en-IN',
    tz: int = 330, # IST timezone offset (in minutes)
    output_filename: str = "monthly_investor_sentiment_index_india_2021_2024.csv"
) -> pd.DataFrame | None:
    """
    Fetches Google Trends data for given keywords, calculates a monthly investor
    sentiment index, and saves it to a CSV file.

    Args:
        keywords (list[str]): A list of keywords to search on Google Trends.
        timeframe (str): The time range for the data (e.g., '2021-01-01 2024-12-31').
        geo (str): The geographical region (e.g., 'IN' for India).
        cat (int): The category ID for Google Trends. Defaults to 7 (Finance).
        hl (str): Host language code for results (e.g., 'en-IN').
        tz (int): Timezone offset in minutes (e.g., 330 for IST).
        output_filename (str): The name of the CSV file to save the results.

    Returns:
        pd.DataFrame | None: A DataFrame containing the monthly investor sentiment index,
                             or None if an error occurs or no data is found.
    """
    logging.info(f"Attempting to connect to Google Trends with hl='{hl}', tz={tz}...")
    try:
        # Initialize TrendReq object to connect to Google Trends
        pytrends = TrendReq(hl=hl, tz=tz)
    except Exception as e:
        logging.error(f"Failed to connect to Google Trends. Please check your internet connection or `pytrends` setup: {e}")
        return None

    all_keyword_data = []

    logging.info(f"Fetching Google Trends data for keywords: {keywords} within timeframe '{timeframe}' in region '{geo}'...")
    for kw in keywords:
        try:
            # Build payload for the current keyword, category, timeframe, and geography
            pytrends.build_payload([kw], cat=cat, timeframe=timeframe, geo=geo)
            # Request interest over time data from Google Trends
            data = pytrends.interest_over_time()

            if not data.empty:
                # Drop the 'isPartial' column, which indicates partial data for the last period
                data = data.drop(columns='isPartial')
                # Rename the keyword column for consistency (e.g., 'mutual fund' becomes 'mutual_fund')
                data = data.rename(columns={kw: kw.replace(' ', '_')})
                all_keyword_data.append(data)
                logging.info(f"Successfully fetched data for keyword: '{kw}'.")
            else:
                logging.warning(f"No data found for keyword: '{kw}'. Skipping this keyword.")

        except Exception as e:
            logging.error(f"Error fetching data for keyword '{kw}'. This might be due to API limits or network issues: {e}")
            continue # Continue to the next keyword even if one fails

    if not all_keyword_data:
        logging.warning("No data was retrieved for any of the specified keywords. Exiting.")
        return None

    # Concatenate all individual keyword DataFrames into a single DataFrame.
    # 'outer' join ensures all dates are kept, even if some keywords have missing data for certain days.
    # The 'index' of each DataFrame is assumed to be datetime.
    try:
        final_df = pd.concat(all_keyword_data, axis=1, join='outer')
        # Ensure the DataFrame's index is a DatetimeIndex for resampling
        if not isinstance(final_df.index, pd.DatetimeIndex):
            final_df.index = pd.to_datetime(final_df.index)
        logging.info("All keyword data successfully consolidated.")
    except Exception as e:
        logging.error(f"Error concatenating keyword data. Ensure data consistency: {e}")
        return None

    # Resample the data to monthly frequency and compute the mean interest for each month.
    # This automatically handles NaNs by excluding them from the mean calculation.
    monthly_df = final_df.resample('M').mean().reset_index()
    # Rename the new index column (created by reset_index()) to 'date' for clarity
    monthly_df.rename(columns={'index': 'date'}, inplace=True)
    logging.info("Data aggregated to monthly averages.")

    # Compute the average sentiment index across all keyword columns.
    # Exclude the 'date' column when calculating the mean across rows.
    keyword_columns = [col for col in monthly_df.columns if col != 'date']
    if not keyword_columns:
        logging.warning("No valid keyword columns found to calculate the Investor Sentiment Index.")
        # Return the monthly_df even if sentiment index cannot be calculated, it might still be useful
        return monthly_df

    monthly_df['Investor_Sentiment_Index'] = monthly_df[keyword_columns].mean(axis=1)
    logging.info("Investor Sentiment Index successfully calculated.")

    # Save the final DataFrame to a CSV file
    try:
        monthly_df.to_csv(output_filename, index=False)
        logging.info(f"✅ Final monthly investor sentiment index saved to: '{output_filename}'")
    except Exception as e:
        logging.error(f"Error saving data to '{output_filename}'. Check file permissions or path: {e}")
        return None

    return monthly_df

# Driver: executes when this code runs as the top-level module
# (which is also the case for a notebook cell), not when it is imported.
if __name__ == "__main__":
    # Output location and query window
    output_csv_filename = "monthly_investor_sentiment_index_india_2021_2024.csv"
    start_date, end_date = '2021-01-01', '2024-12-31'

    # Search terms whose interest is averaged into the index
    search_keywords = [
        'mutual fund',
        'SIP investment',
        'Groww',
        'Nifty 50',
        'PPF',
        'best mutual fund',
    ]

    # Fetch, aggregate and save; None signals that nothing usable was produced
    investor_sentiment_df = get_investor_sentiment_index(
        keywords=search_keywords,
        timeframe=f'{start_date} {end_date}',
        output_filename=output_csv_filename,
    )

    if investor_sentiment_df is None:
        print("\n--- Data generation failed or no data was retrieved. Please check the logs above for details. ---")
    else:
        print("\n--- Sample of the generated Investor Sentiment Index ---")
        print(investor_sentiment_df.head())
        print(f"\nSuccessfully generated and saved the data to {output_csv_filename}")


2025-06-11 16:31:05,369 - INFO - Attempting to connect to Google Trends with hl='en-IN', tz=330...
2025-06-11 16:31:06,104 - INFO - Fetching Google Trends data for keywords: ['mutual fund', 'SIP investment', 'Groww', 'Nifty 50', 'PPF', 'best mutual fund'] within timeframe '2021-01-01 2024-12-31' in region 'IN'...
2025-06-11 16:31:11,943 - ERROR - Error fetching data for keyword 'mutual fund'. This might be due to API limits or network issues: The request failed: Google returned a response with code 429
2025-06-11 16:31:17,472 - ERROR - Error fetching data for keyword 'SIP investment'. This might be due to API limits or network issues: The request failed: Google returned a response with code 429
2025-06-11 16:31:23,524 - ERROR - Error fetching data for keyword 'Groww'. This might be due to API limits or network issues: The request failed: Google returned a response with code 429
2025-06-11 16:31:29,227 - ERROR - Error fetching data for keyword 'Nifty 50'. This might be due to API limits


--- Data generation failed or no data was retrieved. Please check the logs above for details. ---


In [3]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# pytrends is pinned to the version this notebook was executed with.
%pip install pandas
%pip install pytrends==4.9.2



In [8]:
import pandas as pd
from pytrends.request import TrendReq
import logging
import time # Import the time module
import random # Import random for jitter

# Root-logger setup: INFO and above, each record prefixed with a timestamp and its level.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

def get_investor_sentiment_index(
    keywords: list[str],
    timeframe: str = '2021-01-01 2024-12-31',
    geo: str = 'IN',
    cat: int = 7,  # Category for Finance (investing)
    hl: str = 'en-IN',
    tz: int = 330, # IST timezone offset (in minutes)
    output_filename: str = "monthly_investor_sentiment_index_india_2021_2024.csv",
    request_delay: int = 30, # Increased initial delay between *different* keywords
    max_retries: int = 5, # Max retry attempts for a single keyword
    initial_retry_delay: int = 10, # Initial delay for a retry attempt (will be exponential)
    pytrends_timeout: int = 60 # New: Timeout for pytrends request itself
) -> pd.DataFrame | None:
    """
    Fetches Google Trends data for given keywords, calculates a monthly investor
    sentiment index, and saves it to a CSV file. Implements exponential backoff
    with retries for robust API interaction.

    Args:
        keywords (list[str]): A list of keywords to search on Google Trends.
        timeframe (str): The time range for the data (e.g., '2021-01-01 2024-12-31').
        geo (str): The geographical region (e.g., 'IN' for India).
        cat (int): The category ID for Google Trends. Defaults to 7 (Finance).
        hl (str): Host language code for results (e.g., 'en-IN').
        tz (int): Timezone offset in minutes (e.g., 330 for IST).
        output_filename (str): The name of the CSV file to save the results.
        request_delay (int): The number of seconds to wait between processing
                             different keywords (initial delay).
        max_retries (int): The maximum number of retry attempts for each keyword
                           if an API error occurs.
        initial_retry_delay (int): The starting delay in seconds for the first retry
                                   attempt. This delay will increase exponentially.
        pytrends_timeout (int): The timeout in seconds for the pytrends HTTP request.

    Returns:
        pd.DataFrame | None: A DataFrame containing the monthly investor sentiment index,
                             or None if an error occurs or no data is found.
    """
    logging.info(f"Attempting to connect to Google Trends with hl='{hl}', tz={tz}...")
    try:
        # Initialize TrendReq object with a higher timeout
        pytrends = TrendReq(hl=hl, tz=tz, timeout=(10, pytrends_timeout)) # connect_timeout, read_timeout
    except Exception as e:
        logging.error(f"Failed to connect to Google Trends. Please check your internet connection or `pytrends` setup: {e}")
        return None

    all_keyword_data = []

    logging.info(f"Fetching Google Trends data for keywords: {keywords} within timeframe '{timeframe}' in region '{geo}'...")
    for i, kw in enumerate(keywords):
        if i > 0: # Apply initial delay between different keywords (not for retries)
            logging.info(f"Waiting for {request_delay} seconds before fetching next *new* keyword: '{kw}'...")
            time.sleep(request_delay)

        retries = 0
        successful_fetch = False
        # current_retry_delay is dynamically calculated inside the loop

        while retries <= max_retries and not successful_fetch:
            try:
                # Build payload for the current keyword, category, timeframe, and geography
                pytrends.build_payload([kw], cat=cat, timeframe=timeframe, geo=geo)
                # Request interest over time data from Google Trends
                data = pytrends.interest_over_time()

                if not data.empty:
                    # Drop the 'isPartial' column
                    data = data.drop(columns='isPartial')
                    # Rename the keyword column
                    data = data.rename(columns={kw: kw.replace(' ', '_')})
                    all_keyword_data.append(data)
                    logging.info(f"Successfully fetched data for keyword: '{kw}'.")
                    successful_fetch = True
                else:
                    logging.warning(f"No data found for keyword: '{kw}' after {retries} retries. Skipping.")
                    successful_fetch = True # Treat as successful to move to next keyword, as data might truly be empty

            except Exception as e:
                retries += 1
                if retries <= max_retries:
                    # Add some jitter to the exponential backoff to avoid synchronized retries
                    jitter = random.uniform(0.8, 1.2) # Adjusted jitter range for slightly less variability
                    sleep_time = initial_retry_delay * (2 ** (retries - 1)) * jitter
                    logging.warning(f"Error fetching data for keyword '{kw}' (attempt {retries}/{max_retries}). Retrying in {sleep_time:.2f} seconds. Error: {e}")
                    time.sleep(sleep_time)
                else:
                    logging.error(f"Failed to fetch data for keyword '{kw}' after {max_retries} retries. Giving up. Last error: {e}")
            
        if not successful_fetch: # If after all retries, it still wasn't successful
            logging.error(f"Keyword '{kw}' could not be fetched after all retries.")


    if not all_keyword_data:
        logging.warning("No data was retrieved for any of the specified keywords after all attempts. Exiting.")
        return None

    # Concatenate all individual keyword DataFrames into a single DataFrame.
    try:
        final_df = pd.concat(all_keyword_data, axis=1, join='outer')
        if not isinstance(final_df.index, pd.DatetimeIndex):
            final_df.index = pd.to_datetime(final_df.index)
        logging.info("All keyword data successfully consolidated.")
    except Exception as e:
        logging.error(f"Error concatenating keyword data. Ensure data consistency: {e}")
        return None

    # Resample to monthly frequency and compute mean, using 'ME' for end of month
    monthly_df = final_df.resample('ME').mean().reset_index() # Changed 'M' to 'ME'
    monthly_df.rename(columns={'index': 'date'}, inplace=True)
    logging.info("Data aggregated to monthly averages.")

    # Compute the average sentiment index
    keyword_columns = [col for col in monthly_df.columns if col != 'date']
    if not keyword_columns:
        logging.warning("No valid keyword columns found to calculate the Investor Sentiment Index.")
        return monthly_df

    monthly_df['Investor_Sentiment_Index'] = monthly_df[keyword_columns].mean(axis=1)
    logging.info("Investor Sentiment Index successfully calculated.")

    # Save to CSV
    try:
        monthly_df.to_csv(output_filename, index=False)
        logging.info(f"✅ Final monthly investor sentiment index saved to: '{output_filename}'")
    except Exception as e:
        logging.error(f"Error saving data to '{output_filename}'. Check file permissions or path: {e}")
        return None

    return monthly_df

# Driver: executes when this code runs as the top-level module
# (which is also the case for a notebook cell), not when it is imported.
if __name__ == "__main__":
    # Output location and query window
    output_csv_filename = "monthly_investor_sentiment_index_india_2021_2024.csv"
    start_date, end_date = '2021-01-01', '2024-12-31'

    # Search terms whose interest is averaged into the index
    search_keywords = [
        'mutual fund',
        'SIP investment',
        'Groww',
        'Nifty 50',
        'PPF',
        'best mutual fund',
    ]

    # Throttling / retry settings are spelled out even though they equal the defaults,
    # so the values used for this run are visible at the call site.
    investor_sentiment_df = get_investor_sentiment_index(
        keywords=search_keywords,
        timeframe=f'{start_date} {end_date}',
        output_filename=output_csv_filename,
        request_delay=30,        # pause (seconds) between distinct keywords
        max_retries=5,           # retries per keyword
        initial_retry_delay=10,  # seconds before the first retry; doubles afterwards
        pytrends_timeout=60,     # read timeout (seconds) of the HTTP request
    )

    if investor_sentiment_df is None:
        print("\n--- Data generation failed or no data was retrieved. Please check the logs above for details. ---")
    else:
        print("\n--- Sample of the generated Investor Sentiment Index ---")
        print(investor_sentiment_df.head())
        print(f"\nSuccessfully generated and saved the data to {output_csv_filename}")


2025-06-11 16:48:15,144 - INFO - Attempting to connect to Google Trends with hl='en-IN', tz=330...
2025-06-11 16:48:15,922 - INFO - Fetching Google Trends data for keywords: ['mutual fund', 'SIP investment', 'Groww', 'Nifty 50', 'PPF', 'best mutual fund'] within timeframe '2021-01-01 2024-12-31' in region 'IN'...
2025-06-11 16:48:17,534 - INFO - Successfully fetched data for keyword: 'mutual fund'.
2025-06-11 16:48:17,535 - INFO - Waiting for 30 seconds before fetching next *new* keyword: 'SIP investment'...
2025-06-11 16:48:49,068 - INFO - Successfully fetched data for keyword: 'SIP investment'.
2025-06-11 16:48:49,068 - INFO - Waiting for 30 seconds before fetching next *new* keyword: 'Groww'...
2025-06-11 16:54:50,056 - INFO - Successfully fetched data for keyword: 'Groww'.
2025-06-11 16:54:50,056 - INFO - Waiting for 30 seconds before fetching next *new* keyword: 'Nifty 50'...
2025-06-11 17:01:27,246 - INFO - Successfully fetched data for keyword: 'Nifty 50'.
2025-06-11 17:01:27,24


--- Sample of the generated Investor Sentiment Index ---
        date  mutual_fund  SIP_investment  Groww  Nifty_50    PPF  \
0 2020-12-31        62.00           35.00  17.00     15.00  49.00   
1 2021-01-31        61.20           43.20  20.00     19.00  44.40   
2 2021-02-28        56.50           35.75  21.00     20.75  37.50   
3 2021-03-31        48.50           37.50  18.25     20.25  53.25   
4 2021-04-30        45.75           42.25  22.75     23.50  33.75   

   best_mutual_fund  Investor_Sentiment_Index  
0             58.00                 39.333333  
1             56.60                 40.733333  
2             54.00                 37.583333  
3             49.25                 37.833333  
4             49.75                 36.291667  

Successfully generated and saved the data to monthly_investor_sentiment_index_india_2021_2024.csv
