In [1]:
import os 

# data and compute libraries
import pandas as pd

# graphing libraries
import matplotlib.pyplot as plt

# other visuals
import squarify

# libraries for printing markdown tables

# mapping libraries

In [2]:
# # import function definitions from other notebook
# import ipynb

# # full import (full notebook import)
# from ipynb.fs.full.another_notebook import your_function

# # function definition import
# from ipynb.fs.defs.another_notebook import your_function

## Google Trends Data Collation

In [26]:
# fetch each of the trends data from the multiLine.csv files in each folder of keywords in the trends_data folder

# function to get each of the multiLine.csv in each folder in trends_data
def get_datafile_paths(file_name):
    """
    Gets the paths of the multiLine.csv files in each folder of keywords in the trends_data folder.
    """
    # get the list of all folders in the trends_data folder
    folders = os.listdir('trends_data')
    
    # create a list to store the paths of the multiLine.csv files
    datafile_paths = []
    
    # iterate through each folder
    for folder in folders:
        # get the path of the multiLine.csv file in the folder
        datafile_path = os.path.join('trends_data', folder, f'{file_name}.csv')
        
        # append the path to the list
        datafile_paths.append(datafile_path)
    
    # return the list of paths
    return datafile_paths


# function to fetch trends data
def fetch_trends_data(file_path):
    """
    Fetches the trends data from the multiLine.csv files in each folder of keywords in the trends_data folder.
    """
    # read the csv file into a pandas dataframe
    df = pd.read_csv(file_path, skiprows=2)
    
    # return the dataframe
    return df


# function to merge n dataframes from the list of dataframes
def merge_dataframes(dataframes, csv_save_name, merge_column):
    """
    Merges n dataframes from the list of dataframes.
    """
    
    # merge the dataframes on the Month column
    merged_df = dataframes[0]
    # print(merged_df)
    
    for df in dataframes[1:]:
        
        merged_df = pd.merge(merged_df, df, on=f'{merge_column}')

        # rename the columns to be just the keyword by removing the colon and "(Nigeria)"
        merged_df.columns = [col.split(':')[0].strip() if ':' in col else col for col in merged_df.columns]

    # print the merged dataframe
    display(merged_df)

    # data cleaning 
    # if  '<1' convert to 0.5
    merged_df.replace('<1', 0.5, inplace=True)
    # if  'NaN' convert to 0
    merged_df.replace('NaN', 0, inplace=True)
    # if  'NaN' convert to 0
    merged_df.replace('nan', 0, inplace=True)

    # save merged dataframe to csv
    merged_df.to_csv(f'{csv_save_name}.csv', index=False)
    
    # return the merged dataframe
    return merged_df


# function to create combined dataframes and save to csv based on case
def create_combined_dataframes(file_name, csv_save_name, merge_column):
    """
    Creates combined dataframes and saves to csv based on case.
    """
    # get the paths of the multiLine.csv files in each folder of keywords in the trends_data folder
    datafile_paths = get_datafile_paths(file_name)
    # create a list to store the dataframes
    dataframes = []
    # iterate through each path
    for path in datafile_paths:
        # fetch the trends data from the csv file
        df = fetch_trends_data(path)
        # append the dataframe to the list
        dataframes.append(df)

    # merge the dataframes
    merged_df = merge_dataframes(dataframes, csv_save_name, merge_column)

In [27]:
create_combined_dataframes('multiTimeline', 'combined_trends_data_2011_2024', 'Month')

Unnamed: 0,Month,Leisure,Liquid,Rent,Price,Kerosene heater,Public health,Mixture,Gas cylinder,Filling station,...,Cost of living,Samsung,cheap,Health insurance,Tuition payments,Shoes,Budget,Mental health,Soup,Emergency telephone number
0,2011-01,0,51,99,37,0,100,32,0,22,...,0,51,99,23,68,86,61,37,19,0
1,2011-02,43,63,90,35,0,76,29,0,17,...,92,51,95,19,55,86,70,36,17,0
2,2011-03,24,56,96,35,0,81,36,0,0,...,46,52,97,32,54,87,63,31,18,0
3,2011-04,23,50,91,36,0,86,30,0,22,...,55,54,98,25,47,88,48,20,19,0
4,2011-05,34,62,96,36,0,80,31,0,26,...,60,55,92,25,48,97,57,30,23,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,2024-08,23,62,83,89,0,49,48,72,100,...,52,97,53,9,89,57,21,53,67,52
164,2024-09,29,67,80,100,0,47,53,72,84,...,53,94,46,8,88,55,20,53,63,48
165,2024-10,30,66,77,89,0,48,74,73,75,...,57,91,45,9,93,57,19,78,63,57
166,2024-11,29,72,75,96,0,50,72,69,68,...,52,95,46,9,90,65,24,59,70,51


In [28]:
create_combined_dataframes('multiTimeline (1)', 'combined_trends_data_2011_2025', 'Month')

Unnamed: 0,Month,Leisure,Liquid,Rent,Price,Kerosene heater,Public health,Mixture,Gas cylinder,Filling station,...,Cost of living,Samsung,cheap,Health insurance,Tuition payments,Shoes,Budget,Mental health,Soup,Emergency telephone number
0,2011-01,0,51,99,37,0,100,32,0,22,...,0,49,99,23,61,86,63,37,19,0
1,2011-02,43,63,90,35,0,76,29,0,17,...,92,49,95,19,49,86,71,36,17,0
2,2011-03,24,56,96,35,0,81,36,0,0,...,46,50,97,32,48,87,66,31,18,0
3,2011-04,23,50,91,36,0,86,30,0,22,...,55,52,98,25,43,88,52,20,19,0
4,2011-05,34,62,96,36,0,80,31,0,26,...,60,53,92,25,44,97,59,30,23,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,2024-11,29,72,75,96,0,50,72,61,68,...,52,91,46,9,81,65,26,59,70,51
167,2024-12,15,58,75,93,0,41,49,56,72,...,44,96,48,8,67,71,29,48,56,50
168,2025-01,31,74,84,91,36,52,71,100,76,...,54,100,49,9,100,58,28,57,58,62
169,2025-02,27,76,78,93,37,56,71,69,81,...,58,89,56,8,84,60,28,59,59,63


In [30]:
create_combined_dataframes('geoMap', 'combined_trends_data_geo', 'Region')

Unnamed: 0,Region,Leisure,Liquid,Rent,Price,Kerosene heater,Public health,Mixture,Gas cylinder,Filling station,...,Cost of living,Samsung,cheap,Health insurance,Tuition payments,Shoes,Budget,Mental health,Soup,Emergency telephone number
0,Delta,100,80,41,75,46.0,20,58,67,44,...,82.0,57,79,24,53,70,40,42,97,44.0
1,Osun,98,92,51,84,,65,90,74,94,...,83.0,55,72,23,92,78,52,82,52,55.0
2,Ogun State,96,94,86,97,46.0,38,87,81,95,...,90.0,64,89,20,82,87,55,78,65,94.0
3,Imo,96,86,42,81,64.0,53,74,86,57,...,77.0,65,93,24,51,62,41,64,95,52.0
4,Ekiti,92,85,51,76,,46,100,80,100,...,80.0,60,65,20,77,75,56,79,46,76.0
5,Abia,91,86,29,84,,49,89,85,43,...,79.0,63,83,31,54,72,41,62,100,53.0
6,Anambra,89,86,34,83,85.0,25,75,77,46,...,69.0,67,74,23,45,59,40,53,90,60.0
7,Niger,86,93,34,87,,57,90,77,43,...,52.0,72,53,25,57,57,70,64,52,50.0
8,Edo,84,77,36,70,46.0,21,61,55,43,...,67.0,52,67,23,47,66,38,50,85,58.0
9,Oyo,84,83,100,94,67.0,52,78,85,99,...,100.0,62,97,22,100,85,59,100,57,69.0


## Rescaling the Trends Data including 2025 data

In [None]:
# create function to rescale data in a dataframe from Jan 2025 to Mar 2025 by comparing the sum of values from Jan 2011 to Dec 2024 for each keyword and then rescale the values for Jan 2025 to Mar 2025 using the factor of the sums
# Jan 2011 to Dec 2024 is in the first dataframe
# Jan 2011 to Mar 2025 is in the second dataframe
# Comparison is done using all months from Jan 2011 to Dec 2024 in both dataframes
# The focus of rescale is the values from Jan 2025 to Mar 2025 in the second dataframe
# Save the rescaled dataframe to a csv file

def rescale_data(df1, df2, start_row, end_row, rescale_start_row, rescale_end_row):

    for col in df1.columns[1:]:
        # print the column name
        print(f'Column: {col}')
        sum_df1_2011_2024 = df1[(df1['Month'] >= '2011-01') & (df1['Month'] <= '2024-12')][col].sum()
        sum_df2_2011_2024 = df2[(df2['Month'] >= '2011-01') & (df2['Month'] <= '2024-12')][col].sum()
        # print(f'sum_df1_2011_2024: {sum_df1_2011_2024}')
        # print(f'sum_df2_2011_2024: {sum_df2_2011_2024}')

        # calculate the factor for rescaling
        factor = sum_df1_2011_2024 / sum_df2_2011_2024
        # print(f'factor: {factor}')

        # rescale the values in df2 using the factor
        df2[col] = df2[col].apply(lambda x: round(x * factor, 1))

    # save the rescaled dataframe to a csv file
    df2.to_csv('rescaled_combined_trends_data_2011_2025.csv', index=False)

In [49]:
# read the first dataframe
df1 = pd.read_csv('combined_trends_data_2011_2024.csv')

# read the second dataframe
df2 = pd.read_csv('combined_trends_data_2011_2025.csv')

#  call the rescale_data function to rescale the data
rescale_data(df1, df2, '2011-01', '2024-12', '2025-01', '2025-03')

Column: Leisure
sum_df1_2011_2024: 4476
sum_df2_2011_2024: 4476
factor: 1.0
Column: Liquid
sum_df1_2011_2024: 10976
sum_df2_2011_2024: 10976
factor: 1.0
Column: Rent
sum_df1_2011_2024: 10985
sum_df2_2011_2024: 10985
factor: 1.0
Column: Price
sum_df1_2011_2024: 9534
sum_df2_2011_2024: 9534
factor: 1.0
Column: Kerosene heater
sum_df1_2011_2024: 1548
sum_df2_2011_2024: 1548
factor: 1.0
Column: Public health
sum_df1_2011_2024: 6977
sum_df2_2011_2024: 6977
factor: 1.0
Column: Mixture
sum_df1_2011_2024: 9424
sum_df2_2011_2024: 9424
factor: 1.0
Column: Gas cylinder
sum_df1_2011_2024: 5186
sum_df2_2011_2024: 4610
factor: 1.124945770065076
Column: Filling station
sum_df1_2011_2024: 6648
sum_df2_2011_2024: 6648
factor: 1.0
Column: State of emergency
sum_df1_2011_2024: 469.5
sum_df2_2011_2024: 469.5
factor: 1.0
Column: Gasoline
sum_df1_2011_2024: 4898
sum_df2_2011_2024: 4898
factor: 1.0
Column: Pressure
sum_df1_2011_2024: 9702
sum_df2_2011_2024: 9702
factor: 1.0
Column: Petroleum industry
sum_df1