![Workout image](gym.png)

You are a product manager for a fitness studio based in Singapore and are interested in understanding the types of digital products you should offer. You plan to conduct a market analysis in Python to understand how to place your digital fitness products in the regional market. A market analysis will allow you to identify strengths of your competitors, gauge demand, and create unique new digital products and services for potential users.

You are provided with a number of CSV files in the "data" folder under Files, which contain international Google Trends and YouTube keyword-search data on fitness and related products. Two helper functions, `read_file` and `read_geo`, have also been provided to help you process and visualize these CSV files for further analysis.

You'll use `pandas` methods to explore this data and drive your product management insights. 

You can continue beyond the bounds of this project and also investigate in-person classes, local gyms, and online personal instructors!

In [None]:
# STARTER CODE - PLEASE DO NOT EDIT ANY CODE IN THIS CELL

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white', palette='Pastel2')
import os

def read_file(filepath, plot = True):
    """
    Read a CSV file from a given filepath, convert it into a pandas DataFrame,
    and return a processed DataFrame with three columns: 'week', 'region', and 'interest'. Optionally generate a line plot using Seaborn to visualize the data. This corresponds to the first graphic (time series) returned by trends.google.com.

    The file is parsed with ``header=1`` and must contain a 'Week' column; every
    other column becomes one value of 'region' in the long-format result.

    Parameters
    ----------
    filepath : path of the CSV file to read.
    plot : when True (default), open an 8x3 figure and draw one line per
        'region'. When False, no figure is created at all.

    Returns
    -------
    DataFrame with 'week' parsed as datetime, 'region' holding the original
    column name, and 'interest' as float. Rows whose interest is the literal
    string "<1" are dropped, because they cannot be converted to a number.
    """
    raw = pd.read_csv(filepath, header=1)

    # Wide -> long: one row per (week, original column) pair.
    df = raw.set_index('Week').stack().reset_index()
    df.columns = ['week', 'region', 'interest']
    df['week'] = pd.to_datetime(df['week'])

    # Copy the filtered rows so the dtype conversion below writes to an
    # independent frame rather than to a slice of the unfiltered one.
    df = df[df['interest'] != "<1"].copy()
    df['interest'] = df['interest'].astype(float)

    if plot:
        # Only open a figure when something is drawn; otherwise callers that
        # pass plot=False would be left with an empty figure.
        plt.figure(figsize=(8, 3))
        sns.lineplot(data=df, x='week', y='interest', hue='region')
    return df

def read_geo(filepath, multi=False):
    """
    Load a geographic-interest CSV (parsed with ``header=1``), draw a Seaborn
    bar plot of it, and return the data sorted by 'interest', highest first.
    This corresponds to the second graphic returned by trends.google.com.

    multi=False (one keyword): the two columns are renamed to 'country' and
    'interest', and the first 25 non-null rows are plotted.

    multi=True (several keywords): the file must have a 'Country' column; the
    remaining columns are stacked into 'category' / 'interest', the last
    character of every interest value is removed before numeric conversion,
    and the bars are coloured by category.
    """
    geo = pd.read_csv(filepath, header=1)

    if multi:
        plt.figure(figsize=(3,8))
        # Wide -> long: one row per (country, keyword column) pair.
        geo = geo.set_index('Country').stack().reset_index()
        geo.columns = ['country','category','interest']
        # Drop the trailing character (presumably a '%' sign - confirm
        # against the CSV) so the values can be parsed as numbers.
        trimmed = geo['interest'].apply(lambda value: value[:-1])
        geo['interest'] = pd.to_numeric(trimmed)
        sns.barplot(data=geo.dropna(), y = 'country', x='interest', hue='category')
    else:
        geo.columns = ['country', 'interest']
        plt.figure(figsize=(8,4))
        first_rows = geo.dropna().iloc[:25,:]
        sns.barplot(data = first_rows, y = 'country', x='interest')

    # Sorting happens after plotting, so the plot keeps the file's row order.
    return geo.sort_values(ascending=False,by='interest')

# 1. Load data on global interest in fitness

In [None]:
# Read 'workout.csv' with the read_file helper. Besides returning the
# long-format (week, region, interest) DataFrame, the helper draws a line
# plot of interest over time because plot defaults to True.
workout = read_file("data/workout.csv")
# Preview the first ten rows.
workout.head(10)

In [None]:
# Inspect the schema of the workout data: column labels first, then dtypes.
print(
    f"Name of columns in the workout data:\n{workout.columns}",
    f"\nColumn types in the workout data:\n{workout.dtypes}",
    sep="\n",
)

# 2. Assess global interest in fitness

In [None]:
# Bucket the weekly observations by calendar month (labelled with the month
# start) and average the interest inside each bucket.
monthly_buckets = workout.groupby(pd.Grouper(key='week', freq='MS'))
workout_by_month = monthly_buckets['interest'].mean().reset_index()

# Show the monthly averages, with 'week' back as a regular column.
print(workout_by_month)

In [None]:
# Keep every monthly row whose average interest equals the overall maximum.
peak_monthly_interest = workout_by_month["interest"].max()
month_high = workout_by_month.loc[workout_by_month["interest"].eq(peak_monthly_interest)]

# Date (YYYY-MM-DD) of the first such month, as a string.
month_str = month_high['week'].iloc[0].date().isoformat()

In [None]:
# Preview the first ten monthly averages.
workout_by_month.head(10)

In [None]:
# Show the row(s) holding the maximum monthly average, followed by the
# first such month formatted as a date string.
print(month_high)
print(f"\nThe month with the highest interest is: {month_str}")

# 3. Compare interest in home workouts, gym workouts and home gyms

In [None]:
# Read 'three_keywords.csv' with the read_file helper; each non-'Week' column
# of the file becomes one value of 'region' in the returned long-format
# DataFrame, and the helper plots one line per such value.
keywords = read_file("data/three_keywords.csv")
# Preview the first fifty rows.
keywords.head(50)

In [None]:
# Inspect the schema of the keywords data: column labels first, then dtypes.
print(
    f"Name of columns in the keywords data:\n{keywords.columns}",
    f"\nColumn types in the keywords data:\n{keywords.dtypes}",
    sep="\n",
)

In [None]:
# Restrict the keyword time series to weeks that fall in 2022 or 2023.
in_22_23 = keywords["week"].dt.year.isin([2022,2023])
keywords_22_23_years = keywords.loc[in_22_23]

# Highest single weekly interest value observed in that window.
highest_interest_value = keywords_22_23_years['interest'].max()

# Every 'region' label (one per keyword column) that reaches that value.
at_peak = keywords_22_23_years['interest'].eq(highest_interest_value)
region_with_highest_interest = keywords_22_23_years.loc[at_peak, 'region'].unique()

print(f"The region(s) with the highest interest between 2022 and 2023 is/are: {', '.join(region_with_highest_interest)} with a total interest of {highest_interest_value}.")

# Gym workouts attracted the highest interest during the period of 2022-2023
current = "gym workout"

In [None]:
# Filter the DataFrame to include only the year 2020
keywords_20_year = keywords[keywords["week"].dt.year.isin([2020])]

# Find the maximum weekly interest value within 2020
highest_interest_value_covid = keywords_20_year['interest'].max()

# Find the region label(s) (one per keyword column) that reach that value
region_with_highest_interest_covid = keywords_20_year[keywords_20_year['interest'] == highest_interest_value_covid]['region'].unique()

# Report the 2020 maximum itself; the 2022-2023 figure lives in
# highest_interest_value and must not be printed here.
print(f"The region(s) with the highest interest in 2020 is/are: {', '.join(region_with_highest_interest_covid)} with a total interest of {highest_interest_value_covid}.")

# At the peak of the COVID-19 pandemic, home workouts garnered the highest interest
peak_covid = "home workout"

# 4. Segment global interest by region

In [None]:
# Read 'workout_global.csv' with the read_geo helper in single-keyword mode
# (multi defaults to False): it bar-plots the first 25 non-null rows and
# returns a ('country', 'interest') DataFrame sorted by interest, highest first.
workout_global = read_geo("data/workout_global.csv")
# Preview the first ten rows.
workout_global.head(10)

In [None]:
# Inspect the schema of the workout_global data: column labels, then dtypes.
print(
    f"Name of columns in the workout_global data:\n{workout_global.columns}",
    f"\nColumn types in the workout_global data:\n{workout_global.dtypes}",
    sep="\n",
)

In [None]:
# Rank all countries by interest (highest first) and keep the first 25 rows.
ranked_by_interest = workout_global.sort_values(by="interest", ascending=False)
top_25_countries = ranked_by_interest.head(25)
print(top_25_countries)

In [None]:
# Keep every row whose interest equals the global maximum (ties included).
max_global_interest = workout_global["interest"].max()
top_country = workout_global.loc[workout_global["interest"].eq(max_global_interest)]
print(top_country)

# 5. Assess regional demand for home workouts, gym workouts and home gyms

In [None]:
# Read 'geo_three_keywords.csv' with the read_geo helper in multi-keyword
# mode: it returns a long-format ('country', 'category', 'interest')
# DataFrame sorted by interest, highest first, and draws a grouped bar plot.
geo_categories = read_geo("data/geo_three_keywords.csv", multi=True)
# Preview the first five rows.
geo_categories.head()

In [None]:
# Inspect the schema of the geo_categories data: column labels, then dtypes.
print(
    f"Name of columns in the geo_categories data:\n{geo_categories.columns}",
    f"\nColumn types in the geo_categories data:\n{geo_categories.dtypes}",
    sep="\n",
)

In [None]:
# Filter the DataFrame 'geo_categories' to get countries from the Middle East and South Asia.
# Every name needs its own comma: adjacent string literals are silently
# concatenated by Python, which would merge several countries into a single
# string that matches no row.
mesa_countries = [
    "Philippines",
    "Singapore",
    "United Arab Emirates",
    "Qatar",
    "Kuwait",
    "Lebanon",
    "Malaysia",
    "Sri Lanka",
    "India",
    "Pakistan",
]
mesa = geo_categories.loc[geo_categories.country.isin(mesa_countries), :]
print(mesa)

# 6. Assess the split of interest by country and category

In [None]:
# Pivot the MESA rows so each country is a row and each category a column.
mesa_indexed = mesa.set_index(['country','category']).unstack()

# For every category column, the country with the highest interest.
top_country_per_category = mesa_indexed['interest'].idxmax()
print(top_country_per_category)

# The country with the highest interest in home workouts is Philippines.
# NOTE(review): this answer is hard-coded from reading the printed output
# above - confirm it still holds whenever the country filter or data changes.
top_home_workout_country = 'Philippines'

# 7. A deeper dive into two countries

In [None]:
# Read 'yoga_zumba_sng.csv' with the read_file helper, which plots one line
# per non-'Week' column; the returned DataFrame is not stored, only displayed.
read_file('data/yoga_zumba_sng.csv')

In [None]:
# Read 'yoga_zumba_phl.csv' with the read_file helper, which plots one line
# per non-'Week' column; the returned DataFrame is not stored, only displayed.
read_file('data/yoga_zumba_phl.csv')

In [None]:
# Content types selected for the pilot, based on the yoga/zumba interest
# plots for Singapore and the Philippines.
# NOTE(review): the list is hard-coded from visual inspection of those
# plots - confirm against the underlying data.
pilot_content = ['yoga', 'zumba']