# Factors Affecting Health and Fitness

## Import Countries.csv and clean the DataFrame

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import scipy.stats as st

# Load the raw country data; the relative path means 'countries.csv' must be in
# the notebook's working directory
df = pd.read_csv('countries.csv')

# Display the first few rows of the DataFrame
df.head()

In [None]:
# Keep only the columns used in this analysis.
# .copy() makes the selection an independent DataFrame, so column assignments in
# later cells modify this frame directly rather than a slice of the raw data
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[['Country',
         'People and Society: Population',
         'People and Society: Median age - total',
         'People and Society: Obesity - adult prevalence rate',
         'Economy: Real GDP per capita']].copy()
df

In [None]:
# Build the cleaned country table from `df` without modifying `df` itself, so
# re-running this cell always produces the same `countries_df`.
# (The earlier split-on-'$' / concat / drop round trip added a helper column and
# immediately removed it again, so it had no effect on the result.)
countries_df = df.rename(
    columns={'People and Society: Obesity - adult prevalence rate': 'Obesity Rate in Adults (2016)'}
)

# Remove parentheses and everything inside them from the GDP column.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the parenthesised text in place and make
# the later conversion to float fail.
countries_df['Economy: Real GDP per capita'] = (
    countries_df['Economy: Real GDP per capita'].str.replace(r'\(.*\)', '', regex=True)
)

# Remove a parenthesised four-digit year (and any whitespace before it) from the
# obesity rate. The pattern is a raw string so that \s, \( and \d reach the regex
# engine unchanged instead of being parsed as (invalid) Python string escapes.
countries_df['Obesity Rate in Adults (2016)'] = (
    countries_df['Obesity Rate in Adults (2016)'].str.replace(r'\s*\(\d{4}\)', '', regex=True)
)


In [None]:
# Show the cleaned country table
countries_df

## Bring in World Happiness Report

In [None]:
# Load the World Happiness Report data; the relative path means the CSV must be
# in the notebook's working directory
whr21_df = pd.read_csv('world-happiness-report-2021.csv')

whr21_df

In [None]:
# Keep the country name and the two measures used later, and rename the key
# column to 'Country' so it matches the other tables when merging.
whr21_df2 = (
    whr21_df
    .loc[:, ['Country name', 'Healthy life expectancy', 'Ladder score']]
    .rename(columns={'Country name': 'Country'})
)

whr21_df2

## Bring in GymData

In [None]:
# Load the gym data; the relative path means 'gym_data.csv' must be in the
# notebook's working directory
gd_df = pd.read_csv('gym_data.csv')

# Display the first few rows of the DataFrame
gd_df.head()

In [None]:
# Keep the gym columns used in the analysis and drop rows with any missing value.
# dropna() returns a new DataFrame, so the assignment below does not write into
# a slice of gd_df.
gym_data_df = gd_df[['Country',
                     'Total Number of Gyms',
                     'Total Members',
                     'Members Per Gym',
                     'Annual Revenue Per Member']].dropna()

# Convert 'Annual Revenue Per Member' from text to float: remove the '$' and ','
# characters, then strip surrounding whitespace.
# regex=False makes '$' a literal character on every pandas version instead of
# relying on the version-dependent default of str.replace.
gym_data_df['Annual Revenue Per Member'] = (
    gym_data_df['Annual Revenue Per Member']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(float)
)

gym_data_df

## Bring in AQI

In [None]:
# Read the per-city AQI table and give the '2021' column a descriptive name
AQI_city_df = (
    pd.read_csv('AQI_by_city.csv')
    .rename(columns={'2021': '2021 AQI'})
)

AQI_city_df

In [None]:
# Split the combined 'City' text on commas into separate parts.
# NOTE(review): assumes each value has the form "City, Country" - confirm against the CSV.
split_data = AQI_city_df['City'].str.split(',', expand=True)

# Strip surrounding whitespace from each part. Without this, a value written as
# "City, Country" yields the country " Country" (leading space), which would not
# match the 'Country' keys of the other tables when merging.
city_names = split_data[0].str.strip()
country_names = split_data[1].str.strip()

# Replace the combined column with the two separate columns at positions 1 and 2
AQI_city_df = AQI_city_df.drop(columns='City')
AQI_city_df.insert(loc=1, column='City', value=city_names)
AQI_city_df.insert(loc=2, column='Country', value=country_names)

# Show the restructured table
AQI_city_df

In [None]:
# Narrow the AQI table to the city, country and 2021 AQI columns
AQI_city_df2 = AQI_city_df.loc[:, ['City', 'Country', '2021 AQI']]
AQI_city_df2

In [None]:
# Average the 2021 AQI over each country's cities, turn the group key back into
# a regular 'Country' column, and give the averaged column a descriptive name.
avg_aqi_by_country = (
    AQI_city_df
    .groupby('Country')['2021 AQI']
    .mean()
    .reset_index()
    .rename(columns={'2021 AQI': 'Average AQI (2021)'})
)

avg_aqi_by_country

## Merge the DataFrames

In [None]:
# Join the gym data with the cleaned country data on 'Country'
# (pd.merge defaults to an inner join, so only countries present in both remain)
merged_df = pd.merge(gym_data_df, countries_df, on='Country')

# Convert GDP per capita from text to float: remove the '$' and ',' characters,
# then strip surrounding whitespace.
# regex=False makes '$' a literal character on every pandas version instead of
# relying on the version-dependent default of str.replace.
merged_df['Economy: Real GDP per capita'] = (
    merged_df['Economy: Real GDP per capita']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(float)
)

merged_df

In [None]:
# Add the World Happiness Report columns (joined on 'Country') and give the
# ladder score a more descriptive name
merged_df2 = (
    merged_df
    .merge(whr21_df2, on='Country')
    .rename(columns={'Ladder score': 'World Happiness Score'})
)

merged_df2

In [None]:
# Left-join the per-country average AQI so that countries without AQI data are
# kept (their AQI is NaN)
merged_df3 = merged_df2.merge(avg_aqi_by_country, on='Country', how='left')

obesity_col = 'Obesity Rate in Adults (2016)'
aqi_col = 'Average AQI (2021)'

# Obesity rate: remove the '%' sign and surrounding whitespace, then cast to float
merged_df3[obesity_col] = (
    merged_df3[obesity_col]
    .str.replace('%', '')
    .str.strip()
    .astype(float)
)

# Average AQI: cast to float and round to one decimal place
merged_df3[aqi_col] = merged_df3[aqi_col].astype(float).round(1)

merged_df3

# 1) Does the environment of a country affect its citizens' life expectancy?

Create a Scatter Plot of AQI vs Healthy Life Expectancy

In [None]:
# Scatter plot of average AQI vs healthy life expectancy, with a fitted line and
# OLS regression statistics.

# Only countries that have an AQI value can be used
merged_df4 = merged_df3.dropna(subset=['Average AQI (2021)'])

x = merged_df4['Average AQI (2021)']
y = merged_df4['Healthy life expectancy']

plt.scatter(x, y)
plt.xlabel('Average AQI')
plt.ylabel('Healthy life expectancy')
plt.title('AQI vs Healthy life expectancy')

# Create line of best fit (degree-1 polynomial: slope m, intercept b)
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, color='red')

# Add constant to X array for intercept term
X = sm.add_constant(x)

# Fit linear regression model using statsmodels
model = sm.OLS(y, X).fit()

# Get regression statistics.
# model.params and model.bse are label-indexed pandas Series here (the constant
# first, then the predictor). Use .iloc for positional access: plain integer
# keys such as params[1] only work through a deprecated positional fallback.
intercept = model.params.iloc[0]
slope = model.params.iloc[1]
r_squared = model.rsquared
p_value = model.f_pvalue
std_error = model.bse.iloc[1]

# Print regression statistics
print(f"Slope: {slope}")
print(f"Intercept: {intercept}")
print(f"R-squared score: {r_squared}")
print(f"P-value: {p_value}")
print(f"Standard error: {std_error}")


plt.show()

Based on the regression statistics, we can see that the slope of the line of best fit is negative (-0.082), indicating that as the average AQI (air quality index) score increases, the healthy life expectancy decreases. However, the R-squared value is quite low (0.048), which suggests that only a small proportion of the variance in healthy life expectancy can be explained by changes in AQI. Additionally, the p-value (0.313) is greater than 0.05, which means that the relationship between AQI and healthy life expectancy may not be statistically significant. Therefore, we cannot definitively conclude that the environment of a country affects their life expectancy based on this analysis alone.

Bar Chart of AQI by Country

In [None]:
# One bar per country showing its average AQI; the country labels are rotated
# vertically so they do not overlap
fig, ax = plt.subplots()
ax.bar(merged_df4['Country'], merged_df4['Average AQI (2021)'])
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Country')
ax.set_ylabel('Average AQI')
plt.show()

# 2) Do health levels correlate to the level of happiness listed for each country?

In [None]:
# Pearson correlation between healthy life expectancy and the happiness ladder score
correlation = st.pearsonr(whr21_df2["Healthy life expectancy"], whr21_df2["Ladder score"])

# pearsonr returns (coefficient, p-value); report the coefficient to two decimals
pearson_r = round(correlation[0], 2)
print(f"The correlation between Healthy life expectancy and happiness Ladder score is {pearson_r}")

In [None]:
# Scatter plot of healthy life expectancy vs happiness ladder score, with the
# least-squares regression line and its equation drawn on the plot

health = whr21_df2["Healthy life expectancy"]
happiness = whr21_df2["Ladder score"]

(slope, intercept, rvalue, pvalue, stderr) = st.linregress(health, happiness)
regress_values = health * slope + intercept
line_eq = f"y = {round(slope, 2)} x + {round(intercept, 2)}"

plt.scatter(health, happiness, color='b')
plt.plot(health, regress_values, color='red')

# Anchor the equation near the top-left corner in axes-fraction coordinates.
# The previous data coordinates (2, 4) put it at x = 2, which is presumably far
# below every healthy life expectancy value; matplotlib does not draw an
# annotation whose data-coordinate anchor lies outside the axes.
plt.annotate(line_eq, (0.05, 0.9), xycoords='axes fraction', fontsize=13)

plt.xlabel("Healthy life expectancy")
plt.ylabel("Ladder score")
plt.title("Healthy life expectancy vs World happiness score")
print(f"The r-squared is: {round(rvalue**2,3)}")
plt.show()

# 3) Does the number of gyms per country have any correlation to life expectancy?

In [None]:
# Scatter plot of the number of gyms against healthy life expectancy
life_gym = plt.scatter(merged_df3["Total Number of Gyms"], merged_df3["Healthy life expectancy"])

plt.title('Correlation between the Number of Gyms and Healthy Life Expectancy')
plt.xlabel("Total Number of Gyms")
plt.ylabel("Healthy Life Expectancy")
plt.xticks(rotation=90)

# Render the figure explicitly so the cell does not end on plt.xticks(), whose
# return value would otherwise be echoed as the cell's output
plt.show()

In [None]:
# Line plot of healthy life expectancy against the number of gyms.
# Sort by the x column first: a line plot connects points in row order, so
# unsorted rows would produce a zigzag that jumps back and forth along the x-axis.
merged_df2.sort_values('Total Number of Gyms').plot('Total Number of Gyms', 'Healthy life expectancy')
plt.show()

# 4) Is the cost of a gym membership prohibitive when compared to GDP per capita, and does that affect obesity?

Regression: GDP per Capita vs. Annual Revenue per Member

In [None]:
# Regress annual gym revenue per member on real GDP per capita
x = merged_df3['Economy: Real GDP per capita']
y = merged_df3['Annual Revenue Per Member']

# Least-squares fit; the result exposes slope, intercept, rvalue, pvalue and stderr
fit = st.linregress(x, y)

# Data points with the fitted line drawn over them
plt.scatter(x, y)
plt.plot(x, fit.slope * x + fit.intercept, color='red')

plt.xlabel('GDP per Capita')
plt.ylabel('Annual Revenue per Member')
plt.title('GDP per Capita vs. Annual Revenue per Member')

# Report the fit statistics (printed before the figure is rendered)
print(f"Slope: {fit.slope}")
print(f"Intercept: {fit.intercept}")
print(f"R-squared score: {fit.rvalue**2}")
print(f"P-value: {fit.pvalue}")
print(f"Standard error: {fit.stderr}")

plt.show()

Based on the linear regression results provided, the slope is 0.0055, which indicates that there is a positive relationship between GDP per capita and the annual revenue per member of the gym. The intercept is 296.73, which means that even if the GDP per capita is zero, the expected annual revenue per member of the gym is $296.73.

The R-squared score is 0.46, which means that 46% of the variation in the annual revenue per member can be explained by the variation in GDP per capita.

The p-value is 1.38e-05, which is smaller than the significance level of 0.05, indicating that the slope is statistically significant, and that GDP per capita has a significant effect on the annual revenue per member.

Therefore, based on these results, we can conclude that the cost of a gym membership begins at a fairly prohibitive level when compared to GDP per capita. One would hope that after starting out at a price point of almost $300 per year, the price would stay fairly steady. But to the contrary, as GDP per Capita increases, the price increases at a high rate as well.

Regression: Healthy life expectancy vs Economy: Real GDP per capita

In [None]:
# Linear regression of healthy life expectancy on real GDP per capita
x = merged_df3['Economy: Real GDP per capita']
y = merged_df3['Healthy life expectancy']

# Work on the underlying NumPy arrays for the fit
x_values = x.values
y_values = y.values

# Degree-1 polynomial fit returns (slope, intercept)
slope, intercept = np.polyfit(x_values, y_values, 1)

# Predicted life expectancy along the regression line
y_pred = slope * x_values + intercept

# R-squared = 1 - (residual sum of squares / total sum of squares)
residual_ss = np.sum((y_values - y_pred)**2)
total_ss = np.sum((y_values - np.mean(y_values))**2)
r_squared = 1 - residual_ss / total_ss

print('Slope:', slope)
print('Intercept:', intercept)
print('R-squared score:', r_squared)

# Scatter of the data with the regression line on top
plt.scatter(x, y)
plt.plot(x, y_pred, color='red')

plt.xlabel('Economy: Real GDP per capita')
plt.ylabel('Healthy life expectancy')
plt.title('Healthy life expectancy vs Economy: Real GDP per capita')

plt.show()

Based on the findings from the linear regression analysis of healthy life expectancy vs real GDP per capita, the slope of the regression line is 9.501174622525271e-05, indicating that for every unit increase in real GDP per capita, there is an increase of approximately 9.5e-05 units in healthy life expectancy. The intercept of the regression line is 65.23182121314176, meaning that when real GDP per capita is zero, the predicted healthy life expectancy is approximately 65.23 years.

The R-squared score of 0.44321444068667604 suggests that the linear regression model explains approximately 44.3% of the variability in the data. This means that there are other factors that contribute to healthy life expectancy beyond just real GDP per capita.

In conclusion, this analysis suggests that there is a positive relationship between real GDP per capita and healthy life expectancy, but the relationship is not perfect. Other factors such as healthcare, education, and lifestyle choices may also play a role in determining healthy life expectancy. This finding could be useful for policymakers and healthcare professionals to consider when developing strategies to improve public health and well-being.

Regression: GDP per Capita vs. Obesity Rate

In [None]:
# Regress the adult obesity rate on real GDP per capita
x = merged_df3['Economy: Real GDP per capita']
y = merged_df3['Obesity Rate in Adults (2016)']

# Least-squares fit; the result exposes slope, intercept, rvalue, pvalue and stderr
fit = st.linregress(x, y)

# Data points with the fitted line drawn over them
plt.scatter(x, y)
plt.plot(x, fit.intercept + fit.slope * x, color='red')

plt.xlabel('GDP per Capita')
plt.ylabel('Obesity Rate')
plt.title('GDP per Capita vs. Obesity Rate')

plt.show()

# Report the fit statistics below the figure
print("Slope:", fit.slope)
print("Intercept:", fit.intercept)
print("R-squared score:", fit.rvalue**2)
print("P-value:", fit.pvalue)
print("Standard error:", fit.stderr)

Based on the findings from the linear regression analysis of GDP per Capita vs. Obesity Rate, the slope of the regression line indicates that for every unit increase in GDP per Capita, there is an increase of approximately 1.39e-05 units in Obesity Rate. The intercept of the regression line is 21.89, meaning that when GDP per Capita is zero, the predicted Obesity Rate is approximately 21.89%.

The R-squared score of 0.0018 suggests that the linear regression model explains only approximately 0.18% of the variability in the data. This means that GDP per Capita is not a strong predictor of Obesity Rate.

The P-value of 0.8123972454287824 suggests that the slope of the regression line is not statistically significant at the 5% level. This means that there is insufficient evidence to reject the null hypothesis that the slope is equal to zero.

In conclusion, this analysis suggests that there is a weak and statistically insignificant relationship between GDP per Capita and Obesity Rate. Other factors such as cultural and dietary habits, lifestyle choices, and healthcare accessibility may play a more significant role in determining Obesity Rate. This finding could be useful for policymakers and healthcare professionals to consider when developing strategies to address the issue of obesity.

# 5) Does having a higher number of hours worked affect happiness/gym availability/obesity levels?

In [None]:
# Subset with the country, members per gym, World Happiness Score and adult obesity rate
merged_df5 = merged_df3.loc[
    :,
    ["Country", "Members Per Gym", "World Happiness Score", "Obesity Rate in Adults (2016)"],
]

merged_df5

In [None]:
# Visualization 5) Correlation heatmap of the numeric columns of merged_df3
# (obesity rate, happiness score, gym figures, ...)

# Compute the correlation matrix over numeric columns only. merged_df3 also
# holds text columns (e.g. 'Country'), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently dropping it.
# (The randomly generated sample frame that used to be built here was never
# used, so it has been removed.)
corr = merged_df3.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle so each pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(8, 6))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

# Show the plot
plt.show()