In [26]:
# Python 3 environment
# import packages and libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

import matplotlib.pyplot as plt
import seaborn as sns
import descartes
import geopandas

In [2]:
# Read the healthy-lifestyle CSV (path relative to the working directory) into df.
csv_path = "../input/healthy-lifestyle-cities-report-2021/healthy_lifestyle_city_2021.csv"
df = pd.read_csv(csv_path)


We will begin by performing exploratory data analysis on our dataset. For the purposes of this notebook, we will examine high-value data and get an understanding of possible correlations within our dataset.

In [3]:
# Show the dtype of every column of df as read from the CSV.
df.dtypes

In [4]:
# Some of our columns contain incompatible datatypes. Let's clean those in order
# to perform data analysis.

# Treat the '-' placeholder as missing (NaN) instead of 0 so it does not drag
# down the mean used for imputation, then fill the gaps with the mean of the
# observed values. The result is assigned back to the frame rather than using an
# in-place replace on a column selection, which is unreliable under pandas
# copy-on-write; this also makes the cell safe to re-run.
sunshine_hours = df['Sunshine hours(City)'].replace('-', np.nan).astype('float64')
df['Sunshine hours(City)'] = sunshine_hours.fillna(sunshine_hours.mean())



In [5]:
# Remove the leading currency symbol and change the datatype to float.
# Only leading non-numeric characters are stripped (instead of always dropping
# the first character), so re-running the cell on the already-numeric column
# leaves the values intact rather than silently chopping off the first digit.
water_cost = df['Cost of a bottle of water(City)'].astype('str')
df['Cost of a bottle of water(City)'] = (
    water_cost.str.replace(r'^[^\d.\-]+', '', regex=True).astype('float')
)

In [6]:
# Remove the trailing % and change the datatype to float.
# rstrip('%') only removes an actual percent sign, so re-running the cell on the
# already-numeric column no longer drops the last digit of every value.
obesity_levels = df['Obesity levels(Country)'].astype('str')
df['Obesity levels(Country)'] = obesity_levels.str.rstrip('%').astype('float')

In [7]:
# Treat the '-' placeholder as missing (NaN) instead of 0 so it does not bias
# the mean used for imputation, then fill the gaps with the mean of the observed
# values. The result is assigned back to the frame (no in-place replace on a
# column selection) and stored as plain 'float64', matching the other cleaned
# numeric columns instead of the nullable 'Float64' extension dtype.
pollution_index = df['Pollution(Index score) (City)'].replace('-', np.nan).astype('float64')
df['Pollution(Index score) (City)'] = pollution_index.fillna(pollution_index.mean())

In [8]:
# Treat the '-' placeholder as missing (NaN) instead of 0 so it does not bias
# the mean used for imputation, then fill the gaps with the mean of the observed
# values. Assigning the result back (rather than an in-place replace on a column
# selection) keeps the update reliable under pandas copy-on-write.
hours_worked = df['Annual avg. hours worked'].replace('-', np.nan).astype('float64')
df['Annual avg. hours worked'] = hours_worked.fillna(hours_worked.mean())

In [9]:
# Remove the leading currency symbol and change the datatype to float.
# Only leading non-numeric characters are stripped (instead of always dropping
# the first character), so re-running the cell on the already-numeric column
# leaves the values intact rather than silently chopping off the first digit.
gym_cost = df['Cost of a monthly gym membership(City)'].astype('str')
df['Cost of a monthly gym membership(City)'] = (
    gym_cost.str.replace(r'^[^\d.\-]+', '', regex=True).astype('float')
)


In [10]:
# Count the missing values in each column and print the per-column totals.
print(df.isna().sum())

In [11]:
# No null values are present

In [12]:
# Show the column dtypes again to confirm the result of the conversions above.
df.dtypes

We can now work with our cleaned dataset. Before we begin to identify relationships within our dataset, we will get an understanding of the shape. 

In [13]:
# Preview the first ten rows of df.
df.head(10)

In [14]:
# Summary statistics for df.
df.describe()

In [15]:
# (number of rows, number of columns) of df.
df.shape

In [16]:
# Print pandas' concise summary of df.
df.info()

Now that we have an understanding of the dataset we are working with, we can begin to identify relationships and outliers through the use of graphs and charts.

In [17]:
# Top 10 Healthiest Cities
# Prints the first ten entries of the 'City' column in the frame's current row
# order (presumably the file is already ordered by rank; verify).
print(df['City'].head(10))

In [18]:
# Bar chart of life expectancy per city for the ten rows with the largest
# 'Happiness levels(Country)' values.
fig = plt.figure(figsize=(14, 7))
sns.set_theme(style="whitegrid")
happiest_ten = df.nlargest(10, 'Happiness levels(Country)')
ax = sns.barplot(data=happiest_ten, x="City", y="Life expectancy(years) (Country)")
plt.xticks(rotation=45)

In [19]:
# Bar chart of the number of take-out places against happiness level for the
# ten rows with the largest 'Happiness levels(Country)' values.
fig = plt.figure(figsize=(14, 7))
sns.set_theme(style="whitegrid")
happiest_ten = df.nlargest(10, 'Happiness levels(Country)')
ax = sns.barplot(
    data=happiest_ten,
    x="Happiness levels(Country)",
    y="Number of take out places(City)",
)
plt.xticks(rotation=45)

In [20]:
# Pairwise relationships between the columns of df.
# The original `plt.figsize = (18, 15)` only attached an unused attribute to the
# pyplot module, and `sns.color_palette('flare')` was called after plotting with
# its return value discarded, so neither had any effect. sns.pairplot builds its
# own figure, so the size is applied to the returned grid, and the palette is
# activated as a context manager so it applies to this plot only.
with sns.color_palette('flare'):
    grid = sns.pairplot(df)
grid.figure.set_size_inches(18, 15)
plt.show()

In [21]:
# correlation matrix
# Only numeric columns are correlated. Text columns are excluded explicitly
# because pandas >= 2.0 no longer drops them silently in DataFrame.corr() and
# raises a ValueError instead; select_dtypes works on older versions too.

plt.figure(figsize=(14,7))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='rocket')

In [22]:
# Is there a relationship between Happiness and Life Expectancy?

sns.set_theme(style="darkgrid")
# `plt.figsize = (20, 15)` only attached an unused attribute to the pyplot
# module; create the figure explicitly so the requested size is actually used.
plt.figure(figsize=(20, 15))
# NOTE(review): nlargest(10, 'Rank') selects the ten largest rank numbers; if
# rank 1 is the best city these are the ten lowest-ranked rows. Confirm this is
# the intended subset.
ax = sns.boxplot(x='Happiness levels(Country)', y='Life expectancy(years) (Country)',
                 data=df.nlargest(10, 'Rank'))

In [27]:
# Is there a relationship between Happiness Level and Number of Take Out Places?

# The marker shape is passed through lmplot's `markers` argument: a 'marker'
# entry inside scatter_kws is overwritten by regplot's own `marker` default
# ('o'), so the requested triangles were never drawn.
sns.lmplot(x='Happiness levels(Country)', y='Number of take out places(City)', data=df,
           markers='^', scatter_kws={'color': 'darkblue'},
           line_kws={'color': 'black', 'lw': 2}, height=5, aspect=1.75);