In [1]:
import pandas as pd
import numpy as np
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import requests
import io
from datetime import date

# Descarga del archivo .csv

In [72]:
# Raw GitHub URL of the healthy-lifestyle city dataset (CSV).
url = 'https://raw.githubusercontent.com/Marcellopp/Desarollo-de-applicaciones-mobiles/main/healthy_lifestyle_city_2021.csv'

# Use a timeout so a stalled connection cannot hang the kernel, and fail fast
# on HTTP errors: without the status check, a 404/500 error page would be
# handed to the CSV parser further down as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Raw response body as bytes; it is decoded when the CSV is read.
download = response.content

# Lectura de datos

In [81]:
# Decode the downloaded bytes as UTF-8 text, then parse that text as CSV.
csv_text = download.decode('utf-8')
df = pd.read_csv(io.StringIO(csv_text))

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,City,Rank,Sunshine hours(City),Cost of a bottle of water(City),Obesity levels(Country),Life expectancy(years) (Country),Pollution(Index score) (City),Annual avg. hours worked,Happiness levels(Country),Outdoor activities(City),Number of take out places(City),Cost of a monthly gym membership(City)
0,Amsterdam,1,1858,£1.92,20.40%,81.2,30.93,1434,7.44,422,1048,£34.90
1,Sydney,2,2636,£1.48,29.00%,82.1,26.86,1712,7.22,406,1103,£41.66
2,Vienna,3,1884,£1.94,20.10%,81.0,17.33,1501,7.29,132,1008,£25.74
3,Stockholm,4,1821,£1.72,20.60%,81.8,19.63,1452,7.35,129,598,£37.31
4,Copenhagen,5,1630,£2.19,19.70%,79.8,21.24,1380,7.64,154,523,£32.53


# Data Treatment

In [82]:
# Dimensions as (rows, columns).
print(df.shape)

# Data type of each column.
print(df.dtypes)

# Number of missing values per column, shown as the cell result.
missing_per_column = df.isna().sum()
missing_per_column

(44, 12)
City                                       object
Rank                                        int64
Sunshine hours(City)                       object
Cost of a bottle of water(City)            object
Obesity levels(Country)                    object
Life expectancy(years) (Country)          float64
Pollution(Index score) (City)              object
Annual avg. hours worked                   object
Happiness levels(Country)                 float64
Outdoor activities(City)                    int64
Number of take out places(City)             int64
Cost of a monthly gym membership(City)     object
dtype: object


City                                      0
Rank                                      0
Sunshine hours(City)                      0
Cost of a bottle of water(City)           0
Obesity levels(Country)                   0
Life expectancy(years) (Country)          0
Pollution(Index score) (City)             0
Annual avg. hours worked                  0
Happiness levels(Country)                 0
Outdoor activities(City)                  0
Number of take out places(City)           0
Cost of a monthly gym membership(City)    0
dtype: int64

Muchas de las variables son strings en vez de valores numéricos. Antes de convertirlas, tenemos que quitar los caracteres especiales (£ y %).

In [83]:
# Price columns: remove the leading pound sign.
for column in (
    'Cost of a bottle of water(City)',
    'Cost of a monthly gym membership(City)',
):
    df[column] = df[column].str.lstrip('£')

# Percentage column: remove the trailing percent sign.
df['Obesity levels(Country)'] = df['Obesity levels(Country)'].str.rstrip('%')

Conversión

In [84]:
# Columns that were read as strings and must become numeric.
numeric_columns = [
    'Sunshine hours(City)',
    'Cost of a bottle of water(City)',
    'Obesity levels(Country)',
    'Pollution(Index score) (City)',
    'Annual avg. hours worked',
    'Cost of a monthly gym membership(City)',
]

# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising.
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Show the resulting column dtypes.
print(df.dtypes)

City                                       object
Rank                                        int64
Sunshine hours(City)                      float64
Cost of a bottle of water(City)           float64
Obesity levels(Country)                   float64
Life expectancy(years) (Country)          float64
Pollution(Index score) (City)             float64
Annual avg. hours worked                  float64
Happiness levels(Country)                 float64
Outdoor activities(City)                    int64
Number of take out places(City)             int64
Cost of a monthly gym membership(City)    float64
dtype: object


In [61]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Rank,Sunshine hours(City),Cost of a bottle of water(City),Obesity levels(Country),Life expectancy(years) (Country),Pollution(Index score) (City),Annual avg. hours worked,Happiness levels(Country),Outdoor activities(City),Number of take out places(City),Cost of a monthly gym membership(City)
count,44.0,43.0,44.0,44.0,44.0,43.0,33.0,44.0,44.0,44.0,44.0
mean,22.5,2245.860465,1.173409,21.925,78.175,51.122326,1672.909091,6.435,213.977273,1443.113636,40.42
std,12.845233,567.403719,0.718642,10.19567,5.30437,21.85619,179.626933,0.991202,127.190297,1388.80327,15.006457
min,1.0,1405.0,0.15,3.9,56.3,13.08,1380.0,3.57,23.0,250.0,16.07
25%,11.75,1798.5,0.57,19.5,75.4,34.355,1540.0,5.87,125.25,548.0,31.31
50%,22.5,2066.0,1.195,22.3,80.4,52.64,1686.0,6.9,189.5,998.0,37.33
75%,33.25,2629.0,1.6,29.0,81.8,66.63,1779.0,7.175,288.25,1674.25,47.21
max,44.0,3542.0,3.2,36.2,83.2,91.74,2137.0,7.8,585.0,6417.0,73.11


In [62]:
# Per-column non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 12 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   City                                    44 non-null     object 
 1   Rank                                    44 non-null     int64  
 2   Sunshine hours(City)                    43 non-null     float64
 3   Cost of a bottle of water(City)         44 non-null     float64
 4   Obesity levels(Country)                 44 non-null     float64
 5   Life expectancy(years) (Country)        44 non-null     float64
 6   Pollution(Index score) (City)           43 non-null     float64
 7   Annual avg. hours worked                33 non-null     float64
 8   Happiness levels(Country)               44 non-null     float64
 9   Outdoor activities(City)                44 non-null     int64  
 10  Number of take out places(City)         44 non-null     int64  


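La cantidad de NAs encontrada antes con "df.isna().sum()" era incorrecta porque las columnas estaban almacenadas como strings. Tras la conversión con errors='coerce', los valores no numéricos pasan a ser NaN y ahora sí se cuentan.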

In [52]:
# Count missing values per column again, now that the columns hold numeric dtypes.
df.isna().sum()

City                                       0
Rank                                       0
Sunshine hours(City)                       1
Cost of a bottle of water(City)            0
Obesity levels(Country)                    0
Life expectancy(years) (Country)           0
Pollution(Index score) (City)              1
Annual avg. hours worked                  11
Happiness levels(Country)                  0
Outdoor activities(City)                   0
Number of take out places(City)            0
Cost of a monthly gym membership(City)     0
dtype: int64

# 1 - Análisis Descriptivo de las Ciudades

Matriz de Correlación

In [53]:
# Pairwise correlation matrix of every column except the city name and the
# ranking position.
#
# The drop is deliberately NOT done in place: `drop(..., inplace=True)` returns
# None (so assigning its result was a dead store), permanently removed 'City'
# and 'Rank' from `df`, and made this cell raise KeyError when run a second
# time. Dropping on a copy keeps `df` intact and the cell idempotent.
df_corr = df.drop(columns=['City', 'Rank']).corr()

In [54]:
# Set the default Plotly template to the white theme.
pio.templates.default = "plotly_white"

title = 'Correlation Matrix'
labels = df_corr.columns

# np.tril zeroes every entry above the main diagonal, so only the lower
# triangle of the correlation matrix carries values. The colour range is
# pinned to [-1, 1].
corr = go.Heatmap(
    z=np.tril(df_corr),
    x=labels,
    y=labels,
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=1,
)

# Square 600x600 figure with a centred title, no grid lines, and a reversed
# y axis.
layout = go.Layout(
    title=dict(text=title, x=0.5),
    width=600,
    height=600,
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False, autorange='reversed'),
)

fig = go.Figure(data=[corr], layout=layout)
fig.show()