In [1]:
import plotly.plotly as py
import cufflinks as cf
import pandas as pd
import numpy as np
from ipywidgets import interact

cf.go_offline()

In [2]:
df = pd.read_csv("alcohol_consumption.tsv", sep = "[,\t]", engine = "python")
df.head()

Unnamed: 0,unit,frequenc,sex,age,citizen,time\geo,EU28,BE,BG,CZ,...,PT,RO,SI,SK,FI,SE,UK,IS,NO,TR
0,PC,DAY,F,TOTAL,EU28_FOR,2014,5.2 e,7.2 u,: u,: u,...,9.7,0.0,: u,: u,1.9 u,0.9,1.1,1.8,0.8,0 u
1,PC,DAY,F,TOTAL,FOR,2014,4.3 e,7.8 u,: u,6.5 u,...,8.2,0.0,0,: u,3.6,1.4,1.6,1.4,0.5,0
2,PC,DAY,F,TOTAL,NAT,2014,5.0 e,9.9,3.6,3.2,...,11.9,0.6,3.4,0.8,1.4,1.7,5.6,0.2,1.4,0.1
3,PC,DAY,F,TOTAL,NEU28_FOR,2014,3.4 e,8.8 u,: u,: u,...,7.7,0.0,0,: u,: u,1.8,2.3,: u,0,0 u
4,PC,DAY,F,Y15-24,EU28_FOR,2014,0 e,0 u,: u,0,...,: u,0.0,: u,0,: u,: u,: u,: u,: u,: u


In [3]:
# Comprobamos los nulos:
df.isnull().sum()

unit         0
frequenc     0
sex          0
age          0
citizen      0
time\geo     0
EU28         0
BE           0
BG           0
CZ           0
DK           0
DE           0
EE           0
IE           0
EL           0
ES           0
HR           0
IT           0
CY           0
LV           0
LT           0
LU           0
HU           0
MT           0
AT           0
PL           0
PT           0
RO           0
SI           0
SK           0
FI           0
SE           0
UK           0
IS           0
NO           0
TR           0
dtype: int64

In [4]:
# Revisamos los nombres de las columnas:
df.columns

Index(['unit', 'frequenc', 'sex', 'age', 'citizen', 'time\geo ', 'EU28 ',
       'BE ', 'BG ', 'CZ ', 'DK ', 'DE ', 'EE ', 'IE ', 'EL ', 'ES ', 'HR ',
       'IT ', 'CY ', 'LV ', 'LT ', 'LU ', 'HU ', 'MT ', 'AT ', 'PL ', 'PT ',
       'RO ', 'SI ', 'SK ', 'FI ', 'SE ', 'UK ', 'IS ', 'NO ', 'TR'],
      dtype='object')

In [5]:
# Quitamos espacios finales a las columnas:
df.columns = df.columns.str.strip()
df.head()

Unnamed: 0,unit,frequenc,sex,age,citizen,time\geo,EU28,BE,BG,CZ,...,PT,RO,SI,SK,FI,SE,UK,IS,NO,TR
0,PC,DAY,F,TOTAL,EU28_FOR,2014,5.2 e,7.2 u,: u,: u,...,9.7,0.0,: u,: u,1.9 u,0.9,1.1,1.8,0.8,0 u
1,PC,DAY,F,TOTAL,FOR,2014,4.3 e,7.8 u,: u,6.5 u,...,8.2,0.0,0,: u,3.6,1.4,1.6,1.4,0.5,0
2,PC,DAY,F,TOTAL,NAT,2014,5.0 e,9.9,3.6,3.2,...,11.9,0.6,3.4,0.8,1.4,1.7,5.6,0.2,1.4,0.1
3,PC,DAY,F,TOTAL,NEU28_FOR,2014,3.4 e,8.8 u,: u,: u,...,7.7,0.0,0,: u,: u,1.8,2.3,: u,0,0 u
4,PC,DAY,F,Y15-24,EU28_FOR,2014,0 e,0 u,: u,0,...,: u,0.0,: u,0,: u,: u,: u,: u,: u,: u


In [6]:
# Comprobamos las categorías de "frequenc":
df["frequenc"].unique()

array(['DAY', 'LT1M', 'MTH', 'NM12', 'NVR', 'NVR_NM12', 'WEEK'],
      dtype=object)

In [7]:
# Nos quedamos sólo con "DAY", "MTH" y "WEEK":
df = df.drop(df[df["frequenc"] == "LT1M"].index).drop(df[df["frequenc"] == "NM12"].index).drop(df[df["frequenc"] == "NVR"].index).drop(df[df["frequenc"] == "NVR_NM12"].index)
df["frequenc"].unique()

array(['DAY', 'MTH', 'WEEK'], dtype=object)

In [8]:
# Comprobamos los intervalos de "age":
df["age"].unique()

array(['TOTAL', 'Y15-24', 'Y15-29', 'Y15-64', 'Y18-24', 'Y18-44',
       'Y18-64', 'Y25-34', 'Y25-64', 'Y35-44', 'Y45-54', 'Y45-64',
       'Y55-64', 'Y65-74', 'Y_GE18', 'Y_GE65', 'Y_GE75'], dtype=object)

In [9]:
# Eliminamos todos los intervalos que no vamos a usar en el estudio y renombramos "Y_GE75":
df = df.drop(df[df["age"] == "TOTAL"].index).drop(df[df["age"] == "Y15-29"].index).drop(df[df["age"] == "Y15-64"].index).drop(df[df["age"] == "Y18-24"].index).drop(df[df["age"] == "Y18-44"].index).drop(df[df["age"] == "Y18-64"].index).drop(df[df["age"] == "Y25-64"].index).drop(df[df["age"] == "Y45-54"].index).drop(df[df["age"] == "Y55-64"].index).drop(df[df["age"] == "Y_GE18"].index).drop(df[df["age"] == "Y_GE65"].index)
df["age"].unique()

array(['Y15-24', 'Y25-34', 'Y35-44', 'Y45-64', 'Y65-74', 'Y_GE75'],
      dtype=object)

In [10]:
# Comprobamos las categorías de "citizen"...
df["citizen"].unique()

array(['EU28_FOR', 'FOR', 'NAT', 'NEU28_FOR'], dtype=object)

In [11]:
#... y vemos que aquella con info es "NAT":
df.groupby(by = "citizen").agg(sum)

Unnamed: 0_level_0,time\geo,RO
citizen,Unnamed: 1_level_1,Unnamed: 2_level_1
EU28_FOR,108756,0.0
FOR,108756,0.0
NAT,108756,772.7
NEU28_FOR,108756,0.0


In [12]:
# Nos quedamos con sólo NAT:
df = df.drop((df[df["citizen"] == "EU28_FOR"]).index).drop((df[df["citizen"] == "FOR"]).index).drop((df[df["citizen"] == "NEU28_FOR"]).index)
df.head()

Unnamed: 0,unit,frequenc,sex,age,citizen,time\geo,EU28,BE,BG,CZ,...,PT,RO,SI,SK,FI,SE,UK,IS,NO,TR
6,PC,DAY,F,Y15-24,NAT,2014,0.2 e,0.2,0.5,0.0,...,0.0,0.0,0.4,0.2,0.0,0.2,0.2,0.3,0.2,0.0
30,PC,DAY,F,Y25-34,NAT,2014,1.3 e,2.0,3.4,1.4,...,2.1,0.2,0.4,0.4,0.7,0.3,2.0,0.0,0.0,0.3
38,PC,DAY,F,Y35-44,NAT,2014,2.7 e,5.7 u,4.9,1.3,...,8.1,0.4,1.3,0.8,1.4,0.9,2.3,0.0,1.1,0.1
46,PC,DAY,F,Y45-64,NAT,2014,6.1 e,14.8,4.5,5.1,...,15.5,0.8,3.7,1.0,2.0,1.4,6.3,0.2,1.6,0.0
54,PC,DAY,F,Y65-74,NAT,2014,8.8 e,15.8,3.1,5.9,...,20.4,1.3,6.6,0.9,1.4,4.5,9.6,0.9,2.8,0.0


In [13]:
# Reseteamos index:
df = df.reset_index()
df = df.drop("index", axis = 1)
df.head()

Unnamed: 0,unit,frequenc,sex,age,citizen,time\geo,EU28,BE,BG,CZ,...,PT,RO,SI,SK,FI,SE,UK,IS,NO,TR
0,PC,DAY,F,Y15-24,NAT,2014,0.2 e,0.2,0.5,0.0,...,0.0,0.0,0.4,0.2,0.0,0.2,0.2,0.3,0.2,0.0
1,PC,DAY,F,Y25-34,NAT,2014,1.3 e,2.0,3.4,1.4,...,2.1,0.2,0.4,0.4,0.7,0.3,2.0,0.0,0.0,0.3
2,PC,DAY,F,Y35-44,NAT,2014,2.7 e,5.7 u,4.9,1.3,...,8.1,0.4,1.3,0.8,1.4,0.9,2.3,0.0,1.1,0.1
3,PC,DAY,F,Y45-64,NAT,2014,6.1 e,14.8,4.5,5.1,...,15.5,0.8,3.7,1.0,2.0,1.4,6.3,0.2,1.6,0.0
4,PC,DAY,F,Y65-74,NAT,2014,8.8 e,15.8,3.1,5.9,...,20.4,1.3,6.6,0.9,1.4,4.5,9.6,0.9,2.8,0.0


In [14]:
# Comprobamos que las columnas "unit", "time\geo" y "citizen" no proporcionan info, las borramos.
# Borramos Turqía por no estar en el EU28.
df = df.drop(["unit", "time\geo", "citizen", "TR"], axis = 1)
df.head()

Unnamed: 0,frequenc,sex,age,EU28,BE,BG,CZ,DK,DE,EE,...,PL,PT,RO,SI,SK,FI,SE,UK,IS,NO
0,DAY,F,Y15-24,0.2 e,0.2,0.5,0.0,0.0,0.4,0.0,...,0.0,0.0,0.0,0.4,0.2,0.0,0.2,0.2,0.3,0.2
1,DAY,F,Y25-34,1.3 e,2.0,3.4,1.4,0.0,1.5,0.2,...,0.3,2.1,0.2,0.4,0.4,0.7,0.3,2.0,0.0,0.0
2,DAY,F,Y35-44,2.7 e,5.7 u,4.9,1.3,1.8,3.2,0.3,...,0.2,8.1,0.4,1.3,0.8,1.4,0.9,2.3,0.0,1.1
3,DAY,F,Y45-64,6.1 e,14.8,4.5,5.1,9.0,7.0,0.2,...,0.7,15.5,0.8,3.7,1.0,2.0,1.4,6.3,0.2,1.6
4,DAY,F,Y65-74,8.8 e,15.8,3.1,5.9,21.9,7.6,0.3,...,0.3,20.4,1.3,6.6,0.9,1.4,4.5,9.6,0.9,2.8


In [15]:
# Nombres de columnas en mayúscula:
df.columns = df.columns.str.upper()
df.head()

Unnamed: 0,FREQUENC,SEX,AGE,EU28,BE,BG,CZ,DK,DE,EE,...,PL,PT,RO,SI,SK,FI,SE,UK,IS,NO
0,DAY,F,Y15-24,0.2 e,0.2,0.5,0.0,0.0,0.4,0.0,...,0.0,0.0,0.0,0.4,0.2,0.0,0.2,0.2,0.3,0.2
1,DAY,F,Y25-34,1.3 e,2.0,3.4,1.4,0.0,1.5,0.2,...,0.3,2.1,0.2,0.4,0.4,0.7,0.3,2.0,0.0,0.0
2,DAY,F,Y35-44,2.7 e,5.7 u,4.9,1.3,1.8,3.2,0.3,...,0.2,8.1,0.4,1.3,0.8,1.4,0.9,2.3,0.0,1.1
3,DAY,F,Y45-64,6.1 e,14.8,4.5,5.1,9.0,7.0,0.2,...,0.7,15.5,0.8,3.7,1.0,2.0,1.4,6.3,0.2,1.6
4,DAY,F,Y65-74,8.8 e,15.8,3.1,5.9,21.9,7.6,0.3,...,0.3,20.4,1.3,6.6,0.9,1.4,4.5,9.6,0.9,2.8


In [16]:
# Renombramos Y_GE75 y la columnas "frequenc":
df.replace({"Y_GE75" : "Y75+"}, inplace = True)
df.rename(columns = {"FREQUENC": "FREQUENCY"}, inplace = True)
df.head()

Unnamed: 0,FREQUENCY,SEX,AGE,EU28,BE,BG,CZ,DK,DE,EE,...,PL,PT,RO,SI,SK,FI,SE,UK,IS,NO
0,DAY,F,Y15-24,0.2 e,0.2,0.5,0.0,0.0,0.4,0.0,...,0.0,0.0,0.0,0.4,0.2,0.0,0.2,0.2,0.3,0.2
1,DAY,F,Y25-34,1.3 e,2.0,3.4,1.4,0.0,1.5,0.2,...,0.3,2.1,0.2,0.4,0.4,0.7,0.3,2.0,0.0,0.0
2,DAY,F,Y35-44,2.7 e,5.7 u,4.9,1.3,1.8,3.2,0.3,...,0.2,8.1,0.4,1.3,0.8,1.4,0.9,2.3,0.0,1.1
3,DAY,F,Y45-64,6.1 e,14.8,4.5,5.1,9.0,7.0,0.2,...,0.7,15.5,0.8,3.7,1.0,2.0,1.4,6.3,0.2,1.6
4,DAY,F,Y65-74,8.8 e,15.8,3.1,5.9,21.9,7.6,0.3,...,0.3,20.4,1.3,6.6,0.9,1.4,4.5,9.6,0.9,2.8


In [17]:
# Exportamos csv limpio:
df.to_csv("BBDD.csv")

In [18]:
# Importamos csv limpio:
df = pd.read_csv('./BBDD.csv')

In [19]:
# Creamos df con sólo datos mensuales:
df_month = df[df["FREQUENCY"]== "MTH"].iloc[:,2:]
df_month.head()

Unnamed: 0,SEX,AGE,EU28,BE,BG,CZ,DK,DE,EE,IE,...,PL,PT,RO,SI,SK,FI,SE,UK,IS,NO
18,F,Y15-24,30.2 e,33.2,16.5,41.4,47.5,38.6,26.3,38.0,...,21.8,27.2,14.2,38.0,28.7,39.7,30.8,25.2,39.0,64.8
19,F,Y25-34,30.8 e,35.5,36.6,41.4,41.1,33.8,36.1,41.0,...,30.1,24.7,27.5,35.6,30.5,38.1,32.8,24.8,36.6,62.7
20,F,Y35-44,27.1 e,24.7 u,31.8,33.6,34.6,28.5,36.3,29.5,...,32.0,21.1,29.6,30.8,32.6,35.9,28.8,19.3,46.0,58.5
21,F,Y45-64,22.3 e,25.8,25.1,30.4,23.4,26.3,31.9,22.2,...,24.7,13.3,26.4,25.7,27.1,28.5,25.7,15.0,41.9,50.7
22,F,Y65-74,16.9 e,16.3,16.9,24.5,19.0,26.4,17.1,19.9,...,13.7,6.5,17.6,21.5,17.5,26.5,22.4,12.1,34.0,41.4


In [20]:
# Creamos pivot_tables de sexo y edad, cogiendo como valor la media los valores de cada país:
gender_month = df_month.pivot_table(values=['EU28', 'BE', 'BG', 'CZ', 'DK', 'DE', 'EE',
       'IE', 'EL', 'ES', 'HR', 'IT', 'CY', 'LV', 'LT', 'LU', 'HU', 'MT', 'AT',
       'PL', 'PT', 'RO', 'SI', 'SK', 'FI', 'SE', 'UK', 'IS', 'NO'], columns='SEX', aggfunc='mean')

age_month = df_month.pivot_table(values=['EU28', 'BE', 'BG', 'CZ', 'DK', 'DE', 'EE',
       'IE', 'EL', 'ES', 'HR', 'IT', 'CY', 'LV', 'LT', 'LU', 'HU', 'MT', 'AT',
       'PL', 'PT', 'RO', 'SI', 'SK', 'FI', 'SE', 'UK', 'IS', 'NO'], columns='AGE', aggfunc='mean')

In [21]:
# Concatenamos en un mismo df y reseteamos index:
data_month = pd.concat([gender_month, age_month], axis=1)
data_month = data_month.reset_index()
data_month.head()

Unnamed: 0,index,F,M,T,Y15-24,Y25-34,Y35-44,Y45-64,Y65-74,Y75+
0,AT,35.833333,28.083333,32.116667,42.733333,36.7,35.0,29.966667,24.633333,23.033333
1,BG,23.333333,26.483333,24.683333,21.1,35.633333,31.3,23.633333,20.133333,17.2
2,CY,17.6,27.883333,22.433333,23.566667,33.466667,28.9,21.933333,16.133333,11.833333
3,CZ,31.816667,20.216667,26.183333,33.366667,34.633333,28.233333,23.566667,19.766667,16.866667
4,DE,29.816667,22.066667,26.066667,34.866667,29.866667,26.7,21.866667,20.7,21.9


In [22]:
# Representamos las medias de la frecuencias mensuales de consumo de alcohol, por sexo y país:
data_month.iplot(kind='bar', x='index', y=['F', 'M', 'T'], xTitle='Country', 
           yTitle='Avg. by sex', title='Monthly avg. by sex and country')

In [23]:
# Representamos las medias de la frecuencias mensuales de consumo de alcohol, por edad y país:
data_month.iplot(kind='bar', x='index', y=['Y75+', 'Y15-24', 'Y25-34', 'Y35-44', 'Y45-64', 'Y65-74'], 
           xTitle='Country', yTitle='Avg. by age', title='Monthly avg. by age and country')