# Data Acquisition. 0 Data Structure of Files

In [1]:
# installations (be careful!!!)
#!pip install missingno

### THIS WAS ALREADY HERE

In [512]:
# all modules
import warnings
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import requests

from re import sub
from re import split
from re import search
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder

from zipfile import ZipFile

In [235]:
# Notebook-wide settings.

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the cell outputs.
%matplotlib inline
# Cap how many rows / columns pandas prints when a DataFrame is displayed.
pd.set_option('display.max_rows', 13)
pd.set_option('display.max_columns', 13)

In [4]:
# Load the `sql` IPython extension that provides the %sql / %%sql magics used below.
%load_ext sql
# Switch off the SqlMagic autocommit option.
%config SqlMagic.autocommit = False

In [5]:
# Point the %sql magic at the raw SQLite database (path is relative to this notebook).
%sql sqlite:///../data/raw/raw_data_project_m1.db

In [6]:
%%sql tables_structure <<

select * 
from 
    sqlite_master

where 
    type='table' AND
    name NOT LIKE 'sqlite_%'


 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable tables_structure


## 1_ DF PERSONAL_INFO 

In [471]:
%%sql personal_info <<

select * from personal_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable personal_info


In [472]:
# Convert the result captured by the `%%sql personal_info <<` cell into a pandas DataFrame.
df_personal_info = personal_info.DataFrame()
# Show (rows, columns) as the cell output.
df_personal_info.shape

### Funciones y Resultado

In [474]:
def ageStr_to_ageNum(serie):
    """
    Strip every non-digit character from each value and return integers.

    INPUT  -> serie df[] = ['61 years old', '57 years old', '1990']  -> full strings
    OUTPUT -> serie df[] = [ 61 57 1990]                             -> only integers

    Values that are not strings (e.g. ages already converted by a previous
    run of the cleaning cell) are handed to the integer cast unchanged, so the
    function can be applied twice. A string without any digit still raises
    ValueError in the cast.
    """
    # Raw string: '\D' inside a plain literal is an invalid escape sequence.
    digits_only = serie.apply(lambda x: sub(r'\D', '', x) if isinstance(x, str) else x)
    return digits_only.astype(int)


def year_to_age(serie, year_now=None, max_age=200):
    """
    Turn birth years that are mixed into an age column into ages.

    INPUT  -> serie:  df[] = [ 61 57 32 45 41 1990 2000]   -> ages + years (all ints)
    OUTPUT -> serie:  df[] = [ 61 57 32 45 41   30   20]   -> only ages    (all ints)
    (the example output corresponds to year_now=2020)

    Parameters
    ----------
    serie : pandas Series of values accepted by int().
    year_now : reference year used for the subtraction. Defaults to the
        current calendar year; pass it explicitly for reproducible results.
    max_age : any value strictly greater than this is treated as a birth
        year instead of an age (default 200, as before).
    """
    if year_now is None:
        year_now = datetime.today().year
    return serie.apply(lambda x: year_now - x if int(x) > max_age else x)

def gender_homogenization(serie):
    """
    Collapse the different spellings of a gender answer to a single letter.

    INPUT  -> female, FeMale, Fem, male, Male, f, m
    OUTPUT ->      F,      F,   F,    M,    M, F, M

    Only the leading run of word characters is replaced; anything after it is
    kept. Values that are not strings are returned unchanged.
    """
    def _normalize(value):
        if not isinstance(value, str):
            return value
        # `\w*` (not `\w+`) so that the one-letter answers 'f' / 'm' are normalized too.
        value = sub(r'^[fF]\w*', 'F', value)
        return sub(r'^[mM]\w*', 'M', value)

    return serie.apply(_normalize)

def yes_no_to_bool(serie):
    """
    Convert yes/no answers written in any letter case into booleans.

    INPUT  -> YES yes Yes yES y No NO nO no n  -> type str
    OUTPUT ->   1   1   1   1 1  0  0  0  0 0  -> type bool

    Values that are not strings (e.g. booleans produced by a previous run)
    go straight through int(), so the function can be applied twice.
    Any other text raises ValueError, as before.
    """
    def _to_int(value):
        if isinstance(value, str):
            # `\w*` (not `\w+`) so that the one-letter answers 'y' / 'n' are accepted.
            value = sub(r'^[yY]\w*', '1', value)
            value = sub(r'^[nN]\w*', '0', value)
        return int(value)

    return serie.apply(_to_int).astype(bool)


def separate_df_to_bools(df, cols_to_separate, cols_separated):
    """
    One-hot encode the given columns into a new DataFrame of booleans.

    INPUT  -> df[col].unique() = [range_1, range_2, range_3]
    OUTPUT -> df[[range_1, range_2, range_3]] with boolean responses

    Parameters
    ----------
    df : source DataFrame.
    cols_to_separate : list of column names in `df` to encode.
    cols_separated : names for the encoded columns, in the order the encoder
        emits them; pandas raises ValueError if the count does not match.

    Returns
    -------
    DataFrame with one boolean column per encoded category, carrying the same
    index as `df` so that it can be joined back row by row.
    """
    # No `sparse=` keyword: it was renamed to `sparse_output=` in scikit-learn 1.2
    # and later removed. The default output is sparse in every version, which is
    # what `.toarray()` below expects.
    df_encoder = OneHotEncoder(dtype=bool)
    encoded = df_encoder.fit_transform(df[cols_to_separate]).toarray()
    return pd.DataFrame(encoded, columns=cols_separated, index=df.index)

In [476]:
# Columns to one-hot encode and the names given to the resulting boolean columns
# (the names must follow the order of the encoder's output columns).
initial_cols = ['gender', 'age_group']
final_cols = ['gender_Female', 'gender_Male', 'ageGroup_14_25', 'ageGroup_26_39', 'ageGroup_40_65', 'ageGroup_juvenile']

try:
    # Work on a copy so that a failure half-way through leaves df_personal_info untouched.
    personal_clean = df_personal_info.copy()

    personal_clean['age'] = ageStr_to_ageNum(personal_clean['age'])
    personal_clean['age'] = year_to_age(personal_clean['age'])

    personal_clean['gender'] = gender_homogenization(personal_clean['gender'])
    personal_clean['dem_has_children'] = yes_no_to_bool(personal_clean['dem_has_children'])

    new_bool_df = separate_df_to_bools(personal_clean, initial_cols, final_cols)
    personal_clean = personal_clean.join(other=new_bool_df, on=None, how='left', sort=False)

except Exception as error:
    # Report the problem instead of silently pretending the cleaning succeeded.
    print(f'Cleaning df_personal_info failed: {error!r}')

else:
    df_personal_info = personal_clean
    # Recorded memory usage: from 377.0+ KB to 367.6+ KB | from object(5) to bool(7), object(4)
    print('Done cleaning df_personal_info!')

In [479]:
df_personal_info.head(10)

Unnamed: 0,uuid,age,gender,dem_has_children,age_group,gender_Female,gender_Male,ageGroup_14_25,ageGroup_26_39,ageGroup_40_65,ageGroup_juvenile
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,61,M,False,40_65,False,True,False,False,True,False
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,57,M,True,40_65,False,True,False,False,True,False
2,83127080-da3d-0133-c74f-0a81e8b09a82,32,M,False,26_39,False,True,False,True,False,False
3,15626d40-db13-0133-ea5c-0a81e8b09a82,45,M,True,40_65,False,True,False,False,True,False
4,24954a70-db98-0133-4a64-0a81e8b09a82,41,F,True,40_65,True,False,False,False,True,False
5,28583060-dbe2-0133-ff9c-0a81e8b09a82,30,F,False,26_39,True,False,False,True,False,False
6,b50dbb80-da53-0133-8956-0a81e8b09a82,30,F,False,26_39,True,False,False,True,False,False
7,dd743bb0-d9a5-0133-53b2-0a81e8b09a82,47,M,False,40_65,False,True,False,False,True,False
8,4fde2370-da48-0133-3a42-0a81e8b09a82,34,F,False,26_39,True,False,False,True,False,False
9,9949c4c0-da5f-0133-c832-0a81e8b09a82,26,F,True,juvenile,True,False,False,False,False,True


## 2_ DF COUNTRY_INFO 

In [621]:
%%sql country_info <<

select * from country_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable country_info


In [622]:
# Convert the result captured by the `%%sql country_info <<` cell into a pandas DataFrame.
df_country_info = country_info.DataFrame()
# Show (rows, columns) as the cell output.
df_country_info.shape

(9649, 3)

### Funciones y Resultado

In [632]:
# Lookup table: two-letter (alpha-2) country code -> country name.
# TODO: replace this hand-written dictionary with web scraping.
countries_code_alpha_2 = {
    'AT': 'Austria',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'CY': 'Cyprus',
    'CZ': 'Czechia',
    'DE': 'Germany',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'ES': 'Spain',
    'FI': 'Finland',
    'FR': 'France',
    'GB': 'United Kingdom',
    'GR': 'Greece',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'IE': 'Ireland',
    'IT': 'Italy',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'MT': 'Malta',
    'NL': 'Netherlands',
    'PL': 'Poland',
    'PT': 'Portugal',
    'RO': 'Romania',
    'SE': 'Sweden',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
}

In [633]:
def countryCode_to_countryName(serie):
    """
    Replace alpha-2 country codes with the names in `countries_code_alpha_2`.

    INPUT  ->      AT      FR     ES  -> alpha_2 code
    OUTPUT -> Austria  France  Spain  -> full name

    Values that are not keys of the dictionary (unknown codes, or names
    produced by a previous run) are returned unchanged instead of raising
    KeyError.
    """
    return serie.apply(lambda x: countries_code_alpha_2.get(x, x))

def context_homogenization(serie):
    """
    Reduce the free-form urban/rural answers to two labels.

    INPUT  ->           urban           city          rural      Non-Rural    Countryside
    OUTPUT ->   urban_context  urban_context  rural_context  urban_context  rural_context

    Matching ignores letter case and surrounding whitespace. The two output
    labels are accepted as input too, so the function can be applied twice.
    The list of accepted answers was built from the value counts of the column.

    Returns a list (not a Series) with one entry per input value; answers that
    are not recognised, or are not strings, become None.
    """
    urban_answers = {'urban', 'city', 'non-rural', 'urban_context'}
    rural_answers = {'rural', 'country', 'countryside', 'rural_context'}

    def _classify(response):
        if not isinstance(response, str):
            return None
        key = response.strip().lower()
        if key in urban_answers:
            return 'urban_context'
        if key in rural_answers:
            return 'rural_context'
        return None

    return [_classify(response) for response in serie]

In [634]:
# Column to one-hot encode and the names given to the resulting boolean columns
# (the names must follow the order of the encoder's output columns).
initial_cols = ['rural']
final_cols = ['rural_context', 'urban_context']

try:
    # Work on a copy so that a failure half-way through leaves df_country_info untouched.
    country_clean = df_country_info.copy()

    country_clean['country_code'] = countryCode_to_countryName(country_clean['country_code'])
    country_clean['rural'] = context_homogenization(country_clean['rural'])

    new_bool_df = separate_df_to_bools(country_clean, initial_cols, final_cols)

    country_clean = country_clean.join(other=new_bool_df,
                                       on=None,
                                       how='left',
                                       sort=False)

    # The text column is redundant once the two boolean columns exist.
    country_clean = country_clean.drop(columns='rural')

except Exception as error:
    # Report the problem instead of silently pretending the cleaning succeeded.
    print(f'Cleaning df_country_info failed: {error!r}')

else:
    df_country_info = country_clean
    # Recorded memory usage: from 226.3+ KB to 169.70+ KB | from object(3) to bool(2), object(2)
    print('Done cleaning df_country_info!')

Done cleaning df_country_info!


In [635]:
df_country_info.head(10)

Unnamed: 0,uuid,country_code,rural_context,urban_context
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,Austria,True,False
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,Austria,False,True
2,83127080-da3d-0133-c74f-0a81e8b09a82,Austria,False,True
3,15626d40-db13-0133-ea5c-0a81e8b09a82,Austria,True,False
4,24954a70-db98-0133-4a64-0a81e8b09a82,Austria,False,True
5,28583060-dbe2-0133-ff9c-0a81e8b09a82,Austria,True,False
6,b50dbb80-da53-0133-8956-0a81e8b09a82,Austria,False,True
7,dd743bb0-d9a5-0133-53b2-0a81e8b09a82,Austria,False,True
8,4fde2370-da48-0133-3a42-0a81e8b09a82,Austria,False,True
9,9949c4c0-da5f-0133-c832-0a81e8b09a82,Austria,False,True


## 3_ DF CAREER_INFO 

In [561]:
%%sql career_info <<

select * from career_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable career_info


In [583]:
# Convert the result captured by the `%%sql career_info <<` cell into a pandas DataFrame.
df_career_info = career_info.DataFrame()
# Show (rows, columns) as the cell output.
df_career_info.shape

(9649, 4)

In [567]:
def null_to_unknown(serie):
    """
    Replace missing values with the string 'unknown'.

    INPUT  -> no  high     None  medium      NaN  low  no
    OUTPUT -> no  high  unknown  medium  unknown  low  no

    Both None and NaN count as missing. Missing could arguably be read as
    'no' instead; choosing 'unknown' is a data-interpretation decision.
    """
    # pd.isna covers None and NaN; the previous `x == None` test let NaN through.
    return serie.apply(lambda x: 'unknown' if pd.isna(x) else x)

In [584]:
# Column to one-hot encode and the names given to the resulting boolean columns
# (the names must follow the order of the encoder's output columns).
initial_cols = ['dem_education_level']
final_cols = ['High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed', 'Unknown_Ed']

try:
    # Work on a copy so that a failure half-way through leaves df_career_info untouched.
    career_clean = df_career_info.copy()

    career_clean['dem_full_time_job'] = yes_no_to_bool(career_clean['dem_full_time_job'])
    career_clean['dem_education_level'] = null_to_unknown(career_clean['dem_education_level'])

    new_bool_df = separate_df_to_bools(career_clean, initial_cols, final_cols)
    career_clean = career_clean.join(other=new_bool_df, on=None, how='left', sort=False)

    # The text column is redundant once the boolean columns exist.
    career_clean = career_clean.drop(columns='dem_education_level')

except Exception as error:
    # Report the problem instead of silently pretending the cleaning succeeded.
    print(f'Cleaning df_career_info failed: {error!r}')

else:
    df_career_info = career_clean
    # Recorded memory usage: from 301.7+ KB to 207.4+ KB | from objects(4) to bool(6), object(2)
    print('Done cleaning df_career_info!')

Done cleaning df_career_info!


In [570]:
df_career_info.head(15)

Unnamed: 0,uuid,dem_full_time_job,normalized_job_code,High_Ed,Low_Ed,Medium_Ed,No_Ed,Unknown_Ed
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,False,,False,False,False,True,False
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,True,861a9b9151e11362eb3c77ca914172d0,True,False,False,False,False
2,83127080-da3d-0133-c74f-0a81e8b09a82,False,,False,False,False,False,True
3,15626d40-db13-0133-ea5c-0a81e8b09a82,True,049a3f3a2b5f85cb2971ba77ad66e10c,True,False,False,False,False
4,24954a70-db98-0133-4a64-0a81e8b09a82,True,f4b2fb1aa40f661488e2782b6d57ad2f,True,False,False,False,False
...,...,...,...,...,...,...,...,...
10,226cf850-db28-0133-c51a-0a81e8b09a82,True,2ee80cf30e2483dfe0947adc0428f8c5,True,False,False,False,False
11,69f1f400-dc5f-0133-ad9b-0a81e8b09a82,False,,False,False,True,False,False
12,2351fbe0-de4c-0133-0d58-0a81e8b09a82,True,897ef18a9321616495831a4b4b0c1bb2,False,True,False,False,False
13,0f286c80-dd77-0133-7803-0a81e8b09a82,False,,False,False,True,False,False


In [32]:
# TODO: this has to be done with the API!
# For now, just count how many rows carry each normalized job code.
df_career_info['normalized_job_code'].value_counts()

775190277a849cba701b306a7b374c0a    101
fcaf4aac737e53ad24b9f32f08b8740f     88
d4624ed8068d24948e18303212593cd6     88
049a3f3a2b5f85cb2971ba77ad66e10c     88
dac2e921c1f887d9efc4c06fa33c1397     88
                                   ... 
58eaf730050d2e23c7361fbb782e30a2      6
7aecc34a90868aac810bf3693b11534e      5
5285758deb04c98f58cbae2747195f33      5
631bc3a12ed11340fc94ebfffa8f5f4e      4
b42566382f2d3deeca56d3667004fc9e      4
Name: normalized_job_code, Length: 156, dtype: int64

## 4_ DF POLL_INFO 

In [683]:
%%sql poll_info <<

select * from poll_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable poll_info


In [684]:
# Convert the result captured by the `%%sql poll_info <<` cell into a pandas DataFrame.
df_poll_info = poll_info.DataFrame()
# Show (rows, columns) as the cell output.
df_poll_info.shape

(9649, 6)

In [685]:
# df_poll_info.info()
""" dtypes: object(6) | memory usage: 452.4+ KB """

' dtypes: object(6) | memory usage: 452.4+ KB '

In [686]:
df_poll_info.describe().T

Unnamed: 0,count,unique,top,freq
uuid,9649,9649,fa4adae0-dc1f-0133-dc65-0a81e8b09a82,1
question_bbi_2016wave4_basicincome_awareness,9649,4,I know something about it,3389
question_bbi_2016wave4_basicincome_vote,9649,5,I would probably vote for it,3272
question_bbi_2016wave4_basicincome_effect,9649,9,A basic income would not affect my work choices,3349
question_bbi_2016wave4_basicincome_argumentsfor,9649,674,None of the above,2163
question_bbi_2016wave4_basicincome_argumentsagainst,9649,881,None of the above,1381


In [674]:
# Sorted list of the distinct answers to the awareness question, printed together with its type.
lista = list(np.sort(df_poll_info['question_bbi_2016wave4_basicincome_awareness'].unique()))
print(lista, type(lista))

['I have heard just a little about it', 'I have never heard of it', 'I know something about it', 'I understand it fully'] <class 'list'>


In [687]:
def split_str_at_char(str_to_split, cutter):
    """
    Return the capitalized text that follows the first match of `cutter`.

    `cutter` is used as a regular-expression pattern. The result is the second
    piece of the split, i.e. the text between the first and the second match
    (or the end of the string), with its first letter upper-cased and the rest
    lower-cased.

    Returns `str_to_split` unchanged when the pattern does not occur, and None
    when either argument is not a string.
    """
    if not (isinstance(str_to_split, str) and isinstance(cutter, str)):
        return None
    # Use the `split` name imported from `re`; the module itself is not imported.
    pieces = split(cutter, str_to_split)
    if len(pieces) < 2:
        return str_to_split
    return pieces[1].capitalize()

In [688]:
# Keep only the text after the 'Û_ ' marker in the answers that contain it;
# every other answer (including non-string values) is left as it is.
# `search` is the name imported from `re`; the module itself is not imported.
df_poll_info['question_bbi_2016wave4_basicincome_effect'] = [
    split_str_at_char(response, 'Û_ ')
    if isinstance(response, str) and search('Û_ ', response)
    else response
    for response in df_poll_info['question_bbi_2016wave4_basicincome_effect']
]
# TODO: check whether this can be done with apply and compare timings

In [689]:
# NOTE: this could be wrapped in a function.
# Questions to one-hot encode, in the order their answers are concatenated below.
initial_cols = [
    'question_bbi_2016wave4_basicincome_awareness',
    'question_bbi_2016wave4_basicincome_vote',
    'question_bbi_2016wave4_basicincome_effect',
]

# Sorted distinct answers of each question.
(responses_basic_income_awareness,
 responses_basic_income_vote,
 responses_basic_income_effect) = [
    list(np.sort(df_poll_info[question].unique())) for question in initial_cols
]

# Names for the one-hot columns: awareness answers, then vote answers, then effect answers.
all_responses = [
    *responses_basic_income_awareness,
    *responses_basic_income_vote,
    *responses_basic_income_effect,
]

In [690]:
# One-hot encode the three single-choice questions, naming the new columns after the answers.
new_bool_df = separate_df_to_bools(
    df_poll_info,
    cols_to_separate=initial_cols,
    cols_separated=all_responses,
)

# Attach the boolean columns to the poll data (left join on the index).
df_poll_info = df_poll_info.join(new_bool_df, how='left')

# The original text columns are kept for now:
# df_poll_info.drop(columns=initial_cols, inplace= True)

In [691]:
df_poll_info.head(10)

Unnamed: 0,uuid,question_bbi_2016wave4_basicincome_awareness,question_bbi_2016wave4_basicincome_vote,question_bbi_2016wave4_basicincome_effect,question_bbi_2016wave4_basicincome_argumentsfor,question_bbi_2016wave4_basicincome_argumentsagainst,...,Look for a different job,None of the above,Spend more time with my family,Stop working,Work as a freelancer,Work less
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,I know something about it,I would not vote,None of the above,None of the above,None of the above,...,False,True,False,False,False,False
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,I understand it fully,I would probably vote for it,A basic income would not affect my work choices,It increases appreciation for household work a...,It might encourage people to stop working,...,False,False,False,False,False,False
2,83127080-da3d-0133-c74f-0a81e8b09a82,I have heard just a little about it,I would not vote,Gain additional skills,It creates more equality of opportunity,Foreigners might come to my country and take a...,...,False,False,False,False,False,False
3,15626d40-db13-0133-ea5c-0a81e8b09a82,I have heard just a little about it,I would probably vote for it,Work less,It reduces anxiety about financing basic needs,None of the above,...,False,False,False,False,False,True
4,24954a70-db98-0133-4a64-0a81e8b09a82,I have heard just a little about it,I would probably vote for it,None of the above,It reduces anxiety about financing basic needs,It is impossible to finance | It might encoura...,...,False,True,False,False,False,False
5,28583060-dbe2-0133-ff9c-0a81e8b09a82,I understand it fully,I would vote against it,A basic income would not affect my work choices,It encourages financial independence and self-...,It is impossible to finance | It might encoura...,...,False,False,False,False,False,False
6,b50dbb80-da53-0133-8956-0a81e8b09a82,I know something about it,I would probably vote for it,Work as a freelancer,It reduces anxiety about financing basic needs,Foreigners might come to my country and take a...,...,False,False,False,False,True,False
7,dd743bb0-d9a5-0133-53b2-0a81e8b09a82,I know something about it,I would probably vote for it,A basic income would not affect my work choices,It increases appreciation for household work a...,It might encourage people to stop working | Fo...,...,False,False,False,False,False,False
8,4fde2370-da48-0133-3a42-0a81e8b09a82,I have heard just a little about it,I would probably vote for it,None of the above,It creates more equality of opportunity | It r...,Foreigners might come to my country and take a...,...,False,True,False,False,False,False
9,9949c4c0-da5f-0133-c832-0a81e8b09a82,I understand it fully,I would vote for it,Do more volunteering work,"It increases solidarity, because it is funded ...",Foreigners might come to my country and take a...,...,False,False,False,False,False,False


In [41]:
# Print every distinct answer to the "arguments for" question, followed by a blank line.
for item in df_poll_info['question_bbi_2016wave4_basicincome_argumentsfor'].unique():
    print(f'{item}\n')
    
# The string below is the last expression of the cell, so it is echoed as the cell output.
# In English: "It is a question where a number of options can be chosen."
"""Es una pregunta donde se puede elegir un núm de posibilidades"""

None of the above

It increases appreciation for household work and volunteering | It encourages financial independence and self-responsibility | It reduces anxiety about financing basic needs

It creates more equality of opportunity

It reduces anxiety about financing basic needs

It encourages financial independence and self-responsibility | It reduces anxiety about financing basic needs

It increases appreciation for household work and volunteering | It reduces anxiety about financing basic needs

It creates more equality of opportunity | It reduces anxiety about financing basic needs | It reduces bureaucracy and administrative expenses

It increases solidarity, because it is funded by everyone

It increases appreciation for household work and volunteering

It reduces anxiety about financing basic needs | It creates more equality of opportunity | It encourages financial independence and self-responsibility | It increases appreciation for household work and volunteering

It encourage


It reduces anxiety about financing basic needs | It reduces bureaucracy and administrative expenses | It increases appreciation for household work and volunteering

It creates more equality of opportunity | It increases appreciation for household work and volunteering | It encourages financial independence and self-responsibility

It increases solidarity, because it is funded by everyone | It creates more equality of opportunity | It reduces anxiety about financing basic needs | It encourages financial independence and self-responsibility | It increases appreciation for household work and volunteering

It reduces bureaucracy and administrative expenses | It encourages financial independence and self-responsibility | It reduces anxiety about financing basic needs

It reduces bureaucracy and administrative expenses | It increases appreciation for household work and volunteering | It reduces anxiety about financing basic needs | It creates more equality of opportunity | It encourages fin

It creates more equality of opportunity | It increases appreciation for household work and volunteering | It encourages financial independence and self-responsibility | It reduces anxiety about financing basic needs | It increases solidarity, because it is funded by everyone

It reduces anxiety about financing basic needs | It reduces bureaucracy and administrative expenses | It encourages financial independence and self-responsibility | It increases appreciation for household work and volunteering

It creates more equality of opportunity | It encourages financial independence and self-responsibility | It increases solidarity, because it is funded by everyone | It reduces bureaucracy and administrative expenses | It increases appreciation for household work and volunteering

It reduces anxiety about financing basic needs | It increases solidarity, because it is funded by everyone | It encourages financial independence and self-responsibility | It increases appreciation for household wo

'Es una pregunta donde se puede elegir un núm de posibilidades'

In [42]:
# Print every distinct answer to the "arguments against" question, followed by a blank line.
for item in df_poll_info['question_bbi_2016wave4_basicincome_argumentsagainst'].unique():
    print(f'{item}\n')

None of the above

It might encourage people to stop working

Foreigners might come to my country and take advantage of the benefit

It is impossible to finance | It might encourage people to stop working | It increases dependence on the state

It is impossible to finance | It might encourage people to stop working | It is against the principle of linking merit and reward

Foreigners might come to my country and take advantage of the benefit | It might encourage people to stop working | It increases dependence on the state

It might encourage people to stop working | Foreigners might come to my country and take advantage of the benefit | It is impossible to finance | Only the people who need it most should get something from the state

Foreigners might come to my country and take advantage of the benefit | It is impossible to finance

It is against the principle of linking merit and reward | It might encourage people to stop working | It increases dependence on the state

Foreigners mi

Las columnas anteriores se deberían separar en opciones individuales y convertir en una matriz de True/False; de esta forma la multiplicidad de opciones tendría sentido.

## CONEXIÓN API

In [44]:
# Check that the API host answers. Without a timeout, requests waits indefinitely
# on an unresponsive server and blocks the kernel.
requests.get('http://dataatwork.org/data/', timeout=10)

<Response [200]>