# Data Acquisition. 0 Data Structure of Files

In [1]:
# import missingno as msno

In [2]:
# all modules
import warnings
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import requests

import re

from datetime import datetime
from sklearn.preprocessing import OneHotEncoder

# from zipfile import ZipFile
from bs4 import BeautifulSoup

from functools import reduce

In [3]:
# global stuff
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 25)

In [4]:
# load sql extension for jupyter
%load_ext sql
%config SqlMagic.autocommit = False

In [5]:
# load database
%sql sqlite:///../data/raw/raw_data_project_m1.db

In [6]:
%%sql tables_structure <<

select * 
from 
    sqlite_master

where 
    type='table' AND
    name NOT LIKE 'sqlite_%'

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable tables_structure


## OPENING TABLES

In [7]:
%%sql personal_info <<

select * from personal_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable personal_info


In [8]:
%%sql country_info <<

select * from country_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable country_info


In [9]:
%%sql career_info <<

select * from career_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable career_info


In [10]:
%%sql poll_info <<

select * from poll_info;

 * sqlite:///../data/raw/raw_data_project_m1.db
Done.
Returning data to local variable poll_info


In [11]:
df_personal_info = personal_info.DataFrame()
df_country_info = country_info.DataFrame()
df_career_info = career_info.DataFrame()
df_poll_info = poll_info.DataFrame()

# WEB SCRAPPING

In [12]:
def get_dictCountries(web_page):
    """
    Note_1: Only works for a part of the web scrapping, therefore dict lenght is limited to countries_in_eu
    Note_2: 'https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:Country_codes'
    ------------------------------------------------------------------------------------------------    
    INPUT: url
    OUTPUT: dict -> 'SP': 'Spain'

    """
    countries_in_europe = 45
        
    url= web_page
    html = requests.get(url).content

    soup = BeautifulSoup(html, 'lxml')
    table = soup.find_all('table')
    
    countries_list = list(filter(None, 
                                 [re.sub('\n','', re.sub('</td>','', re.sub('^<td>','', str(i)))) 
                                  for i in soup.find_all('td')]))

    """
    Note
    ------------------------------------------------------------------------------------------------
    Esta parte es mejorable: Kosovo* y CH_X_HK etc ...  
    """
    countries_names = [(''.join(i.strip())).split('<')[0] for k, i in enumerate(countries_list) if k%2 == 0]
    countries_alpha2= [(''.join(i.strip())).split('<')[0][1:3] for k, i in enumerate(countries_list) if k%2 != 0]
    
    countries_dict = dict(zip(countries_alpha2[:countries_in_europe], countries_names[:countries_in_europe]))
    

    """
    IMPERFECT IMPLEMENTATIONS (as seen in wikipedia)
    ------------------------------------------------------------------------------------------------
    The European Commission generally uses ISO 3166-1 alpha-2 codes with two exceptions: 
    EL (not GR) is used to represent Greece, and UK (not GB) is used to represent the United Kingdom
    """
    countries_dict = dict(zip(countries_alpha2[:countries_in_europe], countries_names[:countries_in_europe]))
    countries_dict['GB'] = 'United Kingdom'
    countries_dict['GR'] = 'Greece'

    return countries_dict

In [13]:
countries_dict = get_dictCountries('https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:Country_codes')

In [14]:
# DICTIONARY TO REPLACE WITH WEB SCRAPPING

countries_code_alpha_2 = {
    'AT' : 'Austria',   'BE' : 'Belgium',        'BG' : 'Bulgaria',  'CY' : 'Cyprus',     'CZ' : 'Czechia',
    'DE' : 'Germany',   'DK' : 'Denmark',        'EE' : 'Estonia',   'ES' : 'Spain',      'FI' : 'Finland', 
    'FR' : 'France',    'GB' : 'United Kingdom', 'GR' : 'Greece',    'HR' : 'Croatia',    'HU' : 'Hungary', 
    'IE' : 'Ireland',   'IT' : 'Italy',          'LT' : 'Lithuania', 'LU' : 'Luxembourg', 'LV' : 'Latvia',  
    'MT' : 'Malta',     'NL' : 'Netherlands',    'PL' : 'Poland',    'PT' : 'Portugal',   'RO' : 'Romania',
    'SE' : 'Sweden',    'SI' : 'Slovenia',       'SK' : 'Slovakia',
} 
"""Hay 28 paises y me he sacado 45 de lap pag (TODA EUROPA) """ 

'Hay 28 paises y me he sacado 45 de lap pag (TODA EUROPA) '

In [15]:
def get_dictEuropeanCountries():
    wiki_url= 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2'
    url= 'https://www.euro.who.int/en/countries'
    
    html_wikipage = requests.get(wiki_url).content
    html_eurocountries = requests.get(url).content
        
    countries_list = pd.read_html(html_wikipage, header=0)[2]
    countries_dict = dict(zip(countries_list['Code'], countries_list['Country name (using title case)']))

    soup = BeautifulSoup(html_eurocountries, 'lxml')
    table = soup.find_all('section', {'class':'clearfix'})

    all_contries = [content.text for content in table]
    eu_countries = list(filter(None, all_contries[0].split('\n')))

    european_countries_values = [val for k,val in countries_dict.items() for eu_c in eu_countries if val == eu_c]
    european_countries_key = [k for k,val in countries_dict.items() for eu_c in eu_countries if val == eu_c]

    return dict(zip(european_countries_key, european_countries_values))

In [16]:
european_countries = get_dictEuropeanCountries()

In [17]:
european_countries

{'AD': 'Andorra',
 'AL': 'Albania',
 'AM': 'Armenia',
 'AT': 'Austria',
 'AZ': 'Azerbaijan',
 'BA': 'Bosnia and Herzegovina',
 'BE': 'Belgium',
 'BG': 'Bulgaria',
 'BY': 'Belarus',
 'CH': 'Switzerland',
 'CY': 'Cyprus',
 'CZ': 'Czechia',
 'DE': 'Germany',
 'DK': 'Denmark',
 'EE': 'Estonia',
 'ES': 'Spain',
 'FI': 'Finland',
 'FR': 'France',
 'GB': 'United Kingdom of Great Britain and Northern Ireland',
 'GE': 'Georgia',
 'GR': 'Greece',
 'HR': 'Croatia',
 'HU': 'Hungary',
 'IE': 'Ireland',
 'IL': 'Israel',
 'IS': 'Iceland',
 'IT': 'Italy',
 'KG': 'Kyrgyzstan',
 'KZ': 'Kazakhstan',
 'LT': 'Lithuania',
 'LU': 'Luxembourg',
 'LV': 'Latvia',
 'MC': 'Monaco',
 'ME': 'Montenegro',
 'MK': 'North Macedonia',
 'MT': 'Malta',
 'NL': 'Netherlands',
 'NO': 'Norway',
 'PL': 'Poland',
 'PT': 'Portugal',
 'RO': 'Romania',
 'RS': 'Serbia',
 'RU': 'Russian Federation',
 'SE': 'Sweden',
 'SI': 'Slovenia',
 'SK': 'Slovakia',
 'SM': 'San Marino',
 'TJ': 'Tajikistan',
 'TM': 'Turkmenistan',
 'TR': 'Turkey'

In [18]:
def get_dictWorldCountries():
    wiki_url= 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2'
    
    html_wikipage = requests.get(wiki_url).content
        
    countries_list = pd.read_html(html_wikipage, header=0)[2]
    countries_dict = dict(zip(countries_list['Code'], countries_list['Country name (using title case)']))

    return countries_dict    

In [19]:
input('Select an option:
        ')

SyntaxError: EOL while scanning string literal (<ipython-input-19-e71b6541524f>, line 1)

# FUNCTIONS

In [None]:
def ageStr_to_ageNum(serie):
    """
    INPUT -> serie df[] = ['61 years old', '57 years old', '32 years old'] -> full strings
    OUPUT -> serie df[] = [ 61 57 32 45 41 1990 2000]                      -> only integers
    """
    return serie.apply(lambda x: re.sub('\D', '', x)).astype(int)

def year_to_age(serie):
    """
    INPUT -> serie:  df[] = [ 61 57 32 45 41 1990 2000]   -> ages + years (all ints)
    OUTPUT -> serie: df[] = [ 61 57 32 45 41   30   20]   -> only ages    (all ints)
    """   
    year_db = 2016 # DB is from this year!
    return serie.apply(lambda x: year_db - x if int(x) > 200 else x)

def year_update(serie):
    """
    La tabla está en 2016, hay que actualizar datos para uqe no haya incongruencias entre Age y Age_Group
    """
    year_now = datetime.today().year
    year_db = 2016
    return serie.apply(lambda x: (year_now - year_db) + x)

#-----------------------------------------------------------------------------------------------------#

def countryCode_to_countryName(serie, web_page):
    # CAMBIAR SEGÚN DICCIONARIO DE EUROPEAN COUNTRIES
    """
    INPUT  ->      AT      FR     ES  -> alpha_2 code
    OUTPUT -> Austria  France  Spain  -> full name
    --------------------------------------------------------------------------------
    Note: countries_code_alpha_2 is a dict from get_dictCountries()
    """
    url = web_page
    country_dictionary = get_dictCountries(url)
    
    return serie.apply(lambda x: country_dictionary[str(x)]) # Esto se puede hacer en la API

#-----------------------------------------------------------------------------------------------------#

def null_to_unknown(serie):
    """
    INPUT  -> no  high     None  medium     None  low  no
    OUTPUT -> no  high  unknown  medium  unknown  low  no
    
    Podria considerarse tb no... aquí hay interpretación de datos!
    """
    return serie.apply(lambda x: 'unknown' if x == None else x)


def gender_homogenization(serie):
    """
    INPUT  -> female, FeMale, Fem, male, Male
    OUTPUT ->      F,      F,   F,    M,    M
    """
    serie = serie.apply(lambda x: re.sub('^f\w+|^F\w+', 'F', x))
    serie = serie.apply(lambda x: re.sub('^m\w+|^M\w+', 'M', x))
    return serie


def context_homogenization(serie):
    """
    INPUT  ->   urban  city  rural  Non-Rural Countryside  -> various types of response
    OUTPUT ->   urban  urban rural      urban       rural  -> two types of response
    ------------------------------------------------------------------------------------
    LIST OF POSSIBLE ANSWERS TAKING INTO ACCOUNT serie count values
    """
    urban_context = ['urban', 'city', 'non-rural', 'Non-Rural']
    rural_context = ['rural', 'country', 'countryside', 'Country']
    
    return  ['urban_context' if response in urban_context 
        else 'rural_context' if response in rural_context 
        else None 
        for response in serie]



def yes_no_to_bool(serie):
    """
    Appliable to yes/no questions with multiple formats, to transform into boolean info
    INPUT  -> YES yes Yes yES No NO nO no  -> type str
    OUTPUT ->   1   1   1   1  0  0  0  0  -> type bool
    """
    serie = serie.apply(lambda x: re.sub('^y\w+|^Y\w+', '1', x))
    serie = serie.apply(lambda x: re.sub('^n\w+|^N\w+', '0', x)).astype(int)
    return serie.astype(bool)



################################################################################# EJERCICIO DE API ####

def get_job_code(serie):
    """
    In table career_info
    INPUT  -> hash
    OUTPUT -> dictionary w hash info provided by API
    """
    json_dicts = []
    uuid_db = serie.unique().tolist()
    
    for job_code in uuid_db:
        response = requests.get(f'http://api.dataatwork.org/v1/jobs/{job_code}')
        json_dicts.append(response.json())
        
    return json_dicts


def normalized_jobs_col(coded_series, json_data):
    """
    INPUT  -> None  <hash>                 --> coded_col
    OUTPUT -> None  <job name as in API>   --> new_col
    """
    new_col = [d.get('title')  for job_code in coded_series 
                               for d in json_data 
                               if d.get('uuid') == job_code]
    # d.get('title') or d.get('normalized_job_title')
    return new_col  # Con aply or merge tb


################################################################################# SCKLEARN COSAS DE ML ####

def separate_df_to_bools(df, cols_to_separate, cols_separated):
    """
    INPUT  -> df[col].unique() = [range_1, range_2, range_3]
    OUTPUT -> df[[range_1, range_2, range_3]] with boolean responses
    """
    df_encoder = OneHotEncoder(dtype=bool, sparse=True)
    df = pd.DataFrame(df_encoder.fit_transform(df[cols_to_separate]).toarray(),columns=cols_separated)
    return df

################################################################################

def save_df_to_csv(df, path, name):
    
    print(f'\t\t ->Saving {df} in {path} as {name}')
    path = '../' + f'{path}'
    return df.to_csv(f'{path}/{name}.csv')
    print('\t\t ->Job done!')
    
    

### 1_ DF_PERSONAL_INFO

In [None]:
try:
        
    # Paso 1_ normalización de columna Age: str -> int -> change years w/ ages -> update current year?
    df_personal_info['age'] = ageStr_to_ageNum(df_personal_info['age'])
    df_personal_info['age'] = year_to_age(df_personal_info['age'])
    # df_personal_info['age'] = year_update(df_personal_info['age'])
    """ El problema de actualizar la fecha es que hay que reevaluar los grupos de edad"""

    # Paso 2_ String Operations: multiple inputs in binomial cols -> only 2 values for 2 options
    df_personal_info['gender'] = gender_homogenization(df_personal_info['gender'])
    df_personal_info['dem_has_children'] = yes_no_to_bool(df_personal_info['dem_has_children'])
    
    # Paso 3_ Separate cols for boolean options
    initial_cols = ['gender', 'age_group']
    final_cols = ['gender_Female', 'gender_Male', 'ageGroup_14_25', 'ageGroup_26_39', 'ageGroup_40_65', 'ageGroup_juvenile']
    
    new_bool_df = separate_df_to_bools(df_personal_info, initial_cols, final_cols)
    df_personal_info = df_personal_info.join(other= new_bool_df, on=None, how='left', sort=False)
    
    # Paso 4_Guardar la tabla en local
    save_df_to_csv(df_personal_info, 
                   path='data/processed', 
                   name= 'personal_info')

except:
    print('Something went wrong')

finally:
    """ MEMORY USAGE from 377.0+ KB to 367.6+ KB | FROM object(5) to bool(7), object(4) """
    print('Done cleaning df_personal_info!')

In [None]:
df_personal_info.head(3)

### 2_ DF COUNTRY_INFO 

In [None]:
df_personal_info.info()

In [None]:
"""Country info is list_of_dfs[0]"""
try:
    initial_cols = ['rural']
    final_cols = ['rural_context', 'urban_context']
    
    ### Here comes the dict from web scrapping
    
    # Paso 1_ transformar datos en cols en elementos más legibles
    df_country_info['country_Names'] = countryCode_to_countryName(
        serie= df_country_info['country_code'], 
        web_page= 'https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:Country_codes')
    
    # Paso 1_ String Operations multiple inputs to binomial cols -> only 2 values from 2 options
    df_country_info['rural'] = context_homogenization(df_country_info['rural'])
    
    # Paso 2_
    new_bool_df = separate_df_to_bools(df_country_info, initial_cols, final_cols)
    
    df_country_info = df_country_info.join(other= new_bool_df, on=None, 
                                                               how='left', 
                                                               sort=False)

    df_country_info.drop(columns='rural', inplace = True)
    
except:
    print('Something went wrong with [acquire_table_career_info]')

finally:
    print('''\t\t\t  >> Done cleaning df_career_info!. 
             \t\t\t  >> Chekout /data/processed/''')

In [None]:
df_country_info.info(3)

### 3_ DF CAREER_INFO 

In [None]:
"""Country info is list_of_dfs[0]"""
try:
    print(f'')
    # Columnas a separar en booleanos
    initial_cols  = ['dem_education_level']
    final_cols = ['High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed', 'Unknown_Ed']

    # Paso 1_ cambiar columnas de yes/no -> booleanos + Nones to Unknown en col de valores cualitativos
    df_career_info['dem_full_time_job'] = yes_no_to_bool(df_career_info['dem_full_time_job'])
    df_career_info['dem_education_level'] = null_to_unknown(df_career_info['dem_education_level'])
    
    # Paso 2_ A traves de API, añadir columna de nombres de JOBS
    json_job_data = get_job_code(df_career_info['normalized_job_code'])
    df_career_info['normalized_job_name'] = normalized_jobs_col(df_career_info['normalized_job_code'], json_job_data)
    
    # Paso 3_ Creación de nuevas columnas, a partir de columna inicial
    new_bool_df = separate_df_to_bools(df_career_info, initial_cols, final_cols)
    df_career_info = df_career_info.join(other= new_bool_df, on=None, how='left', sort=False)

    # Paso 4_ Eliminación de información redundante
    cols_to_del = ['dem_education_level', 'normalized_job_code']
    df_career_info.drop(columns= cols_to_del, inplace= True)
    
except:
    print('Something went wrong with [acquire_table_career_info]')

finally:
    """MEMORY USAGE from 301.7+ KB to 207.4+ KB | FROM objects(4) to bool(6), object(2) """
    print('Done cleaning df_career_info!')

In [None]:
df_career_info.head(10)

## 4_ DF POLL_INFO 

In [None]:
# df_poll_info.info()
""" dtypes: object(6) | memory usage: 452.4+ KB """

In [None]:
df_poll_info.describe()

In [None]:
def split_str_at_char(str_to_split, cutter):
    if isinstance(str_to_split, str) and isinstance(cutter, str):
        return re.split(cutter, str_to_split)[1].capitalize()
    else:
        print('EntryError [at split_str_at_char]: wrong type of inputs')
    
def get_uniqueResponses(serie, separator):
    """
    This function searches for the uniques responses in a multiple choice response 
    presented as a concatenated string.
    INPUT  ->  'E | F | C | D'   'D | A'    'C'   'E | B'  --> Unsorted concat strings
    OUTPUT ->  ['E', 'F', 'C', 'D', 'A', 'B']              --> Unsorted unique strings
        
    """
    try:
        if isinstance(separator, str):
            list_of_all_responses = set()
            all_responses = set()

            flattened_list_of_responses = reduce(lambda x,y: x+y, 
                                                 [item.split(separator) for item in serie.unique()])

            for response in flattened_list_of_responses:
                if response not in list_of_all_responses:
                    list_of_all_responses.add(response)

            return list(list_of_all_responses)
        else:
            print('EntryError [at get_uniqueResponses]: wrong type of inputs')
        
    except:
        pass


    
def to_binary_matrix_of_equals(list_uniques, list_to_eval):
    
    list_uniques_lenghts = [len(i) for i in list_uniques]
    
    list_to_eval_iterabl = iter(list_to_eval)
    list_to_eval_lenghts = list(reduce(lambda x,y: x+y, list_to_eval))

    list_of_lists = []
    
    
    try:
        for v in list_to_eval_lenghts:
            arr = [len(i) for i in next(list_to_eval_iterabl)]
            list_of_arrays = []

            for len_num in arr:
                list_of_arrays.append( np.where( np.array(list_uniques_lenghts) == len_num, 1, 0))

            list_of_lists.append(list_of_arrays)
            
    except StopIteration:
        pass
    
    
    flat_arrays = [np.sum(i, axis = 0) for i in list_of_lists]
    binary_matrix = [i.tolist() for i in flat_arrays]
    
    return binary_matrix
    

####################################################################################  DONT NEED THIS  ######
def length_str_nlist(nlist):
    """
    INPUT   -> [['Hello'], ['Hola', 'Caracola!']] -> nested list of strings, diff len
    OUTPUT  -> [[5], [4, 9]]                      -> nested list of integers, diff len
    """
    graph_list_str_lenghts = []

    # iterate through nested items whithout flattening list
    # works for [[1,2], [3,4]]
    for lsts in nlist:
        graph_sublist_str_lenghts = []
        
        for i in lsts:
            graph_sublist_str_lenghts.append(len(i))

        graph_list_str_lenghts.append(graph_sublist_str_lenghts)
        
    return graph_list_str_lenghts


def binary_matrix(list_uniques, eval_matrix):
    """
    This function iterates through a list and searches for equalities into another list.
    INPUT  -> [1,2,5,1,2,1,2,5]  + [1,2]   --> these are the lenghts of strings to search for equals
    OUTPUT -> [1,0,0,1,0,1,0,0]    [1]     --> equality in string of len = 1
              [0,1,0,0,1,0,1,0]    [2]     --> equality in string of len = 2
    """
    
    list_of_lists = []
    
    # iterate through elements in nested lists without flattening list
    for lst in eval_matrix:
        list_of_arrays =[]
        
        for numb in lst:
            list_of_arrays.append(np.where(np.array(list_uniques) == numb, 1, 0))
        
        list_of_lists.append(list_of_arrays)
    
    flat_arrays = [np.sum(i, axis = 0) for i in list_of_lists]
    binary_matrix = [i.tolist() for i in flat_arrays]
    
    return binary_matrix

####################################################################################

In [None]:
def multiple_choice_col_to_df(serie, separator):
    """
    Makes all the operations to return a boolean df with all the possible responses from each poll
    Nested functions : get_uniqueResponses()   to_binany_matrix_of_equals()
    """
    poll_info_allResponses = get_uniqueResponses(serie, separator)

    graph_list_of_responses = serie.apply(lambda x: x.split(separator))

    bin_matrix = to_binary_matrix_of_equals(poll_info_allResponses, graph_list_of_responses)

    df = pd.DataFrame(bin_matrix, columns=poll_info_allResponses, dtype='bool') 

    return df
    

In [None]:
df_poll_info['question_bbi_2016wave4_basicincome_effect'] = [split_str_at_char(response, 'Û_ ') 
                                                            if re.search('Û_ ', response)
                                                            else response
                                                            for response in df_poll_info['question_bbi_2016wave4_basicincome_effect']
                                                            ]

##################################################################################################################

FALTA AÑADIR LA COLUMNA DE UUID

In [None]:

df_poll_basicincome_awareness = multiple_choice_col_to_df(df_poll_info['question_bbi_2016wave4_basicincome_awareness'], ' | ')

df_poll_basicincome_vote = multiple_choice_col_to_df(df_poll_info['question_bbi_2016wave4_basicincome_vote'], separator= ' | ')

df_poll_basicincome_effect = multiple_choice_col_to_df(df_poll_info['question_bbi_2016wave4_basicincome_effect'], separator= ' | ')


df_poll_basicincome_argumentsagainst = multiple_choice_col_to_df(df_poll_info['question_bbi_2016wave4_basicincome_argumentsagainst'], separator= ' | ')

df_poll_basicincome_argumentsfor = multiple_choice_col_to_df(df_poll_info['question_bbi_2016wave4_basicincome_argumentsfor'], separator= ' | ')


In [None]:
%%timeit
df_poll_basicincome_argumentsagainst = multiple_choice_col_to_df(df_poll_info['question_bbi_2016wave4_basicincome_argumentsagainst'], separator= ' | ')

In [None]:
df_poll_basicincome_awareness.info()

In [None]:
df_poll_basicincome_vote.info()

In [None]:
df_poll_basicincome_effect.info()

In [None]:
df_poll_basicincome_argumentsfor..info()

In [None]:
df_poll_basicincome_argumentsfor.shape

In [None]:
df_poll_basicincome_argumentsagainst.info()

In [None]:
df_poll_basicincome_argumentsagainst.shape

################################################################################################################

## TABLAS FINAL LIMPIADAS

Exportar tablas en LOCAL para poder hacer wrangling con ellas

In [None]:
"""
df_personal_info

df_country_info

df_career_info

df_poll_basicincome_awareness
df_poll_basicincome_vote
df_poll_basicincome_effect
df_poll_basicincom_argumentsfor
df_poll_basicincome_argumentsagainst

"""

"""

"""

In [None]:
1 # getting the table to add data

dfs = [df_career_info, df_country_info, df_personal_info]

df_joined = reduce(lambda left,right: pd.merge(left,right,on='uuid'), dfs) #Apply

df_final = df_joined[['country_Names', 'normalized_job_name', 'gender', 'dem_full_time_job']]

In [None]:
df_final.head(3)

In [None]:
####### ESTO ES MI DIVIDENDO: La cantidad de gente con un determinado trabajo en un país, dividido por sexos

df_2 = df_final.groupby(['country_Names','normalized_job_name','gender']).agg('count').reset_index()
df_2.rename(columns={"dem_full_time_job": "Quantity"}, inplace=True)



In [None]:
########################################################################### this is good good good ############
# DIVIDENDO ?
filtr = ['country_Names','normalized_job_name','gender']

df_2 = df_final.assign(quantity = 1)\
                .groupby(['country_Names','normalized_job_name','gender'])\
                .agg('count')\
                .reset_index()\
                .drop(columns='dem_full_time_job')

In [None]:
df_2.head(2)

In [None]:
###### ESTO ES MI DIVIDOR: La cantidad de gente encuestada por pais

df_4 = df_2.groupby(['country_Names'])['normalized_job_name'].nunique()
df_5 = df_2.groupby(['country_Names', 'gender'], as_index = False)['normalized_job_name','Quantity'].agg('count')

In [None]:
df_5.drop(columns='normalized_job_name', inplace=True, errors='raise')
df_5 = df_5.rename(columns = {'Quantity': 'totals_per_gender'})

In [None]:
df_5

In [None]:
df_2.head(2)

In [None]:
df_4 = df_4.to_frame()
df_4 = df_4.rename(columns={'normalized_job_name': 'totals_per_country'})

In [None]:
df_5.head()

In [None]:
df_4.shape

In [None]:
df_2.head(3)

In [None]:
df_all = df_2.join(df_4, on='country_Names', how='left')

In [None]:
df_all_2 = df_5.join(df_4, on='country_Names', how= 'left')

In [None]:
df_all_2

In [None]:
filtr = df_all['country_Names'] == 'France'
df_all[filtr]

In [None]:
df_all['Percentage'] = round(df_all['Quantity'] / df_all['totals_per_country'] * 100, 2)
df_filtr = df_all[filtr]

In [None]:
### CHART 1

# Tabla que muestra distribución según sexos de empleos (por país)
# Falta por trabajar que me muestre la cantidad, no su existencia
"""
df_filtr.groupby(['gender', 'normalized_job_name']).size().unstack().plot(kind='bar',stacked=True, figsize=(14, 25))
plt.show()
"""

In [None]:
"""
df_filtr.plot(kind='bar',x='normalized_job_name',y='Percentage')
plt.show()
"""