# Data Wrangling

In [1]:
# all modules

import warnings
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from functools import reduce

In [2]:
# global stuff

warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_rows', 13)
pd.set_option('display.max_columns', 13)

In [15]:
def get_dictEuropeanCountries():
    wiki_url= 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2'
    url= 'https://www.euro.who.int/en/countries'
    
    html_wikipage = requests.get(wiki_url).content
    html_eurocountries = requests.get(url).content
        
    countries_list = pd.read_html(html_wikipage, header=0)[2]
    countries_dict = dict(zip(countries_list['Code'], countries_list['Country name (using title case)']))

    soup = BeautifulSoup(html_eurocountries, 'lxml')
    table = soup.find_all('section', {'class':'clearfix'})

    all_contries = [content.text for content in table]
    eu_countries = list(filter(None, all_contries[0].split('\n')))

    european_countries_values = [val for k,val in countries_dict.items() for eu_c in eu_countries if val == eu_c]
    european_countries_key = [k for k,val in countries_dict.items() for eu_c in eu_countries if val == eu_c]

    return dict(zip(european_countries_key, european_countries_values))

## OPENING CLEANED TABLES

In [3]:
df_personal_info = pd.read_csv('../data/processed/personal_info.csv', index_col=[0])
df_career_info = pd.read_csv('../data/processed/career_info.csv', index_col=[0])
df_country_info = pd.read_csv('../data/processed/country_info.csv', index_col=[0]

In [131]:
df_career_info.head()

Unnamed: 0,uuid,dem_full_time_job,normalized_job_code,High_Ed,Low_Ed,Medium_Ed,No_Ed,Unknown_Ed,normalized_job_names
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,False,,False,False,False,True,False,
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,True,861a9b9151e11362eb3c77ca914172d0,True,False,False,False,False,Automatic Data Processing Planner
2,83127080-da3d-0133-c74f-0a81e8b09a82,False,,False,False,False,False,True,
3,15626d40-db13-0133-ea5c-0a81e8b09a82,True,049a3f3a2b5f85cb2971ba77ad66e10c,True,False,False,False,False,Data Coordinator
4,24954a70-db98-0133-4a64-0a81e8b09a82,True,f4b2fb1aa40f661488e2782b6d57ad2f,True,False,False,False,False,Database Developer


# TABLE 1 [country-job title-gender-quantity-percetnage]

In [141]:
########## FUNCTIONS TO EVAL ARGUMENT AT THE BEGINNING

def country_argparse_eval(country_argument, list_to_search):
    """
    Evaluates whether argument exists in df_country_info[country_names]
    """
    print(f'\t ··· Validating country argparse')
    
    if country_argument in list_to_search:
        print(f'\t\t >> country_argument found in ddbb')
        def_country_argument = country_argument
        return def_country_argument
    
    else:
        print(f'\t\t >> country_argument not found')
        def_country_argument = ""
        return def_country_argument

    
def country_argparse_filter(country_argument, analysis_df):
    """
    Return analysis df by argument input
    """
    list_to_search = analysis_df['country_names'].unique().tolist()
    c_arg = country_argparse_eval(country_argument, list_to_search)
    
    if c_arg == "":
        return analysis_df
    else:
        return analysis_df[analysis_df['country_names'] == c_arg]

    
def get_base_analysis_df(country_argument):
    """
    Returns base_df for analysis after evaluating country argument
    """
    dfs = [
        df_country_info[['uuid', 'country_names']],
        df_personal_info[['uuid', 'gender']],
        df_career_info[['uuid', 'normalized_job_names', 'dem_full_time_job', 'High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed']], 
        ]

    df_final = reduce(lambda left,right: pd.merge(left,right,on='uuid'), dfs)
    
    return country_argparse_filter(country_argument.capitalize(), df_final)

In [148]:
def get_percentages_gender_by_job(base_analysis_df):
    
    # Variables.
    filtr = ['country_names','normalized_job_names','gender']
    
    drop_cols = ['uuid', 'dem_full_time_job', 
                 'High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed', 
                 'totals_per_country']
    
    new_cols = ['quantity', 'percentage']

    # Add first col = quality
    df_job_gender = base_analysis_df.assign(quantity = 1)\
                         .drop(columns=drop_cols[0:2])\
                         .groupby(filtr)\
                         .agg('count')\
                         .reset_index()
    
    # Generate totals_per_country
    df_total_per_country = df_job_gender.groupby(filtr[0])\
                        [filtr[1]]\
                        .nunique()\
                        .to_frame()\
                        .rename(columns={filtr[1]: drop_cols[-1]})
    
    df_job_gender = df_job_gender.merge(df_total_per_country, on=filtr[0])

    # Add second col == percentage and deleting totals_per_country when not need
    df_job_gender[new_cols[1]] = round(df_job_gender[new_cols[0]] / df_job_gender[drop_cols[-1]] * 100, 3)
    df_job_gender.drop(columns=[drop_cols[-1]], inplace=True)
    
    return df_job_gender
    

In [149]:
df_job_gender = get_percentages_gender_by_job(
                get_base_analysis_df(country_argument='SpAiN'))

	 ··· Validating country argparse
		 >> country_argument found in ddbb


In [150]:
df_job_gender

Unnamed: 0,country_names,normalized_job_names,gender,quantity,percentage
0,Spain,Analytical Data Miner,M,1,0.781
1,Spain,Automatic Data Processing Customer Liaison (AD...,F,5,3.906
2,Spain,Automatic Data Processing Customer Liaison (AD...,M,6,4.688
3,Spain,Automatic Data Processing Planner,F,5,3.906
4,Spain,Automatic Data Processing Planner,M,4,3.125
...,...,...,...,...,...
203,Spain,Survey Data Technician,F,1,0.781
204,Spain,Voice and Data Technician,F,1,0.781
205,Spain,Voice and Data Technician,M,1,0.781
206,Spain,Weight in Motion Field Data Collection Technician,F,2,1.562


# TABLE 3 [top 10 skills by education level]

In [170]:
df_analysis = get_base_analysis_df(country_argument='SpAiN')

	 ··· Validating country argparse
		 >> country_argument found in ddbb


In [176]:
df_high_ed = df_analysis[df_analysis['High_Ed']].drop(columns=['High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed'])

df_high_ed.head(3)

Unnamed: 0,uuid,country_names,gender,normalized_job_names,dem_full_time_job
2212,72a48100-da38-0133-487f-0a81e8b09a82,Spain,M,Clinical Data Management Manager (CDM Manager),True
2218,6256a6a0-d961-0133-7e9a-0a81e8b09a82,Spain,M,,False
2222,709bb380-d8e2-0133-b610-0a81e8b09a82,Spain,F,Data Miner,True


In [297]:
def top_skills_by_ed_level(base_analysis_df, number_skills, ed_level):
    
    # Variables
    filtr = 'normalized_job_names'
    drop_cols = ['uuid', 'country_names', 'gender', 'dem_full_time_job']
    
    # Filter by education level
    df_high_ed = base_analysis_df[df_analysis[ed_level]]\
                                .drop(columns=['High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed'])
                                  
    # Counting and sorting
    df_high_ed_skills_gender = df_high_ed.assign(counts = 1)\
                                     .drop(columns=drop_cols)\
                                     .groupby(filtr)\
                                     .agg('count')\
                                     .reset_index()

    df_high_ed_skills_gender.sort_values(by=['counts'], ascending=False, inplace=True)
    # Getting serie 
    pd = df_high_ed_skills_gender[filtr].reset_index()\
                                           .head(number_skills)
                                  
    pd = pd.rename(columns={filtr: ed_level}).drop(columns='index')
    
    return pd

In [315]:
def get_df_top_skills(country_argument, num_top_skills):
    
    # Variables
    ed_levels = ['High_Ed', 'Medium_Ed', 'Low_Ed']
    
    # Single column DF
    serie_top_skills_high_ed = top_skills_by_ed_level(base_analysis_df= get_base_analysis_df(
                                                                    country_argument= country_argument), 
                                                  number_skills= num_top_skills, 
                                                  ed_level= ed_levels[0])
    
    serie_top_skills_medium_ed = top_skills_by_ed_level(base_analysis_df= get_base_analysis_df(
                                                                    country_argument= country_argument), 
                                                  number_skills= num_top_skills, 
                                                  ed_level= ed_levels[1])
    
    serie_top_skills_low_ed = top_skills_by_ed_level(base_analysis_df= get_base_analysis_df(
                                                                    country_argument= country_argument), 
                                                  number_skills= num_top_skills, 
                                                  ed_level= ed_levels[2])
    
    # Construccion of DF
    all_dfs =[serie_top_skills_high_ed, serie_top_skills_medium_ed, serie_top_skills_low_ed]
    cols = dict(zip([n for n in range(5)], ['#'+str(n) for n in range(5)]))

    result = pd.concat(all_dfs, axis=1, sort=False)
    return result.T.rename(columns=cols)

In [316]:
df_top_skills = get_df_top_skills(country_argument = 'Spain', 
                                  num_top_skills = 5)

	 ··· Validating country argparse
		 >> country_argument found in ddbb
	 ··· Validating country argparse
		 >> country_argument found in ddbb
	 ··· Validating country argparse
		 >> country_argument found in ddbb


In [317]:
df_top_skills

Unnamed: 0,#0,#1,#2,#3,#4
High_Ed,Data Communications Software Consultant,Database Engineer,Database Management System Specialist (DBMS Sp...,Data Management Associate,Geographic Information Systems Data Manager (G...
Medium_Ed,Data Security Analyst,Database Administration Manager,Database Coordinator,Automatic Data Processing Customer Liaison (AD...,Database Manager
Low_Ed,Computer or Data Processing Systems Consultant,Automatic Data Processing Planner,Data Warehouse Analyst,SQL Database Administrator,Micro Computer Data Processor
