## Clean data

### 1. Importing all packages

In [1]:
# External
import os
import re
import numpy as np
import pandas as pd
from typing import Callable

### 2. Importing  a CSV file

In [2]:
def get_dfs_from_CSVs_in_folder(directory: str) -> dict[str, pd.DataFrame]:

    dfs = {}

    # https://regex101.com/r/QYuVDf/1
    pattern = r"Data_Engineer_([a-zA-Z_]+)_\d{2}-\d{2}-\d{4}_\d{2}-\d{2}.csv"

    for __, _, files in os.walk(directory):

        for file in files:
            if file.endswith('.csv'):
                match = re.search(pattern, file)
                if match:
                    country = match.group(1)
                    file_path = os.path.join(directory, file)
                    dfs[country] = pd.read_csv(file_path)

    return dfs

In [3]:
CSVs_folder = "data/RAW/Data Engineer"
dfs = get_dfs_from_CSVs_in_folder(CSVs_folder)
dfs

{'Australia':                                  Company_name  Rating         Location  \
 0                                 ONNEC Group     3.2           Sydney   
 1                              Octopus Energy     4.2        Melbourne   
 2                                 TPG Telecom     3.7     North Sydney   
 3                                          EY     3.9           Sydney   
 4                                    7-Eleven     3.3  New South Wales   
 ..                                        ...     ...              ...   
 739                           Publicis Groupe     3.9           Sydney   
 740                                   Equinix     4.2        Unanderra   
 741                                    TikTok     3.6           Sydney   
 742  Australian Government Services Australia     3.6         Canberra   
 743                                 Airwallex     3.4  New South Wales   
 
                                              Job_title  \
 0      Data Centre Cablin

In [4]:
dfs['Austria'].dtypes

Company_name             object
Rating                  float64
Location                 object
Job_title                object
Description              object
Job_age                  object
Easy_apply                 bool
Salary                   object
Employees                object
Type_of_ownership        object
Sector                   object
Founded                 float64
Industry                 object
Revenue_USD              object
Friend_recommend        float64
CEO_approval            float64
Career_opportunities    float64
Comp_&_benefits         float64
Culture_&_values        float64
Senior_management       float64
Work/Life_balance       float64
Pros                     object
Cons                     object
Benefits_rating         float64
Benefits_reviews         object
dtype: object

In [5]:
dfs['Austria'].head()

Unnamed: 0,Company_name,Rating,Location,Job_title,Description,Job_age,Easy_apply,Salary,Employees,Type_of_ownership,...,CEO_approval,Career_opportunities,Comp_&_benefits,Culture_&_values,Senior_management,Work/Life_balance,Pros,Cons,Benefits_rating,Benefits_reviews
0,Infineon Technologies AG,4.2,Villach,Component Verification and Product Characteriz...,You are looking for a new challenge to bring i...,30d+,False,,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and learning"" (in 15...","['""Descent pay, work life balance"" (in 153 rev...",3.9,"['Health Insurance (22 comments)\n""basic medli..."
1,B-612 UK Ltd.,,Vienna,Freelance Hardware/ Data Centre Field Engineer,REQUIREMENTS\r\nWe will consider self-employed...,17d,True,€41.65 - €50.90 Per Hour(Employer est.),,,...,,,,,,,,,,
2,Infineon Technologies AG,4.2,Villach,Senior Staff Engineer Digital Verification (f/...,You are looking for a new challenge to bring i...,30d+,False,,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and learning"" (in 15...","['""Descent pay, work life balance"" (in 153 rev...",3.9,"['Health Insurance (22 comments)\n""basic medli..."
3,Infineon Technologies AG,4.2,Villach,Senior Staff Engineer Product Development for ...,Do you want to get to know the development of ...,27d,False,,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and learning"" (in 15...","['""Descent pay, work life balance"" (in 153 rev...",3.9,"['Health Insurance (22 comments)\n""basic medli..."
4,Infineon Technologies AG,4.2,Villach,Product Application Engineer (f/m/div)*,"You enjoy working in an international team, in...",30d+,False,,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and learning"" (in 15...","['""Descent pay, work life balance"" (in 153 rev...",3.9,"['Health Insurance (22 comments)\n""basic medli..."


In [6]:
del CSVs_folder

### 3. Remove rows only with NaNs

In [7]:

def process_dfs(dfs: dict[str, pd.DataFrame], method: Callable):
    differences = {}

    for country_name, country_df in dfs.items():

        country_df_before_len = country_df.shape[0]
        dfs[country_name]: pd.DataFrame = method(country_df)
        country_df_after_len = dfs[country_name].shape[0]

        is_difference = country_df_before_len != country_df_after_len

        if is_difference:
            differences[country_name] = (
                country_df_before_len, 
                country_df_after_len
                )

    print(differences)

    return dfs

In [8]:
dfs = process_dfs(dfs=dfs, method = lambda x: x.dropna(how='all'))

{}


### 4. Remove duplicates

In [9]:
def remove_duplicates(dfs: dict[str, pd.DataFrame]):
    differences = {}

    for country_name, country_df in dfs.items():

        country_df_before_len = country_df.shape[0]
        dfs[country_name]: pd.DataFrame = country_df.drop_duplicates(subset=country_df.columns.difference(['Job_age']))
        country_df_after_len = dfs[country_name].shape[0]

        is_difference = country_df_before_len != country_df_after_len

        if is_difference:
            differences[country_name] = (
                country_df_before_len, 
                country_df_after_len
                )


    print(differences)

    return dfs

dfs = remove_duplicates(dfs)

{'Australia': (744, 111), 'Austria': (840, 652), 'Belgium': (900, 250), 'Czech_Republic': (900, 202), 'Denmark': (570, 317), 'Finland': (300, 192), 'France': (900, 324), 'Germany': (900, 291), 'Greece': (690, 172), 'Hong_Kong': (870, 564), 'Hungary': (690, 293), 'Ireland': (900, 493), 'Israel': (900, 368), 'Italy': (900, 326), 'Japan': (420, 308), 'Luxembourg': (390, 192), 'Netherlands': (900, 150), 'New_Zealand': (390, 332), 'Norway': (360, 122), 'Poland': (900, 189), 'Portugal': (900, 685), 'Romania': (660, 482), 'Singapore': (900, 184), 'South_Korea': (570, 401), 'Spain': (900, 183), 'Sweden': (900, 257), 'Switzerland': (900, 151), 'Taiwan': (690, 129), 'Turkey': (870, 120), 'United_Kingdom': (900, 120), 'United_States': (900, 244)}


There is huge amount of duplicates.

In [10]:
del remove_duplicates

### 5. Clean jobs to only relevant ones

In [11]:
countries_languages = {
     'Austria': [
          "German"
     ],
     'Belgium': [
          "French", "Dutch", "German"
     ],
     'Canada': [
          "French"
     ],
     # By Glassdoor is also Slovakia
     'Czech_Republic': [
          "Czech", "Slovakian", "Hungarian"
     ],
     'Denmark': [
          "Danish"
     ],
     'Finland': [
          "Finnish", "Swedish"
     ],
     # Occitan and Catalan are pretty similar
     'France': [
          "French", "Catalan", "Italian", "Basque"
     ],
     'Germany': [
          "German"
     ],
     'Greece': [
          "Greek"
     ],
     'Hungary': [
          "Hungarian", "Romanian"
     ],
     # Irish is almost not spoken
     'Ireland': [],
     'Israel': [
          "Hebrew", "Arabic"
     ],
     'Italy': [
          "Italian",
          "German",
          "French",
          "Catalan",
          "Greek",
          "Slovenian"
     ],
     'Luxembourg': [
          "German", "French"
     ],
     'Netherlands': [
          "Dutch", "Frisian"
     ],
     'Norway': [
          "Norwegian"
     ],
     'Poland': [
          "Polish"
     ],
     'Portugal': [
          "Portuguese"
     ],
     'Romania': [
          "Romanian"
     ],
     'Spain': [
          "Spanish", "Basque", "Catalan", "Galician"
     ],
     'Sweden': [
          "Swedish", "Finnish"
     ],
     'Switzerland': [
          "German", "French", "Italian"
     ],
     'Turkey': [
          "Turkish", "Kurdish"
     ],
     'United_States': [
          "Spanish"
     ],
     # We can leave Scottish and Gaelic
     'United_Kingdom': [],
     # Yeah there are Ryukyuan languages, but because of the size population and 
     # implementation problems we can skip it, because the gain would not be that much.
     'Japan': [
          "Japanese"
    ],
     'South_Korea': [
          "Korean"
    ],
     'Taiwan': [
          "Chinese_TR"
    ],
     'Singapore': [
          "Chinese_SP"
    ],
     "New_Zealand": [],
     "Australia": [],
     "Hong_Kong": [
          "Chinese_TR"
    ],
}

In [12]:
invalid_non_eng = {
    'Arabic': ["مهندس الجوالات", "مطور الجوالات", "مطور أندرويد", "مهندس أندرويد", "مهندس طبي", "جيولوجي هندسي", "مهندس جيوتقني", "مهندس كهربائي", "مدير مشروع", "مهندس جودة", "مهندس ميكانيكي", "مهندس تصميم ميكانيكي", "مطور تطبيقات جوال"],
	'Basque': ["Mugikor Ingeniaria", "Mugikor Garatzailea", "Android Garatzailea", "Android Ingeniaria", "Biomedikuntza Ingeniaria", "Geologi Ingeniaria", "Geotekniko Ingeniaria", "Elektrizitate Ingeniaria", "Proiektu Kudeatzailea", "Kalitate Ingeniaria", "Mekanika Ingeniaria", "Mekanika Diseinu Ingeniaria", "Mugikor Aplikazio Garatzailea"],
	'Catalan': ["Enginyer Mòbil", "Desenvolupador Mòbil", "Desenvolupador Android", "Enginyer Android", "Enginyer Biomèdic", "Enginyer Geològic", "Enginyer Geotècnic", "Enginyer Elèctric", "Cap de Projecte", "Enginyer de Qualitat", "Enginyer Mecànic", "Enginyer de Disseny Mecànic", "Desenvolupador d'Apps Mòbils"],
	'Czech': ["Mobilní inženýr", "Mobilní vývojář", "Vývojář Androidu", "Android inženýr", "Biomedicínský inženýr", "Inženýrský geolog", "Geotechnický inženýr", "Elektroinženýr", "Manažer projektu", "Kvalitní inženýr", "Mechanický inženýr", "Inženýr návrhu strojů", "Vývojář mobilních aplikací"],
	'German': ["Mobile Ingenieur", "Mobile Entwickler", "Android Entwickler", "Android Ingenieur", "Biomedizinischer Ingenieur", "Ingenieurgeologe", "Geotechnischer Ingenieur", "Elektroingenieur", "Projektmanager", "Qualitätsingenieur", "Maschinenbauingenieur", "Maschinenbau-Konstrukteur", "Mobile App Entwickler"],
	'Danish': ["Mobil ingeniør", "Mobiludvikler", "Android-udvikler", "Android-ingeniør", "Biomedicinsk ingeniør", "Ingeniørgeolog", "Geoteknisk ingeniør", "Elektrisk ingeniør", "Projektleder", "Kvalitetsingeniør", "Mekanisk ingeniør", "Mekanisk-design ingeniør", "Mobil app-udvikler"],
	'Spanish': ["Ingeniero móvil", "Desarrollador móvil", "Desarrollador de Android", "Ingeniero de Android", "Ingeniero biomédico", "Geólogo de ingeniería", "Ingeniero geotécnico", "Ingeniero eléctrico", "Gerente de proyectos", "Ingeniero de calidad", "Ingeniero mecánico", "Ingeniero de diseño mecánico", "Desarrollador de aplicaciones móviles"],
	'Finnish': ["Mobiili-insinööri", "Mobiilikehittäjä", "Android-kehittäjä", "Android-insinööri", "Biomediainsinööri", "Geologian insinööri", "Geotekninen insinööri", "Sähköinsinööri", "Projektipäällikkö", "Laatuinsinööri", "Mekaaninen insinööri", "Mekaanisen suunnittelun insinööri", "Mobiilisovelluskehittäjä"],
	'French': ["Ingénieur mobile", "Développeur mobile", "Développeur Android", "Ingénieur Android", "Ingénieur biomédical", "Géologue d'ingénierie", "Ingénieur géotechnique", "Ingénieur électrique", "Chef de projet", "Ingénieur qualité", "Ingénieur mécanique", "Ingénieur en conception mécanique", "Développeur d'applications mobiles"],
	'Frisian': ["Mobile yngenieur", "Mobile ûntwikkele", "Android-ûntwikkele", "Android yngenieur", "Biomedysk yngenieur", "Engineering geolooch", "Geotechnysk yngenieur", "Elektaryske yngenieur", "Projektmanager", "Kwaliteit yngenieur", "Mekanysk yngenieur", "Mekanyske-ûntwerp yngenieur", "Mobile app-ûntwikkele"],
	'Galician': ["Enxeñeiro móbil", "Desenvolvedor móbil", "Desenvolvedor de Android", "Enxeñeiro de Android", "Enxeñeiro biomédico", "Xeólogo de enxeñería", "Enxeñeiro xeotécnico", "Enxeñeiro eléctrico", "Xestor de proxectos", "Enxeñeiro de calidade", "Enxeñeiro mecánico", "Enxeñeiro de deseño mecánico", "Desenvolvedor de aplicacións móbeis"],
	'Greek': ["Μηχανικός κινητής τηλεφωνίας", "Προγραμματιστής κινητής τηλεφωνίας", "Προγραμματιστής Android", "Μηχανικός Βιοϊατρικής Τεχνολογίας", "Γεωλόγος Μηχανικός", "Μηχανικός Γεωτεχνικών", "Ηλεκτρολόγος Μηχανικός", "Διευθυντής έργου", "Μηχανικός Ποιότητας", "Μηχανικός Μηχανολογίας", "Μηχανικός Μηχανολογίας-Σχεδιασμού", "Προγραμματιστής κινητών εφαρμογών"],
	'Hebrew': ["מהנדס מובייל", "מפתח יישומים מוביילים", "מפתח אנדרואיד", "מהנדס ביו-רפואי", "גיאולוג מהנדס", "מהנדס גיאו-טכני", "מהנדס חשמל", "מנהל פרויקט", "מהנדס איכות", "מהנדס מכונות", "מהנדס מכונות-תכנון", "מפתח יישומי ניידים"],
	'Hungarian': ["Mobil mérnök", "Mobil fejlesztő", "Android fejlesztő", "Android mérnök", "Biomedikus mérnök", "Mérnöki geológus", "Geotechnikai mérnök", "Elektromos mérnök", "Projektmenedzser", "Minőségi mérnök", "Gépészmérnök", "Gépészmérnök-tervező", "Mobil alkalmazás fejlesztő"],
	'Italian': ["Ingegnere mobile", "Sviluppatore mobile", "Sviluppatore Android", "Ingegnere Android", "Ingegnere biomedico", "Geologo ingegnere", "Ingegnere geotecnico", "Ingegnere elettrico", "Project Manager", "Ingegnere della qualità", "Ingegnere meccanico", "Ingegnere meccanico-design", "Sviluppatore di app mobili"],
	'Kurdish': ["Mühendis-ı mobîl", "Pêşkêşvan-ı mobîl", "Pêşkêşvan-ı Android", "Mühendis-ı Android", "Mühendis-ı bîomedîkal", "Cîhêk-î mühendîsî", "Mühendîs-î geoteknik", "Mühendîs-î elektrîkî", "Pêwendîdar-î projeyan", "Mühendîs-î quality", "Mühendîs-î mekanîkî", "Mühendîs-î mekanîkî-têkildarî dizaynê", "Pêşkêşvan-î app-ê mobîl"],
	'Dutch': ["Mobiele ingenieur", "Mobiele ontwikkelaar", "Android ontwikkelaar", "Android ingenieur", "Biomedisch ingenieur", "Ingenieur geologie", "Geotechnisch ingenieur", "Elektrotechnisch ingenieur", "Projectmanager", "Kwaliteitsingenieur", "Werktuigbouwkundig ingenieur", "Werktuigbouwkundig-ontwerp ingenieur", "Mobiele app-ontwikkelaar"],
	'Norwegian': ["Mobil ingeniør", "Mobilutvikler", "Android-utvikler", "Android-ingeniør", "Biomedisinsk ingeniør", "Geologiingeniør", "Geoteknisk ingeniør", "Elektroingeniør", "Prosjektleder", "Kvalitetsingeniør", "Maskiningeniør", "Mekanisk designingeniør", "Utvikler av mobilapper"],
	'Polish': ["Inżynier mobilny", "Deweloper mobilny", "Deweloper Androida", "Inżynier Androida", "Inżynier biomedyczny", "Inżynier geologii", "Inżynier geotechniki", "Inżynier elektryk", "Kierownik projektu", "Inżynier jakości", "Inżynier mechanik", "Inżynier mechaniki i projektowania", "Twórca aplikacji mobilnych"],
	'Portuguese': ["Engenheiro Móvel", "Desenvolvedor Móvel", "Desenvolvedor Android", "Engenheiro Android", "Engenheiro Biomédico", "Geólogo de Engenharia", "Engenheiro Geotécnico", "Engenheiro Elétrico", "Gerente de Projeto", "Engenheiro de Qualidade", "Engenheiro Mecânico", "Engenheiro de Design Mecânico", "Desenvolvedor de Aplicativos Móveis"],
	'Romanian': ["Inginer Mobil", "Dezvoltator mobil", "Dezvoltator Android", "Inginer Android", "Inginer Biomedical", "Geolog Inginer", "Inginer Geotehnic", "Inginer Electric", "Manager de proiect", "Inginer de calitate", "Inginer mecanic", "Inginer de design mecanic", "Dezvoltator de aplicații mobile"],
	'Slovakian': ["Mobilný inžinier", "Mobilný vývojár", "Vývojár Androidu", "Android inžinier", "Biomedicínsky inžinier", "Inžinier geológie", "Geotechnický inžinier", "Elektrický inžinier", "Manažér projektu", "Kvalitný inžinier", "Mechanický inžinier", "Inžinier návrhu mechaniky", "Vývojár mobilných aplikácií"],
	'Slovenian': ["Mobilni inženir", "Mobilni razvijalec", "Razvijalec Androida", "Android inženir", "Biomedicinski inženir", "Inženir geologije", "Geotehnični inženir", "Elektroinženir", "Vodja projekta", "Inženir kakovosti", "Mehanski inženir", "Inženir oblikovanja mehanike", "Razvijalec mobilnih aplikacij"],
	'Swedish': ["Mobilingenjör", "Mobilutvecklare", "Androidutvecklare", "Androidingenjör", "Biomedicinsk ingenjör", "Ingenjör i geologi", "Geoteknisk ingenjör", "Elektroingenjör", "Projektledare", "Kvalitetsingenjör", "Mekanisk ingenjör", "Ingenjör för mekanisk design", "Mobilapputvecklare"],
	'Turkish': ["Mobil mühendisi", "Mobil Geliştirici", "Android Geliştirici", "Android Mühendisi", "Biyomedikal Mühendisi", "Jeoloji Mühendisi", "Zemin Mekaniği Mühendisi", "Elektrik Mühendisi", "Proje Yöneticisi", "Kalite Mühendisi", "Mekanik Mühendisi", "Mekanik-Tasarım Mühendisi", "Mobil Uygulama Geliştiricisi"],
    'Japanese': ["モバイルエンジニア", "モバイル開発者", "Android開発者", "Androidエンジニア", "バイオメディカルエンジニア", "エンジニアリングジオロジスト", "地質工学技術者", "電気技師", "プロジェクトマネージャー", "品質エンジニア", "機械エンジニア", "機械設計エンジニア", "モバイルアプリ開発者"],
	'Korean': ["모바일 엔지니어", "모바일 개발자", "안드로이드 개발자", "안드로이드 엔지니어", "바이오의공학 엔지니어", "공학 지질학자", "지질기술 엔지니어", "전기 기술자", "프로젝트 매니저", "품질 엔지니어", "기계 엔지니어", "기계설계 엔지니어", "모바일 앱 개발자"],
	'Chinese_TR': ["移动工程师", "移动开发人员", "安卓开发人员", "安卓工程师", "生物医学工程师", "工程地质学家", "岩土工程师", "电气工程师", "项目经理", "质量工程师", "机械工程师", "机械设计工程师", "移动应用程序开发人员"],
	'Chinese_SP': ["移动工程师", "移动开发人员", "安卓开发人员", "安卓工程师", "生物医学工程师", "工程地质学家", "岩土工程师", "电气工程师", "项目经理", "质量工程师", "机械工程师", "机械设计工程师", "移动应用程序开发人员"],
}

In [13]:
specializations_non_english = {
	'Arabic': ["مهندس", "هندسة", "مستشار", "معماري", "متخصص", "مدير", "مطور", "عمارة", "مسؤول", "رئيس", "قائد", "مشرف", "منسق", "تنفيذي"],
	'Basque': ["Ingeniaria", "Ingeniaritza", "Konsultore", "arkitektoa", "ESPECIALISTA", "Kudeatzailea", "Garatzailea", "Arkitektura", "Administratzailea", "Koordinatzailea", "Koordinatzaile", "Gobernuko"],
	'Catalan': ["Enginyer", "Enginyeria", "Consultor", "arquitecte", "ESPECIALISTA", "Gerent", "Desenvolupador", "Arquitectura", "Administrador", "Executiu"],
	'Czech': ["Inženýr", "Konstruktér", "Architekt", "SPECIALISTA", "Manažer", "Vývojář", "Architektura", "Správce","Dozorce", "Koordinátor", "Výkonný"],
	'German': ["Ingenieur", "Berater", "Architekt", "SPEZIALIST", "Manager", "Entwickler", "Architektur", "Architecture", "Administrator", "Vorgesetzter", "Koordinator", "Führungskraft"],
	'Danish': ["Ingeniør", "Konsulent", "Arkitekt", "SPECIALIST", "Manager", "Udvikler", "Arkitektur", "Administrator", "Koordinator"],
	'Dutch': ["SPECIALIST", "Engineering", "Manager", "architect", "Beheerder", "Ingenieur", "Adviseur", "Ontwikkelaar", "Architect", "Techniek", "Architectuur", "Toezichthouder", "Coördinator", "Uitvoerend"],
	'Spanish': ["Ingeniero", "Consultor", "Arquitecto", "Especialista", "Gerente", "Desarrollador", "Arquitectura", "Administrador", "Ejecutivo"],
	'Finnish': ["Insinööri", "Konsultti", "Arkkitehti", "ERITYISOSAAMINEN", "Johtaja", "Kehittäjä", "Arkkitehtuuri", "Ylläpitäjä", "Esimies", "Koordinaattori", "Johtaja"],
	'French': ["Architecte", "Développeur", "architecte", "SPECIALISTE", "SPÉCIALISTE", "Consultant", "Conseiller", "Ingénieur", "Administrateur", "Architecture", "Manager", "Ingénierie", "Superviseur", "Coordinateur", "Cadre"],
	'Frisian': ["Ynżenier", "Ynženiering", "Konsultant", "arkitekt", "SPESJALIST", "Manager", "Ûntwikkelers", "Arktitektuer", "Administrator", "Koördinator", "Uitvoerend"],
	'Galician': ["Inxeniero", "Enxeñería", "Consultor", "arquitecto", "ESPECIALISTA", "Xestor", "Desenvolvedor", "Arquitectura", "Administrador", "Coordinador", "Executivo"],
	'Greek': ["Μηχανικός", "Σύμβουλος", "Αρχιτέκτονας", "ΕΙΔΙΚΟΣ", "Διευθυντής", "Προγραμματιστής", "Αρχιτεκτονική", "Διαχειριστής", "Επιβλέπων", "Συντονιστής", "Διευθυντής"],
	'Hebrew': ["מהנדס", "הנדסה", "יועץ", "אדריכל", "מומחה", "מנהל", "מפתח", "ארכיטקטורה", "מנהל מערכות", "מנהל", "רכז", "מבצע"],
	'Hungarian': ["Mérnök", "Mérnöki", "Tanácsadó", "építész", "SZAKÉRTŐ", "Menedzser", "Fejlesztő", "Architektúra", "Rendszergazda", "Felügyelő", "Koordinátor", "Vezető"],
	'Italian': ["Ingegnere", "Consulente", "architetto", "SPECIALISTA", "Manager", "Sviluppatore", "Architettura", "Amministratore", "Supervisore", "Coordinatore", "Esecutivo"],
	'Kurdish': ["Mûhandis", "Mûhendisî", "Pêşkêşker", "pargîdaniyar", "XWESER", "Manajer", "Pêşgir", "Arkîtektur", "Peywendkar", "Koordinator", "Xwedî"],
	'Norwegian': ["Ingeniør", "Konsulent", "arkitekt", "SPECIALIST", "Manager", "Utvikler", "Arkitektur", "Administrator", "Veileder", "Koordinator", "Leder"],
	'Polish': ["Inżynier", "Konsultant", "Architekt", "Specjalista", "Manager", "Programista", "Administrator", "Przełożony", "Koordynator", "Wykonawczy"],
	'Portuguese': ["Engenheiro", "Consultor", "Arquiteto", "Especialista", "Gerente", "Desenvolvedor", "Arquitetura", "Administrador", "Coordenador", "Executivo"],
	'Romanian': ["Inginer", "Consultant", "Arhitect", "Specialist", "Manager", "Dezvoltator", "Arhitectura", "Administrator", "Supraveghetor", "Coordonator", "Executiv"],
	'Slovakian': ["Inžinier", "Konzultant", "architekt", "SPECIALISTA", "Manažér", "Vývojár", "Architektúra", "Správca", "Dozorca", "Koordinátor", "Výkonný"],
	'Slovenian': ["Inženir", "Inženiring", "Svetovalec", "arhitekt", "SPECIALIST", "Vodja", "Razvijalec", "Arhitektura", "Administrator", "Nadzornik", "Koordinator", "Izvršni"],
	'Swedish': ["Ingenjör", "Konsult", "Arkitekt", "Specialist", "Chef", "Utvecklare", "Arkitektur", "Administratör", "Handledare", "Koordinator", "Verkställande"],
	'Turkish': ["Mühendis", "Danışman", "Mimar", "Uzman", "Yönetici", "Geliştirici", "Mimarlık", "Yönetici", "Koordinatör", "Yönetici"],
	'Japanese': ["エンジニア", "エンジニアリング", "コンサルタント", "アーキテクト", "スペシャリスト", "マネージャー", "開発者", "アーキテクチャー", "管理者", "責任者", "リーダー"],
    'Korean': ["엔지니어", "엔지니어링", "컨설턴트", "건축가", "전문가", "매니저", "개발자", "아키텍처", "관리자", "책임자", "리더", "감독자", "코디네이터", "임원"],
    'Chinese_TR': ["工程師", "工程", "顧問", "建築師", "專家", "經理", "開發者", "架構", "管理員", "負責人", "領導", "監督者", "協調員", "行政人員"],
    'Chinese_SP': ["工程师", "工程", "顾问", "建筑师", "专家", "经理", "开发者", "架构", "管理员", "负责人", "领导", "监督者", "协调员", "行政人员"],
}

In [14]:
data_terms_non_english = {
	'Arabic': ["بيانات", "ETL", "سحابة", "تحليلي", "تحليلات", "ذكاء الأعمال", "تحليلات الأعمال", "قاعدة بيانات", "خط أنابيب", "بيانات وصفية", "رصد", "مركز بيانات"],
	'Basque': ["Datuak", "ETL", "Cloud", "Analitikoa", "Analitika", "BI", "Negozioaren Inteligentzia", "Negozioaren Analitika", "Datubasea", "Pipeline-a", "Metadatuak", "Monitoreo", "Datuen zentroa"],
	'Catalan': ["Dades", "ETL", "Núvol", "Analític", "Anàlisi de dades", "BI", "Intel·ligència de negocis", "Anàlisi de negocis", "Base de dades", "Pipeline", "Metadades", "Monitorització", "Centre de dades"],
	'Czech': ["Data", "ETL", "Cloud", "Analytický", "Analytika", "BI", "Business Intelligence", "Business Analytics", "Databáze", "Pipeline", "Metadata", "Monitorování", "Datacentrum"],
	'German': ["Daten", "ETL", "Cloud", "Analytisch", "Analytics", "BI", "Business Intelligence", "Business Analytics", "Datenbank", "Pipeline", "Metadaten", "Überwachung", "Rechenzentrum", "Datenzentrum"],
	'Danish': ["Data", "ETL", "Cloud", "Analytisk", "Analyse", "BI", "Forretningsanalyse", "Database", "Pipeline", "Metadata", "Overvågning", "Datacenter"],
	'Dutch': ["Business Analytics", "Monitoring", "Leiding", "Metadata", "Cloud", "Business Intelligence", "Data", "Analytisch", "BI", "Database", "Datacenter", "Pipeline", "Analytics", "Bedrijfsanalyse", "Bedrijfsinformatie", "ETL"],
	'Spanish': ["Datos", "ETL", "Nube", "Analítico", "Análisis", "BI", "Inteligencia de Negocios", "Análisis de Negocios", "Base de datos", "Pipeline", "Metadatos", "Monitoreo", "Centro de datos"],
	'Finnish': ["Data", "ETL", "Pilvi", "Analytiikka", "BI", "Liiketoiminta-analytiikka", "Tietokanta", "Putkisto", "Metatiedot", "Seuranta", "Tietokeskus"],
	'French': ["Architecte", "Développeur", "architecte", "SPECIALISTE", "SPÉCIALISTE", "Consultant", "Conseiller", "Ingénieur", "Administrateur", "Architecture", "Manager", "Ingénierie"],
	'Frisian': ["Data", "ETL", "Cloud", "Analytysk", "Analitika", "BI", "Bisykens Intelligence", "Bisykens Analytics", "Database", "Pipeline", "Metadata", "Monitoring", "Datacenter"],
	'Galician':  ["Datos", "ETL", "Nube", "Analítica", "Analítica de datos", "BI", "Intelixencia de negocios", "Analítica de negocios", "Base de datos", "Pipeline", "Metadatos", "Monitorización", "Centro de datos"],
	'Greek': ["Δεδομένα", "ETL", "Νέφος", "Αναλυτική", "Ανάλυση", "BI", "Επιχειρηματική Νοημοσύνη", "Επιχειρηματική Αναλυτική", "Βάση δεδομένων", "Αγωγός", "Μεταδεδομένα", "Παρακολούθηση", "Κέντρο δεδομένων"],
	'Hebrew': ["נתונים", "ETL", "ענן", "ניתוח", "ניתוח נתונים", "BI", "בינה מעסיקתית", "אנליטיקה עסקית", "מסד נתונים", "צינורות נתונים", "מטא נתונים", "מעקב", "מרכז נתונים"],
	'Hungarian': ["Mérnök", "Mérnöki", "Tanácsadó", "építész", "SZAKÉRTŐ", "Menedzser", "Fejlesztő", "Architektúra", "Rendszergazda"],
	'Italian': ["Dati", "ETL", "Cloud", "Analitico", "Analytics", "BI", "Business Intelligence", "Business Analytics", "Database", "Pipeline", "Metadati", "Monitoraggio", "Centro dati"],
	'Kurdish': ["Zanist", "ETL", "Pirsgirêk", "Analytîk", "Analîz", "BI", "Zanistên Kar", "Analîzên Kar", "Bingehbazî", "Pîpelya", "Meta-Data", "Pêşwazî", "Navenda Zanistê"],
	'Norwegian': ["Data", "ETL", "Sky", "Analytisk", "Analytics", "BI", "Forretningsinnsikt", "Forretningsanalyse", "Database", "Pipeline", "Metadata", "Overvåking", "Databehandlingssenter"],
	'Polish': ["Dane", "ETL", "Chmura", "Analityczny", "Analityka", "BI", "Business Intelligence", "Analityka Biznesowa", "Bazy Danych", "Pipeline", "Metadane", "Monitorowanie", "Centrum Danych"],
	'Portuguese': ["Dados", "ETL", "Nuvem", "Analítico", "Análise", "BI", "Inteligência de Negócios", "Análise de Negócios", "Banco de Dados", "Pipeline", "Metadados", "Monitoramento", "Centro de Dados"],
	'Romanian': ["Date", "ETL", "Noroi", "Analitic", "Analize", "BI", "Business Intelligence", "Analiză de afaceri", "Bază de date", "Conductă", "Metadate", "Monitorizare", "Centru de date"],
	'Slovakian': ["Data", "ETL", "Cloud", "Analytický", "Analytika", "BI", "Business Intelligence", "Business Analytics", "Databáza", "Pipeline", "Metadata", "Monitorovanie", "Datacentrum"],
	'Slovenian': ["Podatki", "ETL", "Oblak", "Analitični", "Analitika", "BI", "Poslovna Inteligenca", "Poslovna Analitika", "Podatkovna Baza", "Cevovod", "Metapodatki", "Spremljanje", "Podatkovni Center"],
	'Swedish': ["Data", "ETL", "Moln", "Analys", "Analytik", "BI", "Affärsinriktad Analys", "Business Intelligence", "Databas", "Pipeline", "Metadata", "Övervakning", "Datacenter"],
	'Turkish': ["Veri", "ETL", "Bulut", "Analitik", "Analiz", "BI", "İş Zekası", "İş Analizi", "Veritabanı", "Boru Hattı", "Meta Veri", "İzleme", "Veri Merkezi"],
	'Japanese': ["データ", "ETL", "クラウド", "分析的な", "アナリティクス", "BI", "ビジネスインテリジェンス", "ビジネスアナリティクス", "データベース", "パイプライン", "メタデータ", "モニタリング", "データセンター"],
    'Korean': ["데이터", "ETL", "클라우드", "분석적인", "애널리틱스", "BI", "비즈니스 인텔리전스", "비즈니스 애널리틱스", "데이터베이스", "파이프라인", "메타데이터", "모니터링", "데이터 센터"],
    'Chinese_TR': ["數據", "ETL", "雲端", "分析", "分析學", "商業智慧", "商業分析", "數據庫", "管道", "元數據", "監控", "數據中心"],
    'Chinese_SP': ["数据", "ETL", "云端", "分析", "分析学", "商业智慧", "商业分析", "数据库", "管道", "元数据", "监控", "数据中心"],
}

In [15]:
def is_data_engineering_job(job_title: str, country = None):

    specializations = ["Engineer", "Engineering", "Consultant", "Architect", "Specialist", "Manager", "Developer", "Architecture", "Administrator", "Head of", "Lead", "Director", "Supervisor", "Coordinator", "Executive"]

    # Back-end, fullstack, data scientists are somehow different domains, but sometimes in some companies they are just the same roles as data engineers
    data_terms = ["Data", "ETL", "Cloud", "Analytical", "Analytics", "BI", "Buisness Intelligence", "Buisness Analytics", "Database", "Pipeline", "Metadata", "Monitoring", "Datacenter"]

    any_in_specs = isinstance(job_title, str) and any(isinstance(spec, str) and spec.lower() in job_title.lower() for spec in specializations)

    any_in_terms = isinstance(job_title, str) and any(isinstance(term, str) and term.lower() in job_title.lower() for term in data_terms)

    invalid = [
        "Mobile engineer", "Mobile Developer", "Android Developer", 
        "Android Engineer", "Biomedical Engineer", "Engineering Geologist", 
        "Geotechnical Engineer", "Electrical Engineer", "Project Manager", 
        "Quality Engineer", "QA Engineer", "Mechanical Engineer", 
        "Mechanical-Design Engineer", "Mobile App Developer", "Full-Stack", 
        "Fullstack", "Full Stack", "Machine Learning Engineer", "Front", "FrontEnd",
        "Client-Side", "Support Engineer", "Data Scientist", "Computer Vision Engineer",
        "C# Software Engineer", "Verification engineer", "Networking Software Engineer",
        "Machine Learning Engineer", "Manual", "Deep Learning Engineer", "Reliability Engineer",
        "Field", "Account Manager", "Solutions Engineer"
               ]
    
    if country:

        country_languages = countries_languages[country]

        for language in country_languages:

            _invalid_not_eng = invalid_non_eng[language]
            _specializations_not_eng = specializations_non_english[language]
            _data_terms_not_eng = data_terms_non_english[language]

            invalid.extend(_invalid_not_eng)
            specializations.extend(_specializations_not_eng)
            data_terms.extend(_data_terms_not_eng)
        
    is_valid = isinstance(job_title, str) and not any(isinstance(term, str) and term.lower() in job_title.lower() for term in invalid)

    return any_in_specs and any_in_terms and is_valid

In [16]:
def show_unique_and_its_len(df: pd.Series):
    print(f"{len(df.unique())} :\n{df.unique()}")


##### 6.1 Austria

In [17]:
salary_type = 'Austria'

In [18]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

622 :
['Component Verification and Product Characterization Engineer (f/m/div)*'
 'Freelance Hardware/ Data Centre Field Engineer'
 'Senior Staff Engineer Digital Verification (f/m/div)*'
 'Senior Staff Engineer Product Development for GaN-based ICs (f/m/div)*'
 'Product Application Engineer (f/m/div)*'
 'Senior Firewall Engineer (all genders)'
 'Security Engineer (all genders)'
 'Sr. Software Development Engineer – C++' 'Android Mobile Developer'
 'Junior Data Warehouse und BI Engineer'
 'Oracle APEX Entwickler (m/w/d) - PL/SQL Data Base Engineer'
 'Data Engineer, CRM (m/f/x)'
 'Solution Manager (all genders) #databricks #cloud #python'
 'Senior Java Software Engineer for George (all genders)'
 'Process Manager (m/f/d) 80% Homeoffice'
 'Internship - Data Science or Software Engineering'
 'Junior PHP and Machine Learning Engineer'
 'Data Engineer im Bereich Data Integration (m/w/d)'
 'Data Engineer (m/w/d)' 'Data Engineer (f/m/d)'
 'CBS Data Engineer (f/m/x)' 'Data Engineer' 'System En

In [19]:
df = dfs[salary_type]
filtered_df = df[df.apply(
        lambda row: is_data_engineering_job(row['Job_title'], 
        salary_type
    ), 
    axis=1)]


In [20]:
show_unique_and_its_len(filtered_df['Job_title'])

128 :
['Junior Data Warehouse und BI Engineer'
 'Oracle APEX Entwickler (m/w/d) - PL/SQL Data Base Engineer'
 'Data Engineer, CRM (m/f/x)'
 'Solution Manager (all genders) #databricks #cloud #python'
 'Internship - Data Science or Software Engineering'
 'Data Engineer im Bereich Data Integration (m/w/d)'
 'Data Engineer (m/w/d)' 'Data Engineer (f/m/d)'
 'CBS Data Engineer (f/m/x)' 'Data Engineer'
 'Senior Data Engineer | Europe | Remote' 'Data Engineer*' 'Data engineer'
 'Data Engineer temp. 24 months (w/m/d)' 'Senior Data Engineer (m/f/x)'
 'Data Analyst & -Engineer (m/w/d)'
 'Data Warehouse DevOps Engineer (f/m/x)'
 'Data Engineer (m/f/d) - maternity cover 1yr.'
 'Data Virtualization (Denodo) Engineer (f/m/x)'
 'Junior - Data Engineer (m/f/x)'
 '(Senior) Data Engineer (m/w/d) (Remote innerhalb von...'
 'Data Engineer (f/m/x)' 'Data Engineer (m/w/div.)'
 'Software-Engineer für Data Analytics und Chatbot Design (all genders)'
 'Data Engineer (m/f/d)'
 'Data Center Engineer - Austria - 

In [21]:
dfs[salary_type] = filtered_df

##### 6.2 Belgium

In [22]:
salary_type = 'Belgium'

In [23]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

228 :
['Data Engineer (met affiniteit voor Data Science)' 'data manager'
 'Python Engineer on FAIR Data' 'Electromagnetic Simulations Engineer'
 'Senior Big Data Engineer' 'System Engineer'
 'Mechanical Design Engineer (Hudson Sharp)'
 'ICT Traineeship - Data & Data Analytics' 'Field Engineer'
 'Senior R&D Test Engineer (Fuel cells)' 'Azure Data Platform Engineer'
 'Full-stack Developer | Digital Manufacturing Platform' 'DATA ENGINEER'
 'IT System Engineer' 'Process Engineer Automotive Industry'
 'Photonics Design Engineer' 'Medior Data Protection Engineer'
 'DATA ENGINEER & MODELER' 'Data Engineer Traineeship'
 'Junior Integration Engineer' 'Field and solutions engineer IOT'
 'Supply Chain Engineer' 'R&D Electronics Engineer'
 'ICT Traineeship - Development & Packages' 'Validation Engineer - CSV'
 'R&D Professional – Power Systems / Data Engineer'
 'Manufacturing Test Engineer' 'Junior Data Warehouse Engineer'
 'ICT Traineeship - Networks, Telecom & Cybersecurity' 'Developer'
 'Junior

In [24]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'], 
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

54 :
['Data Engineer (met affiniteit voor Data Science)' 'data manager'
 'Python Engineer on FAIR Data' 'Senior Big Data Engineer'
 'Azure Data Platform Engineer' 'DATA ENGINEER'
 'Medior Data Protection Engineer' 'DATA ENGINEER & MODELER'
 'Data Engineer Traineeship'
 'R&D Professional – Power Systems / Data Engineer'
 'Junior Data Warehouse Engineer' 'Azure Data Engineer'
 'AWS Data Engineer' 'Data Engineer' 'DevOps Engineer (Data)'
 'Data Engineer / Architect' 'Project Engineer Sustainability'
 'Data engineer / ETL consultant' 'Cloud Data Engineer'
 'Datacenter Engineer' 'Enterprise Data Information Architect'
 'PRODUCTION DATA ENGINEER' 'Data Migration Consultant'
 'Cloud Engineer / Cloud Architect' 'Industrial Data Engineer'
 'Product Manager Entertainment - Data Collaborations'
 'Cloud system engineer' 'Cloud solution architect'
 'DATA & APPLICATION SPECIALIST' 'Data Engineer - First IT'
 'Master Data Process Engineer' 'IT DATA ENGINEER'
 'Data engineer | rechterhand IT Manager' 

In [25]:
dfs[salary_type] = filtered_df

##### 6.3 Canada

In [26]:
salary_type = 'Canada'

In [27]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

10 :
['Analytics Engineer' 'BI Engineer' 'Analytics Implementation Engineer'
 'Process Analyst Engineer - Steel Plant'
 'Financial Engineer/ACM Business Analyst'
 'Lead Analytics Solutions Engineer'
 'Analyst/Senior Associate, AML - Financial Engineering & Modeling (FEM)'
 'Business Analyst & Process Engineer'
 'Business Analyst Engineering Budget Planning'
 'Senior Manager, Project Delivery Business Analyst, Retail & Small Business (Contract)']


In [28]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

3 :
['Analytics Engineer' 'BI Engineer' 'Analytics Implementation Engineer']


In [29]:
dfs[salary_type] = filtered_df

##### 6.4 Czech_Republic

But also Slovak Republic

In [30]:
salary_type = 'Czech_Republic'

In [31]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

165 :
['Cloud Services Engineer (Based in Germany)' 'Technical Systems Manager'
 'PLC Engineer ( F/M/X )' 'Application Engineer - suitable for graduates'
 'Data Engineer' 'Data Engineer - Part time for a student'
 'Junior ESG Data Analyst / Tester'
 'Global Support Engineer – Mobile Data Network Czechia'
 'Data Processing and Automation-Development Engineer (f/m/d)'
 'HUS Data Engineer' 'Data QA Engineer' 'Presales Engineer EMEA'
 'Cloud Engineer (f/m/d)' 'DATA ENGINEER (BIOTECH)' 'Data Engineer (AI)'
 'Part-time - Junior Data Analytics Engineer' 'Industrial Engineer'
 'Quality Engineer' 'Erlang Engineer (m/f)' 'Lead Data Engineer'
 'Collaboration Solution Engineer - Part time for a student'
 'Manufacturing Engineer, Mikulov' 'DATA ENGINEER' 'Sr Data Engineer'
 'Data Analytics Engineer'
 'Middle/Senior Python developer for the American company'
 'Catia V5 Design Engineer (m/ž) - projekt. řízení automotive'
 'Strojní Engineer (procesy a technologie)' 'Senior Quality Engineer'
 'Technolo

In [32]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

47 :
['Cloud Services Engineer (Based in Germany)' 'Data Engineer'
 'Data Engineer - Part time for a student'
 'Data Processing and Automation-Development Engineer (f/m/d)'
 'HUS Data Engineer' 'Cloud Engineer (f/m/d)' 'DATA ENGINEER (BIOTECH)'
 'Data Engineer (AI)' 'Part-time - Junior Data Analytics Engineer'
 'Lead Data Engineer' 'DATA ENGINEER' 'Sr Data Engineer'
 'Data Analytics Engineer' 'IT Trainee Program - Junior Cloud Engineer'
 'Data Engineer - ML Ops' 'Staff Data Engineer'
 'Mastercard Graduate Launch Program 2023 - Data Engineer - Prague, Czech Republic'
 'Cloud Engineer - AWS' 'Data Engineer/Scientist' 'Cloud Engineer'
 'Datový analytik/Data Engineer' 'Senior Data Engineer' 'Data Engineer II'
 'Global Azure Data Architect (Engineer)'
 'Cloud Data Engineer - AI & Data Team'
 'Software Engineer - Secure Network Analytics & XDR'
 'Data Engineer / Analyst Senior - migrace do cloudu'
 'Cloud DevOps Engineer'
 'evergreen Public 360° Senior Cloud Data & AI Engineer'
 'Data Engine

In [33]:
dfs[salary_type] = filtered_df

##### 6.5 Denmark

In [34]:
salary_type = 'Denmark'

In [35]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

280 :
['Data Engineer' 'Lead Instrument Engineer'
 'Data Scientist / Machine Learning Engineer, FinTech Remote'
 'Data engineer' 'Field Engineer - NextGen Graduate Program'
 'Lead Structural Engineer' 'Lead Electrical Engineer'
 'Security Engineer II - Product Security'
 'Data Engineer for Research Data Management' 'Analytics Engineer'
 'Student Intern - Road Engineer for COWI in Odense, Denmark'
 'C++ Software Engineer' 'Machine Learning Engineer' 'Systems Engineer'
 'Data Manager' 'Data Engineers and Data Analysts for Copenhagen'
 'Data Engineer for Telia'
 'R&D Engineer Intern – for fast growing MedTech Start-Up' 'Engineer'
 'Forward-deployed Data Engineer' 'Operations Engineer til Big Data'
 'Data Platform Engineer' 'Experienced Data Engineer'
 'Energy analyst with a flair for data' 'Senior Data Engineer - Viby'
 'Machine Learning Engineer Intern – for fast growing MedTech Start-Up'
 'Senior Engineer' 'Fullstack Software Engineer'
 'Staff Software Engineer, Eats | Aarhus, Denmark'


In [36]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

67 :
['Data Engineer' 'Data engineer'
 'Data Engineer for Research Data Management' 'Analytics Engineer'
 'Data Manager' 'Data Engineers and Data Analysts for Copenhagen'
 'Data Engineer for Telia' 'Forward-deployed Data Engineer'
 'Operations Engineer til Big Data' 'Data Platform Engineer'
 'Experienced Data Engineer' 'Senior Data Engineer - Viby'
 'Software Engineer II - Vulnerability Platform' 'Data Center Engineer'
 'Staff Software Engineer, Fleet Reliability and Performance'
 'Data Engineer Consultant' 'Data Engineer til ny dataplatform'
 'Data Engineer til bekæmpelse af skatteunddragelse'
 'Associate Data Engineer' 'Data Engineer - QuantumBlack'
 'Vi søger en Data Engineer som vil være med' 'Data Engineer Director'
 'Lead Data Engineer' 'Data Engineer - Consulting'
 'Sr Software Engineer, Fleet Reliability and Performance'
 'Head of Data & Insights and RTE (m/f/d)' 'Data Engineer (Nordic based)'
 'Azure Data Engineer'
 'Data Engineer – help develop our new data platform'
 'Softwa

In [37]:
dfs[salary_type] = filtered_df

##### 6.6 Finland

In [38]:
salary_type = 'Finland'

In [39]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

163 :
['SOFTWARE ENGINEER' 'Research Engineer/Scientist'
 'Backend engineer (Data science interest)' 'Data Engineer'
 'Data Engineer (Azure)' 'Data Engineer, Data Architect'
 'Data & Cloud Engineer, Betolar Oyj' 'Data Integration Engineer'
 'Grow as Data Engineer with us - Solita´s personalised onboarding program'
 'Sales Engineer' 'Data and Analytics Developer'
 'Data Scientist, Data Engineer, Data Architect' 'Desktop Engineer'
 'Data Engineers' 'IT Support' 'Consumer Data Specialist'
 'Embedded Software Engineer – location free (in Finland)'
 'Data Engineer, Tietohallinto, Helsinki'
 'Data engineer, Data architect, BI Consultant, Data specialist'
 'Data Engineer (AWS' 'Forward-deployed Data Engineer'
 'Optical System Engineer' 'DATA ENGINEER' 'Sr Data Engineer'
 'Junior Software Engineer' 'AI Engineer, Large Language Models'
 'Cloud Data Engineer' 'Lead Data Scientist/Engineer'
 'EUC L1 Support Engineer' 'Service Support Engineer'
 'Diffractive Optics Designer' 'Software Test Enginee

In [40]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

40 :
['Backend engineer (Data science interest)' 'Data Engineer'
 'Data Engineer (Azure)' 'Data Engineer, Data Architect'
 'Data & Cloud Engineer, Betolar Oyj' 'Data Integration Engineer'
 'Grow as Data Engineer with us - Solita´s personalised onboarding program'
 'Data and Analytics Developer' 'Data Engineers'
 'Consumer Data Specialist' 'Data Engineer, Tietohallinto, Helsinki'
 'Data engineer, Data architect, BI Consultant, Data specialist'
 'Data Engineer (AWS' 'Forward-deployed Data Engineer' 'DATA ENGINEER'
 'Sr Data Engineer' 'Cloud Data Engineer' 'Azure Data Engineer'
 'Data Engineer Specialising in Python and AzureDatabricks'
 'Senior Data Engineer' 'Cloud Data Platform Engineer'
 'Azure Developer | Azure Architect| Azure Data Engineer'
 'Data Architect (Azure)'
 'Data Engineer (Mid/Senior experience) - Fluent Finnish required'
 'Azure Data Platform Specialist / Azure Data Engineer' 'Data Architect'
 'Senior Azure Data Engineer' 'Lead Data Engineer, Helsinki,Finland'
 'Senior D

In [41]:
dfs[salary_type] = filtered_df

##### 6.7 France

In [42]:
salary_type = 'France'

In [43]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

288 :
['Data Engineer h/f - Paris'
 'Software Engineer F/H (Dev.se Full-stack, DevOps, SRE) - 63'
 'Manager QA Testing Automation H/F'
 'Senior Data Engineer (online marketplace development)'
 'Alternance Software Engineer C/C++ Data structure - Lyon (F/H)'
 'Software engineer fullstack - Lancement nouveau produit H/F'
 'Data Analytics Leader' 'CDD - DATA ENGINEER F/H'
 'Python Engineer on FAIR Data (relocation to Belgium)'
 'Data Engineer H/F - Remote & Hybride' 'Senior Cybersecurity Engineer'
 'Alternance - Data engineering F/H' 'Application Engineer'
 'Chef de projet Digital dans contexte Data & Aerospace F/H'
 'Cybersecurity Auditor' 'DATA ENGINEER (H/F)'
 'Principal Engineer .Net | Archi Microservice & Azure | Grand Groupe - 75/100K - H/F'
 'Data Engineer - Big Data - Nantes - F/H'
 'Développeur / Software Engineer F/H'
 'System Engineer for advanced projects'
 'CDI - Développeur PYTHON / Data engineer (Media) H/F'
 'Machine Learning Engineer en Alternance' 'Lead tech Data enginee

In [44]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

195 :
['Data Engineer h/f - Paris'
 'Senior Data Engineer (online marketplace development)'
 'Alternance Software Engineer C/C++ Data structure - Lyon (F/H)'
 'Data Analytics Leader' 'CDD - DATA ENGINEER F/H'
 'Python Engineer on FAIR Data (relocation to Belgium)'
 'Data Engineer H/F - Remote & Hybride'
 'Alternance - Data engineering F/H' 'DATA ENGINEER (H/F)'
 'Data Engineer - Big Data - Nantes - F/H'
 'CDI - Développeur PYTHON / Data engineer (Media) H/F'
 'Lead tech Data engineer' 'Data Engineer (H/F)'
 'Data Engineer - Expert en modélisation H/F'
 'CDI - Data Engineer (Média) (F/H)'
 'Data engineer - analyste développeur (H/F)' 'Data Manager H/F'
 'Data engineer - data factory (H/F)' 'Lead Data Analyst (H / F)'
 'CDI - Data Engineer F/H' 'SOFTWARE ENGINEER BIG DATA-(H/F)'
 'CDI - Cloud Data Engineer (Média) F/H' 'Data engineer - H/F'
 'Data Engineer avec une coloration devops - H/F' 'Data Engineer -(H/F)'
 'Data engineer H/F' 'Data architect - data factory (H/F)'
 'DATA ENGINEER (

In [45]:
dfs[salary_type] = filtered_df

##### 6.8 Germany

In [46]:
salary_type = 'Germany'

In [47]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

247 :
['Data Engineer (w/m/d)'
 '(Senior) Data Engineer (w/m/d) – Marketing & Communications'
 'Cloud Solution Engineer - BI/IoT on Azure (m/w/d)'
 'SAP Data Engineer in Kraichtal-Gochsheim mit Homeoffice'
 'Senior Data Engineer (m/f/d)'
 'Data Warehouse Architekt / Entwickler / Data Engineer (m/w/d)'
 'Software Engineer (w/m/d)'
 'Lead Analytics Engineer / BI Engineer (m/f/d)'
 'System Engineer Fahrerassistenzsysteme / Autonomes Fahren (m/w/d)'
 'Data Engineer (w/m/d) Automotive Testing Unit'
 'BI Engineer mit Schwerpunkt Analyse (w/m/d)'
 'Streaming Data Engineer (w/m/d)' 'DevOps Engineer (m/w/d)'
 'Data Engineer (m/w/d)'
 'Data Engineer (m/w/d) Big Data | Python | Azure in Ulm'
 'SENIOR DATA ENGINEER (M/W/D)' 'Consultant Data Engineer (m/w/d)'
 'Avionics System Engineer (f/w/d)'
 '(Senior) Data Engineer Analytics (m/w/d)'
 'Senior ML/AI Engineer (m/w/d)'
 'Big Data Engineer - Data Warehouse / E-Commerce / Big Data Architektur (m/w/d)'
 'RNA Data Engineer (w/m/d)' 'Manager Data Engin

In [48]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'], 
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

130 :
['Data Engineer (w/m/d)'
 '(Senior) Data Engineer (w/m/d) – Marketing & Communications'
 'Cloud Solution Engineer - BI/IoT on Azure (m/w/d)'
 'SAP Data Engineer in Kraichtal-Gochsheim mit Homeoffice'
 'Senior Data Engineer (m/f/d)'
 'Data Warehouse Architekt / Entwickler / Data Engineer (m/w/d)'
 'Lead Analytics Engineer / BI Engineer (m/f/d)'
 'Data Engineer (w/m/d) Automotive Testing Unit'
 'BI Engineer mit Schwerpunkt Analyse (w/m/d)'
 'Streaming Data Engineer (w/m/d)' 'Data Engineer (m/w/d)'
 'Data Engineer (m/w/d) Big Data | Python | Azure in Ulm'
 'SENIOR DATA ENGINEER (M/W/D)' 'Consultant Data Engineer (m/w/d)'
 '(Senior) Data Engineer Analytics (m/w/d)'
 'Big Data Engineer - Data Warehouse / E-Commerce / Big Data Architektur (m/w/d)'
 'RNA Data Engineer (w/m/d)' 'Manager Data Engineering'
 'DevOps Engineer (m/w/d) Data Management Solutions'
 'Data Platforms Engineer (m/w/x)'
 'Senior Test Engineer (m/w/d) Data Analytics Platform'
 'Data Engineer (m/w/d) B2B-Versandhandel'

In [49]:
dfs[salary_type] = filtered_df

##### 6.9 Greece

In [50]:
salary_type = 'Greece'

In [51]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

153 :
['Software Engineer C#, .NET (code_MSW)'
 'Structural Engineer - Πολιτικός Μηχανικός Δομοστατικός'
 'Junior Production Engineer (Data Analyst)'
 'Data Engineer / BI Engineer @Crete'
 'Data Engineer / BI Engineer @Patras'
 'Data Engineer / BI Engineer @ Ioannnina'
 'Data Engineer / BI Engineer @Thessaloniki'
 'Senior Site Reliability Engineer *Remote*' 'Azure Data Engineer'
 'Data Engineer' 'Junior Data Εngineer'
 'Sr. Manager, Regulatory Quality Assurance Data Solutions Engineer'
 'Engineer - Sustainability Consultant' 'Maintenance Engineer'
 'Software Engineer (On behalf of our Client)'
 'Junior Data Center Engineer' 'Junior Software Development Engineer'
 'Support Engineer'
 'Director, Go to Market Strategy and Transformation - REMOTE or FLEX'
 'Senior Network Engineer' 'Data Platform Engineer DWH'
 'Back-End Software Engineer' 'Embedded Software Engineer'
 'DevOps Engineer - Work from home' 'Web Software Engineer (Zing)'
 'Cyber Security Engineer' 'Junior Analytics Software En

In [52]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

43 :
['Junior Production Engineer (Data Analyst)'
 'Data Engineer / BI Engineer @Crete'
 'Data Engineer / BI Engineer @Patras'
 'Data Engineer / BI Engineer @ Ioannnina'
 'Data Engineer / BI Engineer @Thessaloniki' 'Azure Data Engineer'
 'Data Engineer' 'Engineer - Sustainability Consultant'
 'Junior Data Center Engineer' 'Data Platform Engineer DWH'
 'Junior Analytics Software Engineer (remote)'
 'Technical Data Engineer (Customer Excellence)'
 'Manager - Cloudflare and Security Operations Engineer'
 'Data Engineer Greece' 'Technology Consulting - Big Data Engineer'
 'Junior Data Engineer / Reporting Specialist'
 'Associate - Solution Engineer – Master Data'
 'Software Engineer - Data Platform' 'Sr. Big Data Engineer'
 'ML Ops and Data Engineer' 'Cloud Data Engineer' 'Senior Data Engineer'
 'Data Engineer / BI Consultant in Athens'
 'Data Engineer / BI Consultant in Thessaloniki' 'Big Data Engineer'
 'Data Engineers (Technology Associate Program(TAP)) @ Thessaloniki'
 'Data Engineer (

In [53]:
dfs[salary_type] = filtered_df

##### 6.10 Hungary

In [54]:
salary_type = "Hungary"

In [55]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

247 :
['Soldering Technology (f/m/div)*'
 'Factory Integration Engineer - Equipment Automation (f/m/div)*'
 'Electric Development Engineer (L&M ED) (f/m/div)*'
 'Outside Sales Representative, Focus on Eastern Europe'
 'Electrical Engineer / Physicist Electrical Development Power Semiconductors (f/m/div)'
 'Site Reliability Engineer' 'Data Engineer' 'Data Analyst'
 'Data Engineer (software/application)' 'Spark Data Engineer'
 'Lead Back-end & DevOps Engineer (Remote EU - fluent)'
 'AI Data Engineer Intern' 'Sr Data Engineer'
 'Full-Stack Software Engineer' 'Data Scientist Intern - EEA'
 'Deep Learning Engineer'
 'Data Engineer for Data Pipeline Development in automotive standards'
 'Software Engineer Intern - 8 months (Budapest, Hungary) - ET&I'
 'Senior Data Engineer' 'Cloud Data Engineer' 'Data Engineer gyakornok'
 '(Remote, Hungary) Senior Data Engineer'
 'Data Engineer for IoT (REF1449L)' 'Logistic Engineer'
 'Pályakezdő IT Szoftverfejlesztő' 'Adaptive AUTOSAR Software Engineer'
 'D

In [56]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

64 :
['Data Engineer' 'Data Engineer (software/application)'
 'Spark Data Engineer' 'AI Data Engineer Intern' 'Sr Data Engineer'
 'Data Engineer for Data Pipeline Development in automotive standards'
 'Senior Data Engineer' 'Cloud Data Engineer' 'Data Engineer gyakornok'
 '(Remote, Hungary) Senior Data Engineer'
 'Data Engineer for IoT (REF1449L)' 'Data Analytics Engineer'
 'Experienced Azure Data Engineer'
 'Cloud Native Software Engineer with Kafka Experience - Emerging Technologies and Incubation (ET&I)'
 'Lead Python Engineer (Data Enrichment)' 'Data DevOps Engineer'
 'Big Data Engineer' 'Cisco Data 2nd line engineer'
 'Lead Data Software Engineer' 'Staff Software Engineer – Data Enablement'
 'Data Security Engineer' 'Data Platform Engineer'
 'Software Engineer (mid or senior level) for Data Platform'
 'Data Engineer Lead' 'Data Integration Engineer'
 'Test Engineer for Cloud Billing - REF989F'
 'Senior Technical Leader / Data Engineer' 'SENIOR DATA ENGINEER'
 'Software Engineer - 

In [57]:
dfs[salary_type] = filtered_df

##### 6.11 Ireland

In [58]:
salary_type = 'Ireland'

In [59]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

441 :
['Process and applications engineer (f/m/d)-Automation Background'
 'Tendering Engineer' 'Validation Engineer'
 'Senior Infrastructure Engineer (Ireland)' 'Senior Automation Engineer'
 'Back End Engineer - OATS (Ireland)' 'Mechanical BIM Engineer'
 'Senior Engineer' 'Analytics Engineer, Digital Applications & Analytics'
 'QA Specialist' 'Senior Site Engineer - Dublin'
 'Automation Test Engineer, eCommerce' 'Principal Devops Engineer'
 'Splunk Engineer' 'Mechanical Engineer / CM - Leixlip'
 'Senior Site Engineer - Limerick/Cork/Kerry' 'Quality Engineer'
 'In-Die Engineer' 'IT Support Engineer - German Speaking'
 'M&E Project Manager' 'Lead Electrical Engineer' 'Site Engineer'
 'IT Support Engineer - Italian Speaking'
 'Technical Support Engineer - Performance'
 'Research & Development Engineer' 'Senior Software Engineer - DevSec Ops'
 'R&D Engineer' 'Inventory Manager' 'Manufacturing Engineer'
 'M+E Design Engineer'
 'Project Engineer (2 year fixed term contract) - Wexford, Irelan

In [60]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

62 :
['Mechanical BIM Engineer'
 'Analytics Engineer, Digital Applications & Analytics'
 'Google Cloud Platform - GCP Technical Architect'
 'BIM Engineer / Technician - Data Centres' 'BIM Engineer'
 'Principal Electrical Design Engineer - Data Centres'
 'Senior Technical Support Manager-Cloud'
 'Senior Software Engineer - Database Specialist (DT17890)'
 'BIM Lead -Industrial Projects' 'Senior Automation Engineer- BioPharma'
 'BIM Lead' 'Software Engineer - Product Data Quality'
 'QA Validation Specialist - Biotech' 'Data Engineer - Web Scraper'
 'Data Engineer' 'Database Engineer' 'Grade VII Data Engineer'
 'Data Centre Network Engineer'
 'Software Engineer, Data Pipelines - Opportunity for Working Remotely Cork,'
 'BI Developers, Data Architects, Data Engineers - Data Analytics, Digital'
 'Data Centre Engineer' 'Cloud Engineer ( Data Analytics) Hybrid'
 'Big Data Engineer' 'Software Development Engineer (Data Science Team)'
 'Quantitative Data Engineer' 'Senior Data Engineer- Hybrid'


In [61]:
dfs[salary_type] = filtered_df

##### 6.12 Israel

In [62]:
salary_type = "Israel"

In [63]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

330 :
['Data Engineer' '\uf4ccData Engineer\uf4cc(Hybrid' 'Junior Data Engineer'
 'Cloud Data Engineer' 'Junior Data Engineer לארגון מוביל'
 'Data Engineer - Data Assets Group' 'Storage Analytics - Data Engineer'
 'Deep Learning Engineer' 'Fullstack Engineer'
 'Junior Windows Low Level Engineer - Engine team'
 'Full-Stack Software Engineer III' 'Experienced Data Engineer'
 'Silicon Reliability Engineer' 'Manual QA Engineer'
 'Senior Data Engineer'
 'לחברת סטראט-אפ בתחום הסייבר דרוש\\ה Data Engineer' 'Big Data Engineer'
 'Senior Frontend Engineer' 'Data Platform Engineer' 'DATA ENGINEER'
 'Data Scientist - Thetaray' 'Data Tools Engineer'
 'Data Solutions Engineer' 'Application Developer'
 'Software Engineer, Infrastructure' 'QA Engineer'
 'Senior Big Data Engineer' 'Senior/Junior Data Engineer'
 'Data Engineer Team Leader' 'Integration Engineer Student'
 'Data Engineer לארגון מסווג' 'Automation Engineer - Data Group'
 'Back End Engineer' 'Data Engineer, Mobile Identity'
 'Software Engin

In [64]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

218 :
['Data Engineer' '\uf4ccData Engineer\uf4cc(Hybrid' 'Junior Data Engineer'
 'Cloud Data Engineer' 'Junior Data Engineer לארגון מוביל'
 'Data Engineer - Data Assets Group' 'Storage Analytics - Data Engineer'
 'Experienced Data Engineer' 'Senior Data Engineer'
 'לחברת סטראט-אפ בתחום הסייבר דרוש\\ה Data Engineer' 'Big Data Engineer'
 'Data Platform Engineer' 'DATA ENGINEER' 'Data Tools Engineer'
 'Senior Big Data Engineer' 'Senior/Junior Data Engineer'
 'Data Engineer Team Leader' 'Data Engineer לארגון מסווג'
 'Automation Engineer - Data Group' 'Data Engineer, Mobile Identity'
 'Data Engineer!' 'חברה מצליחה ומובילה מגייסת BI data engineer'
 'חברת טלדור מגייסת data Engineer לארגון מסווג במרכז הארץ'
 'Data Engineer לחברה פיננסית' 'Data Engineer Lead'
 'Data Engineer לחברת סטארטאפ'
 'Data Engineer לחברת סטארט -אפ מובילה בכפר סבא'
 'חברה טכנולוגית מצליחה מגייסת Data Engineer'
 'חברת SU מדהימה מגייסת Data Engineer' 'חברת Gaming מגייסת Data Engineer'
 'חברת הייטק גלובלית מחפשת Data Engine

In [65]:
dfs[salary_type] = filtered_df

##### 6.13 Italy

In [66]:
salary_type = 'Italy'

In [67]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

289 :
['IT Data Engineer' 'Digital ASIC Design Engineer'
 'Senior Application Engineer M/F' 'Progettista elettrico'
 'Network Engineer - Data Center Network Specialist'
 'SENIOR ENGINEER - SVILUPPATORE WEB / E-COMMERCE'
 'MECHANICAL PROPOSAL ENGINEER'
 'Stage progettista e consulente sistemi energetici'
 'Cerchiamo Architetti a Lucca' 'Sviluppatore software C++'
 'Data Engineer / ETL Developer' 'Impiegato amministrativo contabile'
 'Data Engineer - Powercenter' 'Process Control Engineering M/F'
 'Mechanical Design Engineer - Progettista Meccanico'
 'SiC Process Engineering - Process Integration M/F'
 'PMO Junior -Neolaureato' 'Software Developer' 'Project Sourcing'
 'Backend Engineer' 'DATA ARCHITECT/FULL STACK DEVELOPER'
 'Software Engineer/Developer (C++)' 'Backend Developer | JEE'
 'Responsabile Compliance' 'Stagista/Tirocinante' 'Data Analyst'
 'Senior System Engineer Evangelist' 'Senior Piping Engineer'
 'RISORSA PER AREA TECNICA' 'Software Developer Junior' 'Data Engineer'
 'INGE

In [68]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

52 :
['IT Data Engineer' 'Network Engineer - Data Center Network Specialist'
 'Data Engineer / ETL Developer' 'Data Engineer - Powercenter'
 'Data Engineer' 'DataOps Engineer' 'Senior Data Engineer'
 'Rookie Data Engineer' 'BNL – DIT – Junior Data Engineer (m/f/x)'
 'SITE MANAGER - RESPONSABILE DI MANUTENZIONE' 'Lead Database Developer'
 'Data Engineer (Junior)' 'DATA ENGINEER' 'Data Engineer & Analyst Junior'
 'Software Engineer - Big Data' 'Manufacturing Data Engineer'
 'Junior Data Engineer - Data Integration' 'Mechatronic & Data Engineer'
 'Junior Data Engineer' 'DATA ARCHITECT JUNIOR'
 'Junior Functional Analyst/IT Data Engineer ambito Sistemi di Pagamento'
 'Big Data Engineer, Machine Learning Specialist, Big Data Analyst'
 'Data Engineer Junior' 'ICT Data Engineer'
 'Specialist Software Engineer (Data Protection Service)' 'Data engineer'
 'JR DATA SCIENCE ENGINEER'
 'DATA WAREHOUSE ENGINEER | BIG DATA ENGINEER | DATA ENGINEER'
 'Microsoft Data Engineer' 'Data Engineer - Data Off

In [69]:
dfs[salary_type] = filtered_df

##### 6.14 Luxembourg

In [70]:
salary_type = 'Luxembourg'

In [71]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

176 :
['Senior DevOps Engineer Elastic Stack' 'Architect Security'
 'Full Stack Developer' 'IT Class September 2023' 'Data Scientist'
 'Data Engineer' 'Data Engineer (ETL)' 'DATA ENGINEER'
 'Data Engineer (H/F)' 'R&D Engineer' 'Data Engineer (m/f/gn)'
 'Senior Data Engineer (F/M)' '2023 Data Engineer Internship'
 'Senior Data Engineer (f/m/d)' 'Test Engineer'
 'Software Engineers Data (with ETL specialization) (M/F)'
 'Internship as control systems engineer (F/M)'
 'Advanced Analytics & Big data – Senior Data Scientist (M/F)'
 'Application Engineer (IT) / Freelance'
 'Energy and Decarbonisation engineer – Rolling Mills (H/F)'
 'Security Operations Engineer (F/M)'
 'Corporate R&D & Industrialization Engineer (m/f)'
 'Software Engineer PL/SQL'
 'Information Security Data Loss Prevention (DLP) Engineer (m/f)'
 'Data Architect/Senior Data Engineer (m/f/n)' 'Engineer Aircraft Systems'
 'Senior Software Engineer (Full Stack)' 'Data Development Engineer (M/W)'
 'Senior Test Analyst / Test Eng

In [72]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

33 :
['Data Engineer' 'Data Engineer (ETL)' 'DATA ENGINEER'
 'Data Engineer (H/F)' 'Data Engineer (m/f/gn)'
 'Senior Data Engineer (F/M)' '2023 Data Engineer Internship'
 'Senior Data Engineer (f/m/d)'
 'Software Engineers Data (with ETL specialization) (M/F)'
 'Information Security Data Loss Prevention (DLP) Engineer (m/f)'
 'Data Architect/Senior Data Engineer (m/f/n)'
 'Data Development Engineer (M/W)'
 'Software Engineers Data with ETL specialization – FR/EN (F/M)'
 'Cloud Engineer / Architect' 'Sr. Technical Program Manager - Cloud'
 'Cloud Engineer (m/f)' 'Team leader cloud virtualization engineer'
 'Senior Software-Defined Data Center Engineer' 'Data Centre Engineer'
 'data engineer (m/f)' 'Data Engineer Cloud / Lead Dev Senior (H-F)'
 'Analytics Engineer (m/f/d)' 'Senior Data Engineer (m/f/d)'
 'CloudOps Engineer' 'ETL/Data Engineer (m/w/d)'
 'Microsoft Cloud Engineer' 'Senior Cloud Engineer (m/f/x)'
 'Cloud on Prem Engineer - O' 'Analytics Engineer'
 'Senior Data Privacy Consu

In [73]:
dfs[salary_type] = filtered_df

##### 6.12 Netherlands

In [74]:
salary_type = 'Netherlands'

In [75]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

126 :
['Lead Data Engineer' 'Data Engineer' 'Supply Chain Engineer'
 'Services Engineer Networking' 'QHSE Manager' 'DevOps Engineer'
 'ILS Engineer' 'Application Engineer - portal beheerder'
 'Senior AWS Cloud Engineer' 'Production Engineer'
 'R&D Energie in Gebouwen' 'Data Engineer DWH & BI'
 'Senior Database Specialist' 'Medior/Senior Embedded Systems Engineer'
 'Maintenance Engineer'
 'Geo-ICT Traineeship, open voor alle achtergronden' 'Engineer'
 '(Senior) MES Engineer (Relocation)' '(Junior) Presales Engineer - Breda'
 'Cloud Workspace Engineer' 'Data engineer' 'Systeembeheerder'
 'Data engineer toezicht, bezwaar en klantinteractie' 'DataHub architect'
 'Slimme modelleur van digitale woningen'
 'IAM Security Engineer Trust Services' 'Security Engineer'
 'Service Engineer Waterstof' 'Thermal Systems Engineer Heat Pump system'
 'Proces Engineer Tilburg' 'Lead Software Engineer'
 'Helicopter Engineer AW-139' 'Monteur Elektrotechniek'
 'Platform Data Engineer' 'service engineers'
 'We

In [76]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

27 :
['Lead Data Engineer' 'Data Engineer' 'Senior AWS Cloud Engineer'
 'Data Engineer DWH & BI' 'Senior Database Specialist'
 'Cloud Workspace Engineer' 'Data engineer'
 'Data engineer toezicht, bezwaar en klantinteractie' 'DataHub architect'
 'Platform Data Engineer' 'Data DevOps Engineer'
 'Engineering Service Technician / Fietsenmaker / Monteur (E-Bike)'
 'Systeem Data Engineer' 'Engineering Manager - Big Data'
 'Senior Data Engineer Klantinteractie'
 'Engineering Manager - Site Reliability, FinTech'
 'Master Data Specialist' 'BI-consultant' 'BI Developer'
 'Operationeel/Technisch netwerk monitoring specialist'
 'Datawarehouse developer' 'Engineering Manager - Cloud Security'
 'Technical Outsource Manager Data Management'
 'Engineering Manager - Observability' 'Medior Data Engineer'
 'Junior Data Engineer' 'Junior Data engineer']


In [77]:
dfs[salary_type] = filtered_df

##### 6.13 Norway

In [78]:
salary_type = 'Norway'

In [79]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

111 :
['Cloud Services Engineer (Based in Germany)'
 'Contracts Officer (Based in Germany)'
 'Data Governance Engineer (m/f/d) 80% Homeoffice' 'Data Engineer'
 'Sommerjobb for data scientists / data engineers — Oslo'
 'Data Engineer (Java)' 'Data Engineer (m/f/d)'
 'Data Engineer – Java Developer'
 'Data Engineer | NorgesGruppen | Skøyen, Oslo'
 'Software Support Engineer' 'Support Engineer'
 'Senior Full Stack Engineer GIS (remote)'
 'VIE Program_FLOW INSSURANCE ENGINEER (M/F)_NORWAY'
 'Data Center ICT Engineer / ICT Engineer Data Center'
 'Customer Support Engineer (All Levels)'
 'Operations Engineer (Stavanger)'
 'Senior Engineer for our global frontend engineering team at the HISP Centre.'
 'ICT Engineer / ICT Technician' 'DATA ENGINEER'
 'Machine Learning / Software Engineer'
 'We are looking for a Senior Data Engineer to join Cognizant!'
 'Senior Hydrodynamic Engineer' 'Petroleum Engineer'
 'Data Scientist Biodiversity Informatics'
 'Software Engineer - App Stores Backend (Remote

In [80]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

24 :
['Cloud Services Engineer (Based in Germany)'
 'Data Governance Engineer (m/f/d) 80% Homeoffice' 'Data Engineer'
 'Data Engineer (Java)' 'Data Engineer (m/f/d)'
 'Data Engineer – Java Developer'
 'Data Engineer | NorgesGruppen | Skøyen, Oslo'
 'Data Center ICT Engineer / ICT Engineer Data Center' 'DATA ENGINEER'
 'We are looking for a Senior Data Engineer to join Cognizant!'
 'Løsningsarkitekt Analytics / Data Engineer' 'Cloud Engineer'
 'Data Engineer - Ocean' 'Azure Data Engineer with DataBricks - Oslo'
 'Cloud engineer' 'Data/ML Engineer'
 'Lead Software Engineer, Global Bill Pay'
 'Graduate Cloud Native Platform Engineer - Norway'
 'DevOps Engineer Network Cloud' 'IT Operations Cloud Engineer'
 'Senior Cloud Engineer' 'Mobile Core Network Engineer'
 'Integration Engineer - Cloud Solutions to Ericsson in Fornebu Oslo.'
 "Klar for å løfte BI's skyplattformer til nye høyder i rollen som Cloud Engineer?"]


In [81]:
dfs[salary_type] = filtered_df

##### 6.14 Poland

In [82]:
salary_type = 'Poland'

In [83]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

143 :
['Backend Product Software Engineer' 'Front End Product Software Engineer'
 'Infrastructure Software Engineer'
 'Senior Backend Product Software Engineer'
 'Senior Infrastructure Software Engineer'
 'Senior Front End Product Software Engineer'
 'Cloud Services Engineer (Based in Germany)'
 'Electro Mechanical Design Engineer'
 'Technical Reporting & Compliance Lead' 'Data Engineer (DWH) (m/f/d)'
 'Lead .NET Software Engineer (Remote)'
 'Advanced Software Engineer - AI (remote)'
 'Senior Software Engineer - AI (remote)'
 '.NET Senior Software Engineer (Remote or Hybrid)'
 'Advanced Software Engineer - AI'
 'Advanced Software Engineer (Enrichment)'
 '.NET Senior Software Engineer (Remote)'
 '.NET Software Engineer - Search Excellence'
 'Senior Software Engineer with Elasticsearch'
 'Senior Fullstack Software Engineer (.NET) - Commodity Insights'
 'Contracts Officer (Based in Germany)' 'Full Stack Developer'
 'Lead Software Engineer with Elasticsearch'
 'Senior Fullstack Software En

In [84]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

77 :
['Cloud Services Engineer (Based in Germany)'
 'Data Engineer (DWH) (m/f/d)' 'Data Engineer'
 'Data Engineer - Summer Internship'
 'Sr. Data Engineer (Snowflake, Matillion ETL, Python)'
 'Data Engineer for Machine Learning in Computer Vision (Robotics)'
 'Data Analyst/Engineer Internship' 'Data Engineer Intern'
 'Crypto Data Engineer Intern (Remote)' 'Data Analysis Engineer'
 'DATA ENGINEER' 'Junior DWH Data Engineer (get the DWH skill)'
 'Process Data Engineer - Subsurface' 'Junior Process Data Engineer'
 'Data Engineer (Hybrid)'
 'Data Engineer with Scala - Anti-Money Laundering' 'Senior Data Engineer'
 'Snowflake Data Engineer (zdalnie)' 'Cloud Data Engineer'
 'Data Engineer (Microsoft)' 'Data Engineer (Databricks)'
 'IT Data Lake Engineer' 'Senior Data Engineer - Oracle'
 'Big Data Engineer' 'Lead Data Engineer (remote Europe)'
 'Product Data Management Engineer' 'Data & BI Developer' 'Data engineer'
 'Python Data Engineer (Snowflake)'
 'Data Engineer - NeoXam (financial secto

In [85]:
dfs[salary_type] = filtered_df

##### 6.15 Portugal

In [86]:
salary_type = 'Portugal'

In [87]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

525 :
['Senior AWS DevOps Engineer' 'Full Stack Developer' 'Data Engineer'
 'Software Engineer, Data Architecture'
 'Ingestion Senior Data Engineer (f/m/d)' 'SQL Engineer'
 'Senior Data Engineer- Analytics' 'Software Integration Engineers'
 'Aptiv.io - Data Engineer - Remote' 'Software Engineer (SAP PLM)'
 'Software Q&A Engineer/Technician' 'Lead Data Engineer'
 'Technical Support Engineer'
 'Senior Data Engineer - LatAm, Western or Eastern Europe'
 'Senior Backend Software Engineer - AWS / APIs / Databases / Home Office (m/f/d)'
 'Data Engineer Trainee (m/f/d)' 'Business Analyst - (Remote)'
 'Back-End Engineer (Home-Based Portugal)' 'Data Engineer | AWS'
 'Release & Code Analyst' 'Machine Learning Engineer'
 'Data Engineer - Azure Datafactory' 'DATA ANALYST ENGINEER (M/F/D)'
 'Microsoft Dynamics 365 (CRM) integration Developer'
 'Cloud Data Engineer' 'Data Engineer (M/F)'
 'Senior Software Engineer (SAP PLM)' 'DATA ENGINEER'
 'DATA ENGINEER - POWERBI & AZURE'
 'DATA ENGINEER SNOWFLAKE

In [88]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'], 
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

123 :
['Data Engineer' 'Software Engineer, Data Architecture'
 'Ingestion Senior Data Engineer (f/m/d)'
 'Senior Data Engineer- Analytics' 'Aptiv.io - Data Engineer - Remote'
 'Lead Data Engineer'
 'Senior Data Engineer - LatAm, Western or Eastern Europe'
 'Senior Backend Software Engineer - AWS / APIs / Databases / Home Office (m/f/d)'
 'Data Engineer Trainee (m/f/d)' 'Data Engineer | AWS'
 'Data Engineer - Azure Datafactory' 'DATA ANALYST ENGINEER (M/F/D)'
 'Cloud Data Engineer' 'Data Engineer (M/F)' 'DATA ENGINEER'
 'DATA ENGINEER - POWERBI & AZURE'
 'DATA ENGINEER SNOWFLAKE/ AZURE DATABRICKS' 'TECH LEAD DATA ENGINEER'
 'SENIOR DATA ENGINEER' 'SQL DATA ENGINEER (F/M/D)'
 'TECH LEAD/DATA ENGINEER-TALEND' 'SENIOR INGESTION DATA ENGINEER (F/M/D)'
 'Senior Data Engineer' 'DATA ENGINEER (M/F/D)'
 'INGESTION DATA ENGINEER (F/M/D)' 'Data Engineer - (Viator)'
 'ETL Data Engineer' 'SQL Data Engineer' 'Data Software Engineer - Senior'
 'Azure Data Engineer' 'Data Engineer (Spark/Databricks)'


In [89]:
dfs[salary_type] = filtered_df

##### 6.16 Romania

In [90]:
salary_type = 'Romania'

In [91]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

418 :
['Data Engineer' 'Functional Safety Engineer (m/f/d)'
 'Cloud Services Engineer (Based in Germany)'
 'Sr. Backend Software Engineer - Python, Go, OpenSearch a.k.a Elasticsearch, MySQL, AWS (Romania)'
 'Backend Software Engineer (Go, AWS, Cassandra) - Cloud Security (Remote or Hybrid, Romania)'
 'Software Backend Engineer (Discover), Remote or Hybrid'
 'Software Development Engineer in Test (Java) - Sandbox | Romania (Remote)'
 'Frontend Engineer (Discover), (Remote)'
 'Cloud Developer | Discover | Golang, Kafka, Cassandra, AWS - Remote or Hybrid'
 'Software Test Engineer' 'OutSystems Software Engineer'
 'Back-End Software Engineer' 'Data Engineer - Talend'
 'Frontend Engineer - Partner Data' 'Data Lake Engineer'
 'Python Data Engineer' 'Big Data Engineer (f/m/d)'
 'Fixed Data Configuration 2nd Line Engineer'
 'CSA SIEM Engineer (Chronicle)'
 'Tableau/Elastic Search/Hadoop ( Data lake) Engineer for...'
 'IPB Agile - L2/RM engineer' 'Senior Consultant (f/m) Data Engineer'
 'Data En

In [92]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

97 :
['Data Engineer' 'Cloud Services Engineer (Based in Germany)'
 'Backend Software Engineer (Go, AWS, Cassandra) - Cloud Security (Remote or Hybrid, Romania)'
 'Cloud Developer | Discover | Golang, Kafka, Cassandra, AWS - Remote or Hybrid'
 'Data Engineer - Talend' 'Data Lake Engineer' 'Python Data Engineer'
 'Big Data Engineer (f/m/d)' 'Fixed Data Configuration 2nd Line Engineer'
 'Tableau/Elastic Search/Hadoop ( Data lake) Engineer for...'
 'Senior Consultant (f/m) Data Engineer' 'Data Engineer with Python'
 'Software Test Engineer, FitbitOS Release Testing'
 'Data Engineer (C#, Python, Elastic)' 'Data Engineer Manager'
 'Data Warehouse Engineer' 'Azure Data Engineer'
 'ETL Engineer|Unite Data Management @ ING Hubs Romania'
 'Data Engineer (F/M)' 'Data Engineer (Python)'
 'Java Engineer - DBNL Data Management @ING Hubs Romania'
 'Software Test Engineer, Fitbit Activity' 'Data Engineer and Analyst'
 'Data Solution Engineer' 'Senior AWS Cloud Infrastructure Engineer'
 'Mid-Senior Da

In [93]:
dfs[salary_type] = filtered_df

##### 6.17 Spain

In [94]:
salary_type = 'Spain'

In [95]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

129 :
['Data Engineer'
 'Project Manager / Engineer Smart Cities & Smart Grids (m/f/x)'
 'Technical Service Engineer' 'Senior IT Architect'
 'Systems Engineer Future Combat Air System Relocation to Germany (f/m/d)'
 'Research Engineer (m/f/x/d)' 'M3 Functional Analyst (Remote Role)'
 'Senior Data Engineer - BCG X' 'Data Engineer, 100% En remoto'
 'Python Data Engineer (Remote)' '(Mid) Data Engineer'
 'Data Engineer entry level (20 - 30K)' 'JUNIOR DATA ENGINEER'
 'DevOps Engineer - Work from home'
 'Mobility Analytics Product Engineer & Data Scientist'
 'Java R&D Engineer *REMOTE* f/m' 'Head of Product Data - 100% Remote'
 'Junior Data Engineer'
 'Data Engineer Spark - B2C1 - Remoto 100%, 100% En remoto'
 'Data Engineer PowerCenter, 100% En remoto'
 'Data Engineer exp +2 (30-40K)' 'Data Analyst, GeoAnalytics – BCG X'
 'EMEA Data Scientist- Remote (open to candidates across Europe)'
 'Data Engineer (Experimentation)'
 'Lead Data Engineer (Google Cloud, Scala, CI/CD)'
 'Senior Cloud R&D E

In [96]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'], 
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

78 :
['Data Engineer' 'Senior Data Engineer - BCG X'
 'Data Engineer, 100% En remoto' 'Python Data Engineer (Remote)'
 '(Mid) Data Engineer' 'Data Engineer entry level (20 - 30K)'
 'JUNIOR DATA ENGINEER' 'Head of Product Data - 100% Remote'
 'Junior Data Engineer'
 'Data Engineer Spark - B2C1 - Remoto 100%, 100% En remoto'
 'Data Engineer PowerCenter, 100% En remoto'
 'Data Engineer exp +2 (30-40K)' 'Data Engineer (Experimentation)'
 'Lead Data Engineer (Google Cloud, Scala, CI/CD)'
 'Senior Cloud R&D Engineer f/m' 'Data Engineer (I&D)'
 'Data Engineer/Power BI Developer, Barcelona'
 'Senior Data Engineer - SCALA' 'Senior Data Engineer'
 'Google Professional Data Engineer' 'Data Visualization Engineer'
 'Data Engineer _ Analytics and Modeling analyst'
 'Senior Data Engineer (40k-50k)' 'Data Engineer - PagoNxt'
 'Data Engineer AWS, Madrid' 'Data Engineer Intern' 'Data Engineer AWS'
 'IA and Data Engineer (Hybrid)' 'Junior Data Engineer (OF0523)'
 'Data Software Engineer' 'Data Engineer 

In [97]:
dfs[salary_type] = filtered_df

##### 6.18 Sweden

In [98]:
salary_type = 'Sweden'

In [99]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

185 :
['Kreativ BI-utvecklare med passion för dataanalys'
 'Data Governance Engineer (m/f/d) 80% Homeoffice'
 'Examensarbete - Inverkan av materialstruktur på livslängd hos polykristallina superlegeringar'
 'DevOps Engineer to Zebware'
 'Manager Parts Business - STG Steam Turbines, Finspång fleet'
 'Service Application Engineer' 'Head of Azure till Bitlog'
 'Network Data Engineer' 'Battery Module Safety Engineer'
 'Process Manager (m/f/d) 80% Homeoffice'
 'Data/ cloud engineer for a global telecom company!'
 'Senior Software Engineer / Architect for biotech company'
 'Public Cloud Engineer Azure' 'SIEM Engineer' 'Data Engineer'
 'Junior Data Engineer' 'Backend engineer with a love for gaming'
 'Data Engineer (Stockholm)'
 'Data Engineer (Power BI/SQL) - Health Analytics'
 'Azure AI/Data Engineer' 'Junior Engineer - JavaScript'
 'Frontend Engineer' 'Lead CI Engineer till Tutus Data AB!'
 'Junior Systemutvecklare till Axis i Lund!'
 'Ingenjörer / Produkt testingenjörer till Sigma Connect

In [100]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

85 :
['Data Governance Engineer (m/f/d) 80% Homeoffice'
 'Manager Parts Business - STG Steam Turbines, Finspång fleet'
 'Head of Azure till Bitlog' 'Network Data Engineer'
 'Data/ cloud engineer for a global telecom company!'
 'Senior Software Engineer / Architect for biotech company'
 'Public Cloud Engineer Azure' 'Data Engineer' 'Junior Data Engineer'
 'Data Engineer (Stockholm)'
 'Data Engineer (Power BI/SQL) - Health Analytics'
 'Azure AI/Data Engineer' 'Lead CI Engineer till Tutus Data AB!'
 'Data Engineer to Business Intelligence & Analytics...'
 'Data Engineer (Google Looker) - Fully Remote (LATAM)'
 'Senior Data Engineer' 'Data Engineer to Vetfamily'
 'DATA ENGINEER AHLSELL' 'Data Infrastructure Engineer'
 'Data Engineer - Nyfiken på nya utmaningar?' 'Data Science Engineer'
 'Data Engineer – Data platform & device telemetry'
 'IT solution engineer / Cloud engineer till QD'
 'Purchasing Data Engineer' 'Software Engineer - Data Platform'
 'Data Engineer for Deep Learning within A

In [101]:
dfs[salary_type] = filtered_df

##### 6.19 Switzerland

In [102]:
salary_type = 'Switzerland'

In [103]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

136 :
['Data Engineer (w/m) | 80% oder mehr'
 'Leiter Data Analytics & Data Science (a)'
 'Big Data & Platform Engineer 80-100%'
 'Data Engineer Messtechnik (m/w/d)' 'Data Scientist (60%-100%)'
 'Platform-Engineer Information Management (w/m/d)'
 'Software Systems Engineer - Client Services Technology'
 'Data Engineer / Analyst (m/f/d)'
 'Manufacturing Engineer - Medical Device'
 'Software Systems Engineer – Business Support Services Technology'
 'System Engineer (RFID)'
 'Global Student Internship in Innovation & Sustainability - Drive Sustainable Change through Robotic Solubility Determination (m/f/d; starting in July 2023 for 3 months, with a possibility of extension)'
 'Product Engineer - ArcGIS JavaScript API'
 'Senior Power BI Engineer (m/w/d)' 'Data- & Integration Engineer (w/m/d)'
 'Cold Chain Packaging Engineer - temporary for 12 months'
 'Development Reliability Engineer (contract)'
 'Automation Engineer (m/w/d)' 'Senior Quality Engineer (m/f/d)'
 'IT Test Engineer'
 'Global 

In [104]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

51 :
['Data Engineer (w/m) | 80% oder mehr'
 'Big Data & Platform Engineer 80-100%'
 'Data Engineer Messtechnik (m/w/d)' 'Data Engineer / Analyst (m/f/d)'
 'Senior Power BI Engineer (m/w/d)' 'Data- & Integration Engineer (w/m/d)'
 'Postdoctoral Fellow, Retinal Organoid Bioengineering (m/f/d)'
 'Data Management Engineer (m/f)' 'DevOps Data Engineer 100%'
 'Data Engineer / Analyst'
 'Belle opportunité pour un/e Data Engineer Junior - Urgent - Banque'
 'Mobile Software Engineer Flutter' 'Data Management Engineer (w/m/d)'
 'Scientist/Engineer at the cryoEM facility at Biozentrum'
 'Azure Data Engineer - Spark' 'IT Data Engineer'
 "Manager de l'équipe Data Engineer"
 'Data Engineer mit mehrjähriger Erfahrung im SQL Bereich (m/w/d)'
 'Senior Data Science Engineer & Business Intelligence Developer (Power BI) (m/w/d)'
 'ICT System Engineer (m/w/d) Datacenter und Cloud'
 'Big Data Engineer (m/w/d)' 'IT System Engineer - Data Center (m/w/d)'
 'Data Engineer (m/w/d)' 'Data Engineer - Sustainabili

In [105]:
dfs[salary_type] = filtered_df

##### 6.20 Turkey

In [106]:
salary_type = 'Turkey'

In [107]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

105 :
['Key Account Engineer (w/m/x)' 'Technology Service Engineer (Turkey)'
 'DevOps Engineer / IT Support (m/w/d) - Remote/(Home) Office'
 'Service Engineer - Turbocharging / Servis Mühendisi - Turbocharging (80-100%)'
 'DevOps Engineer - Remote' 'Data Engineer - Remote'
 'Machine Learning Engineer With Data Science Experience' 'Data Engineer'
 'Big Data & AI Engineer' 'Network Data Solutions Delivery Engineer'
 'Engineer, Field Applications'
 '2023 Bilişim Vadisi Stajyer İstihdam Programı' 'Stajyer'
 'Analytics Engineer' 'Continuous Improvement Engineer'
 'Customer Engineer, Data Analytics, Google Cloud'
 'Software Test Engineer' 'QA Engineer' 'Artificial Intelligence Engineer'
 'Üretim Sorumlusu' 'Computer Vision Engineer' 'Civil Engineer (m/f/d)'
 'HSE Engineer' 'Senior Data Engineer'
 'Full stack Developer - Angular deneyimli önemli - Remote'
 'DWH ETL Engineer - Remote' 'Platform Engineer - Devops - Remote'
 'Engineer - Product Definition - Hybrid' 'Jr DevOps Engineer'
 'Smart S

In [108]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

17 :
['Data Engineer - Remote' 'Data Engineer' 'Big Data & AI Engineer'
 'Network Data Solutions Delivery Engineer' 'Analytics Engineer'
 'Customer Engineer, Data Analytics, Google Cloud' 'Senior Data Engineer'
 'DWH ETL Engineer - Remote' 'Software & Big Data Engineer'
 'Database Engineer - Remote' 'Software Engineer - Data Platform'
 'data engineer' 'Planning Engineer @Siemens Mobility Türkiye'
 'Site Installation Engineer @Siemens Mobility Türkiye' 'Data Engineer II'
 'Cloud Engineer (Kubernetes)'
 'Security and Personal Data Protection Engineer']


In [109]:
dfs[salary_type] = filtered_df

##### 6.21 United_Kingdom

In [110]:
salary_type = 'United_Kingdom'

In [111]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

47 :
['Data Engineer' 'Data Engineer - Submarines' 'Data Engineer (Junior)'
 'Senior Data Engineer' 'Machine Learning Data Engineer'
 'Data Engineer (100% Remote)' 'Prover BI Engineer'
 'Geospatial Data Engineer' 'Trainee Data Engineer'
 'Data Engineer - London' 'Data Engineer/ ETL Engineer'
 'Junior Data Engineer' 'Lead Data Engineer' 'Graduate Data Engineer'
 'Data Pipeline Engineer' 'Financial Crime Data Engineer' 'BI Engineer'
 'Power BI Developer' 'Customer Lead Data Engineer' 'Data Engineer (Java)'
 'Azure Data Engineer' 'Data Developer Engineer'
 'Junior Software Engineer (C#, SQL)'
 'Analyst, Data Engineer, AI & Data, 12 Week Placement (FTC / Secondment)'
 'GCP Data engineer' 'Azure Data Engineer - 6 month contract'
 'UK Internship Programme - Data Engineering'
 'Software Engineer - Data Platform' 'Data Engineer - Snowflake'
 'Data Engineer - Outside IR35 - Python, Spark, SQL'
 'Data DevOps Engineer' 'Data Engineer - Geospatial'
 'Graduate Data Engineer - Bristol' 'Junior SQL e

In [112]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
            job_title=row['Job_title']
        ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

45 :
['Data Engineer' 'Data Engineer - Submarines' 'Data Engineer (Junior)'
 'Senior Data Engineer' 'Machine Learning Data Engineer'
 'Data Engineer (100% Remote)' 'Prover BI Engineer'
 'Geospatial Data Engineer' 'Trainee Data Engineer'
 'Data Engineer - London' 'Data Engineer/ ETL Engineer'
 'Junior Data Engineer' 'Lead Data Engineer' 'Graduate Data Engineer'
 'Data Pipeline Engineer' 'Financial Crime Data Engineer' 'BI Engineer'
 'Power BI Developer' 'Customer Lead Data Engineer' 'Data Engineer (Java)'
 'Azure Data Engineer' 'Data Developer Engineer'
 'Analyst, Data Engineer, AI & Data, 12 Week Placement (FTC / Secondment)'
 'GCP Data engineer' 'Azure Data Engineer - 6 month contract'
 'UK Internship Programme - Data Engineering'
 'Software Engineer - Data Platform' 'Data Engineer - Snowflake'
 'Data Engineer - Outside IR35 - Python, Spark, SQL'
 'Data DevOps Engineer' 'Data Engineer - Geospatial'
 'Graduate Data Engineer - Bristol' 'Forward Deployed Data Engineer'
 'Data Platform En

In [113]:
dfs[salary_type] = filtered_df

##### 6.22 USA 🦅

In [114]:
salary_type = 'United_States'

In [115]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

82 :
['Junior Data Engineer (Remote)' 'AWS Data Engineer' 'Data Engineer'
 'Azure Data Engineer' 'Data Engineer - REMOTE' 'Data Engineer - Flink'
 'Snowflake Data Engineer' 'Software/Data Engineer (Entry'
 'Data Analytics Engineer - GCP' 'MLOps Engineer / Data Scientist'
 'Data Pipeline Engineer' 'Big Data Engineer'
 'Azure Data Engineer (US citizens or GC holders) - Locals ONLY'
 'Data Analytics Engineer' 'GCP Data Engineer'
 'Big Data Developer (Data Engineer)' 'Big Data Operations Engineer'
 'Healthcare AWS Data Engineer' '100% REMOTE // GCP Data Engineer'
 'Data Engineer I' 'Sr. Data Engineer' 'Senior Data Engineer'
 'Google Cloud Data Engineer' 'Data Engineer II' 'Network Data Engineer'
 'Data Engineer with Analytics Background'
 'Data Engineer with PL/SQL Developer' 'Senior Data Engineer, Analytics'
 'Data Migration Engineer' 'Data Engineer Level 3'
 'Data Engineer (Mid/Jr)' 'Data Integration Engineer(Banking)'
 'Data Visualization Engineer' 'Software Engineer (Data)'
 'Jr. Data 

In [116]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

79 :
['Junior Data Engineer (Remote)' 'AWS Data Engineer' 'Data Engineer'
 'Azure Data Engineer' 'Data Engineer - REMOTE' 'Data Engineer - Flink'
 'Snowflake Data Engineer' 'Software/Data Engineer (Entry'
 'Data Analytics Engineer - GCP' 'Data Pipeline Engineer'
 'Big Data Engineer'
 'Azure Data Engineer (US citizens or GC holders) - Locals ONLY'
 'Data Analytics Engineer' 'GCP Data Engineer'
 'Big Data Developer (Data Engineer)' 'Big Data Operations Engineer'
 'Healthcare AWS Data Engineer' '100% REMOTE // GCP Data Engineer'
 'Data Engineer I' 'Sr. Data Engineer' 'Senior Data Engineer'
 'Google Cloud Data Engineer' 'Data Engineer II' 'Network Data Engineer'
 'Data Engineer with Analytics Background'
 'Data Engineer with PL/SQL Developer' 'Senior Data Engineer, Analytics'
 'Data Migration Engineer' 'Data Engineer Level 3'
 'Data Engineer (Mid/Jr)' 'Data Integration Engineer(Banking)'
 'Data Visualization Engineer' 'Software Engineer (Data)'
 'Jr. Data Engineer' 'Big Data Engineer - PyS

In [117]:
dfs[salary_type].shape[0]

244

In [118]:
dfs[salary_type] = filtered_df

##### 6.23 Japan

In [119]:
salary_type = 'Japan'

In [120]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

300 :
['【rinna Summer Internship】技術職・Technical Role (Software Engineer Intern)'
 'Internship (Frontend Engineer)' 'Data Engineer/Scientist'
 'Technical Support Engineer - Fukuoka' 'Application Engineer'
 'GCP Professional Data Engineerの通信講座講師' 'IT Support engineer'
 'Principal Test Engineer' 'Mechanical (Physical Simulation) Engineer'
 'Data/Business Intelligence Engineer opening!'
 'データサイエンティスト / Data Scientist' 'VED_Vehicle Integration CAN HMI Engineer'
 'Internship (AI Engineer)' 'Wind Turbine Load Engineer'
 'Data Center Engineer' 'Electrical Engineer I'
 'Bilingual IT Support Engineer (Desktop & Network Infra)'
 'JR network engineer' 'Software Engineer - Data Platform' 'Data Engineer'
 'IT system engineer 1' 'Technical Support Engineer - Nagoya'
 'Desktop Support' 'データ・エンジニア / Data Engineer, Search Engine Analysis'
 'Data Center Engineer (Remote Hands)'
 'Staff Software Engineer - Applications Backend' 'Project Engineer'
 'データエンジニア (リアルタイムデータパイプライン) / Data Engineer (Realtime Data 

In [121]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

41 :
['Data Engineer/Scientist' 'GCP Professional Data Engineerの通信講座講師'
 'Data/Business Intelligence Engineer opening!'
 'Wind Turbine Load Engineer' 'Data Center Engineer'
 'Software Engineer - Data Platform' 'Data Engineer'
 'データ・エンジニア / Data Engineer, Search Engine Analysis'
 'Data Center Engineer (Remote Hands)'
 'データエンジニア (リアルタイムデータパイプライン) / Data Engineer (Realtime Data Pipiline)'
 'Data Center Customer Operations Engineer III'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 'Azure data engineer (DTS-180)'
 'Data and Digital Engineer Staff/Manager, Customer Experience Group, BIAHJ'
 'Senior Software Engineer - Data Platform'
 'Automation Data Governance Engineer / オートメーション データガバナンス エンジニア'
 'Data Platform Engineer' 'FT: Data Engineer'
 'Senior Client Engineer, Tools & Pipeline - Lightspeed'
 'データ分析基盤エンジニア/Data Analysis Infrastructure Engineer'
 'IT Lab Data Mgmt Senior System Engineer'
 'Production Services Engineer (Data Services)'
 'AI Research Engineer at Data Se

In [122]:
dfs[salary_type] = filtered_df

##### 6.24 Singapore

In [123]:
salary_type = 'Singapore'

In [124]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

142 :
['Data Engineer, ETL (SG)'
 'SAS Data Engineer, Data & Analytics, Technology Consulting'
 'Data Engineer' 'Junior Data Engineer'
 'Associate Data Centre Engineer (fresh graduates are welcome to apply)'
 'Data Centre Cabling & Hardware Support Engineer'
 'Data Centre Engineer, Analyst' 'Machine Learning / Data Engineer'
 'Data Center Engineer'
 'Associate Data Engineer, Data & Analytics - Technology Consulting'
 'Machine Learning Engineer - Data Solutions Center'
 '[LTA-ITCD] DATA ENGINEER / SCIENTIST'
 'Data Center Critical Facilities Engineer' 'Data Scientist - Payments'
 'Data Engineer – Data Analytics, Global Wholesale Banking (1 year contract)'
 'Desktop Support Engineer (Desktop Support L2 + Data Centre)'
 'Data Analyst Intern, Regional BI & Planning (Summer 2023)'
 'Data Engineer (Taiwan)'
 'Data Analyst Intern - Ops Projects, Business Development (Summer 2023)'
 'Data Engineer (AWS)' 'Data Engineer (Adobe Tag Management)'
 'DevOps Engineer (Data Warehouse, Backend Systems)

In [125]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

94 :
['Data Engineer, ETL (SG)'
 'SAS Data Engineer, Data & Analytics, Technology Consulting'
 'Data Engineer' 'Junior Data Engineer'
 'Associate Data Centre Engineer (fresh graduates are welcome to apply)'
 'Data Centre Engineer, Analyst' 'Machine Learning / Data Engineer'
 'Data Center Engineer'
 'Associate Data Engineer, Data & Analytics - Technology Consulting'
 '[LTA-ITCD] DATA ENGINEER / SCIENTIST'
 'Data Center Critical Facilities Engineer'
 'Data Engineer – Data Analytics, Global Wholesale Banking (1 year contract)'
 'Data Engineer (Taiwan)' 'Data Engineer (AWS)'
 'Data Engineer (Adobe Tag Management)'
 'DevOps Engineer (Data Warehouse, Backend Systems) - 2023 Start'
 'Data Engineer (Remote)' 'Data Center Customer Operations Engineer II'
 'Informatica/Python Data Engineer' 'Data Centre Engineer #FreshGraduate'
 'Cloud Operations Engineer, Associate/Senior, Technology Consulting, Data & Analytics'
 'AWS with ETL Data Engineer' 'AI Data Engineer'
 'Cloudera Data Platform Engineer

In [126]:
dfs[salary_type] = filtered_df

##### 6.25 New Zealand

In [127]:
salary_type = 'New_Zealand'

In [128]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

302 :
['Data Engineer' 'Senior Data Engineer'
 'Data Engineer (Auckland or Wellington)'
 'Senior Data Operations Engineer' 'Data Engineer Chapter Member'
 'Process Control Engineer' 'Junior Systems Engineer'
 'Data Engineer - Entry/Senior level' 'Data Engineer - QuantumBlack'
 'Data Engineer (Flexible, Hybrid or Full Remote)'
 'Engineer Innovation and Development'
 'Senior Data Engineer (Auckland or Wellington)' 'Lead Engineer'
 'Data Science Trainee'
 'Data Engineer/Integration Specialist - Fully remote'
 'Junior/Intermediate Software Engineer'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 'Quality Assurance Engineer - New Zealand/Canada' 'Data Scientist'
 'Senior Analytics Engineer' 'Enterprise Wireless Engineer'
 'Expressions of Interest- Work in Data & Analytics Consultant/Senior Consultant/Manager'
 'SNOC Engineer' 'Senior Data Analyst' 'IT Operations Engineer'
 'Data Scientist - QuantumBlack'
 'Kaipūhanga Pūnaha Mātāmua | Senior Systems Engineer/Systems Engineer

In [129]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

33 :
['Data Engineer' 'Senior Data Engineer'
 'Data Engineer (Auckland or Wellington)'
 'Senior Data Operations Engineer' 'Data Engineer Chapter Member'
 'Data Engineer - Entry/Senior level' 'Data Engineer - QuantumBlack'
 'Data Engineer (Flexible, Hybrid or Full Remote)'
 'Senior Data Engineer (Auckland or Wellington)'
 'Data Engineer/Integration Specialist - Fully remote'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 'Senior Analytics Engineer'
 'Expressions of Interest- Work in Data & Analytics Consultant/Senior Consultant/Manager'
 'Lead / Senior Data Engineer - Christchurch'
 'Expressions of Interest | Senior Data Engineers' 'Senior Cloud Engineer'
 'Data Engineer - Azure Cloud' 'Senior Cloud PHP / JS Software Engineer'
 'Database Performance Consultant - MySQL (Remote)' 'Database Engineer'
 'Team Lead für Data Analytics' 'Azure Cloud Solutions Architect'
 'Data Migration Technical Lead' 'Integration Cloud Engineer'
 'Senior Systems Engineer (Cloud)' 'Cloud Consu

In [130]:
dfs[salary_type] = filtered_df

##### 6.26 Australia

In [131]:
salary_type = 'Australia'

In [132]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

83 :
['Data Centre Cabling & Hardware Support Engineer' 'Data Engineer'
 'Expressions of Interest - Data Engineer, Data & Analytics'
 'Data Engineer (Melbourne)' 'Senior Data Engineer' 'Graduate Engineer'
 'Data Engineer - ANZ Plus' 'Field Engineer - APAC' 'Engineer'
 'NAB Technology Internship Program July 2023'
 'Junior Support Engineer – Remote Australia' 'Mechanical Engineer'
 'Data & Control Systems Engineer' 'Specialist Data engineer'
 'Mining Engineer' 'Informatica Data Engineer'
 'Production Improvement Engineer' 'Data Engineer/Snowflake Consultant'
 'Data/Systems Engineer – Vehicle Monitoring'
 'Associate Engineer Intern - Data Centre Solutions'
 'Data Engineer - All Australia Locations'
 'Data Engineer - Spark and PySpark | Data and Analytics' 'DATA ENGINEER'
 'Senior Software Engineer, Data Safety' 'Data Engineer Roles'
 'Storage Engineer | Data Centre Projects'
 'Data Analytics/ Data Governance/ Data Engineer Expression of Interest'
 'Data Platform Engineer' 'Data Engineer 

In [133]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

42 :
['Data Engineer'
 'Expressions of Interest - Data Engineer, Data & Analytics'
 'Data Engineer (Melbourne)' 'Senior Data Engineer'
 'Data Engineer - ANZ Plus' 'Data & Control Systems Engineer'
 'Specialist Data engineer' 'Informatica Data Engineer'
 'Data Engineer/Snowflake Consultant'
 'Data/Systems Engineer – Vehicle Monitoring'
 'Associate Engineer Intern - Data Centre Solutions'
 'Data Engineer - All Australia Locations'
 'Data Engineer - Spark and PySpark | Data and Analytics' 'DATA ENGINEER'
 'Senior Software Engineer, Data Safety' 'Data Engineer Roles'
 'Storage Engineer | Data Centre Projects'
 'Data Analytics/ Data Governance/ Data Engineer Expression of Interest'
 'Data Platform Engineer' 'Data Engineer - Data Portfolio'
 'Data Engineer - FinOps' 'Data Center Customer Operations Engineer IV'
 'Data Engineer - Quantexa' 'Big Data Engineer'
 'Data Science and Software Engineer Analyst'
 'Data Analyst/Engineer/Scientist' 'Azure Data Engineer'
 'Data Engineer / Data Analyst' 

In [134]:
dfs[salary_type] = filtered_df

##### 6.27 Hong Kong

In [135]:
salary_type = 'Hong_Kong'

In [136]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

468 :
['AI/ML Engineer' 'Data Engineer'
 'Data Scientist / Big Data Engineer (Fresh Graduate and Interim Welcome)'
 'Data Engineer (BigData)' 'Graduate Trainee Engineer Program 2023'
 'Data Engineer (Financial/ E-commerce)'
 'Graduate Machine Learning Engineer 2022-2023' 'IECC Engineer'
 'Remote Data Engineer' 'Data Support Engineer' 'Deep Learning Engineer'
 'Software Engineer – Intern' 'Data Engineer - Mox'
 'Senior Data Engineer - IC' 'Machine Learning Engineer'
 'Junior Frontend Engineer (Remote)' 'Data Engineer (Hong Kong)'
 'Data Centre Engineer'
 'Data Engineer/ Scientist/ Analyst/ Governance (30K-85K)'
 'Software Engineer - App Stores Backend (Remote)'
 'AIML - Siri Language Engineer for Hong Kong Cantonese'
 'Cybersecurity Engineer (English+Cantonese speaking)'
 'Assistant Engineer' 'Data Scientist'
 'IT Technical Engineer / Support Engineer (HKD 23 - 35k / month)'
 'Senior Engineer/Consultant (Video Analytics / IoT / Big Data) - IC'
 'AI Engineer'
 'Regional Data Scientist - 

In [137]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

83 :
['Data Engineer' 'Data Engineer (BigData)'
 'Data Engineer (Financial/ E-commerce)' 'Remote Data Engineer'
 'Data Engineer - Mox' 'Senior Data Engineer - IC'
 'Data Engineer (Hong Kong)' 'Data Centre Engineer'
 'Data Engineer/ Scientist/ Analyst/ Governance (30K-85K)'
 'Senior Engineer/Consultant (Video Analytics / IoT / Big Data) - IC'
 'Data Engineer - Conglomerate' 'Junior Biology Engineer'
 '17229-Senior Data Mining Engineer'
 'Duty Engineer (Sha Tin Data Centre) [Ref no.: 20010058]'
 'Network Engineer (Voice & Data Team)'
 'APAC Global Expansion Engineer, Data Center Design Engineering'
 '(PERM /URGENT) Data Engineer' 'Azure Data Engineer x 1'
 'Data Engineer - Public Cloud Exposure'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 'Senior Engineer - Data Engineering & Analytics'
 'Senior Data Engineer, Data & Analytics - Hong Kong'
 'Hong Kong - Data Engineering Graduate Programme'
 'Data Engineer/50-60K/Real Estate'
 'HVAC Engineer | Data Center | Kwai Chung'

In [138]:
dfs[salary_type] = filtered_df

##### 6.28 Taiwan

In [139]:
salary_type = 'Taiwan'

In [140]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

125 :
['Data Center Facilities Mechanical Engineer (English, Mandarin)'
 'Data Engineer'
 'Data Analytics and Supply Chain Management Engineer - 身心障礙人才招募'
 'Big Data Engineer : Taiwan' 'Data Scientist Engineer'
 'Contract Engineer I' 'Mask Data Preparation Engineer'
 'Data Engineer-身心障礙人才招募'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 'Precision Optics Manufacturing Engineer' 'Clinical Data Engineer II'
 'Technical Quality Engineer' 'MTB IE Engineer (Intern)'
 'INTERN--Taichung Process Engineer' 'Data Engineer 數據工程師 / BI工程師'
 '【採線上面談】Data Engineer 資料工程師' 'Service Engineer'
 'Manufacturing - Business Engineer - Linkou'
 '(2023實習) Machine Learning Engineer (時薪260)(可視訊面試及彈性WFH)'
 'Configuration Engineer (EPG) II - (E2)' 'Engineer III' 'RD Engineer'
 'AI/ML Engineer'
 'Gas Sensor Engineer ( Calibration, Experiment Data Analysis )'
 'Data Engineer (Python)'
 '數據工程師 Data Engineer / 資深數據工程師 Senior Data Engineer'
 'Data Science Engineer (Application Engineer)'
 'Data Engine

In [141]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

36 :
['Data Engineer'
 'Data Analytics and Supply Chain Management Engineer - 身心障礙人才招募'
 'Big Data Engineer : Taiwan' 'Mask Data Preparation Engineer'
 'Data Engineer-身心障礙人才招募'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 'Clinical Data Engineer II' 'Data Engineer 數據工程師 / BI工程師'
 '【採線上面談】Data Engineer 資料工程師'
 'Gas Sensor Engineer ( Calibration, Experiment Data Analysis )'
 'Data Engineer (Python)'
 '數據工程師 Data Engineer / 資深數據工程師 Senior Data Engineer'
 'Data Science Engineer (Application Engineer)'
 'Data Engineer, ETL & Data Pipeline' 'Data Center Network Engineer'
 'senior data engineer'
 'Data Automation Engineer and Data Access Governance'
 '外商 Fintech Data Engineer' 'Data Engineer/Data Architect(內湖瑞光路)'
 '[TW Pipeline] CSCM - Global Operations Center Data Engineer - Linkou/Taichung'
 'Senior Data Engineer (Big Data)' 'Data Engineer|資料工程師'
 'Data Center Platform Application Engineer - Signal Integrity'
 '(台南)【2023研發替代役】資料工程師 Data Engineer' 'BI system engineer'
 'D

In [142]:
dfs[salary_type] = filtered_df

##### 6.29 South Korea

In [143]:
salary_type = 'South_Korea'

In [144]:
show_unique_and_its_len(dfs[salary_type]['Job_title'])

379 :
['APAC Global Expansion Engineer, Data Center Design Engineering'
 '[데브시스터즈] Data Engineer - Analytics/BI' 'Data Engineer'
 'Data Platform Engineer' 'Data/Machine Learning Engineer (신입)'
 '[컬리] 데이터플랫폼팀 데이터 엔지니어 (Data Engineer)'
 '부산본사 개발자 (Data Engineer / Backend Develope)'
 'Customer Engineer, Data Analytics, Google Cloud'
 'Associate Customer Engineer, Data Analytics, Google Cloud, gReach Program for People with Disabilities (장애인 채용)'
 '데이터 엔지니어 (Data Engineer)' 'Rust Engineer (Greenfield project)'
 'S/W Development Engineer - R&D'
 'Machine Learning Engineer Intern - Korea' '[11번가] BI/DW Engineer'
 '각 부문별' '[IT 부문 대규모 경력 채용] 데이터 엔지니어링[매니저, 책임매니저]'
 'Data Platform Kubernetes Engineer' 'Refining Tech Services Engineer'
 'Software Engineer Intern - Korea'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 '[데브시스터즈] Machine Learning Engineer (경력)' 'HPC Engineer'
 'Data Center Engineering Operations Engineer'
 'Regional Safety Engineer - South Korea, Data Center Health

In [145]:
df = dfs[salary_type]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        salary_type
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

37 :
['APAC Global Expansion Engineer, Data Center Design Engineering'
 '[데브시스터즈] Data Engineer - Analytics/BI' 'Data Engineer'
 'Data Platform Engineer' '[컬리] 데이터플랫폼팀 데이터 엔지니어 (Data Engineer)'
 '부산본사 개발자 (Data Engineer / Backend Develope)'
 'Customer Engineer, Data Analytics, Google Cloud'
 'Associate Customer Engineer, Data Analytics, Google Cloud, gReach Program for People with Disabilities (장애인 채용)'
 '데이터 엔지니어 (Data Engineer)' '[11번가] BI/DW Engineer'
 'Data Platform Kubernetes Engineer'
 'Python Engineer - Data Center Hardware Integration (Taipei)'
 'Data Center Engineering Operations Engineer'
 'Regional Safety Engineer - South Korea, Data Center Health and Safety'
 'Robotics Engineer [LiDAR(PointCloud) Perception][책임연구원]'
 'Data Specialist (Scientist&&Engineer)' 'Data Engineer(3-10년)'
 '[ICT] Data Engineering[책임매니저(G3)]' 'Senior Data Engineer'
 'Data Center Critical Facilities Engineer'
 'Customer Engineer, Application Modernization, Google Cloud'
 '[메가존클라우드] Hybrid Cloud Enginee

In [146]:
dfs[salary_type] = filtered_df

Cleanup

In [147]:
del specializations_non_english, data_terms_non_english, invalid_non_eng, filtered_df, salary_type

### 7. Split `Location` column into `Region`, `Country`, `State` and `City`.

#### 7.1 Add columns

In [148]:
remote_language_versions = {
    'Arabic': ["عن بعد", "مكتب منزلي", "العمل عن بعد", "مكتب افتراضي", "خارج الموقع", "العمل من أي مكان", "فريق موزع", "غير معتمد على الموقع", "قوة عمل متنقلة", "مكتب سحابي", "مساحة عمل عبر الإنترنت", "الرحالة الرقمي", "موقع مرن", "مكتب أي مكان", "عمل عن بعد", "مكان عمل افتراضي", "مكتب متنقل", "وظيفة تجوالية", "مكتب بلا حدود", "مكتب فرعي", "ممكّن عن بعد", "العمل عن بعد"],
    'Basque': ["Urruneko", "Etxebizitza", "Telelan", "Birtual", "Kanpoaldeko", "Lan egitea nondik nahi izan", "Banatutako taldea", "Kokapenik gabeko", "Mugikor lan taldea", "Hodeiko bulegoa", "Online lan gunea", "Nabigatzaile digitala", "Kokapen aldaerazpena", "Inon edozein bulego", "Urruneko lan", "Birtual bulego", "Mugikor bulegoa", "Lan ibiltaria", "Mugikor ofizina", "Mugikor gunea", "Mendebalde ofizina", "Urruneko gaitasuna", "Urruneko lan"],
    'Catalan':  ["Remot", "Oficina a casa", "Telecommute", "Oficina virtual", "Fora del lloc", "Treballa des de qualsevol lloc", "Equip distribuït", "Independent de la ubicació", "Força laboral mòbil", "Oficina a la núvol", "Espai de treball en línia", "Nòmada digital", "Ubicació flexible", "Oficina a qualsevol lloc", "Treball a distància", "Espai de treball virtual", "Oficina mòbil", "Feina itinerant", "Oficina sense fronteres", "Oficina satèl·lit", "Habilitat remotament", "Treball des de lluny"],
    'Czech': ["Vzdálený", "Domácí kancelář", "Telekomunikace", "Virtuální kancelář", "Mimo provozovnu", "Práce odkudkoliv", "Rozptýlený tým", "Nezávislost na místě", "Mobilní pracovní síla", "Cloudová kancelář", "Online pracovní prostor", "Digitální nomád", "Flexibilní místo", "Kancelář kdekoli", "Práce na dálku", "Virtuální pracoviště", "Mobilní kancelář", "Pohyblivá práce", "Bezhraniční kancelář", "Satelitní kancelář", "Možnost práce na dálku", "Práce z dálky"],
    'German':  ["Remote", "Home-Office", "Telearbeit", "Virtuelles Büro", "Externer Arbeitsplatz", "Arbeit von überall", "Verteiltes Team", "Ortsunabhängigkeit", "Mobile Belegschaft", "Cloud-Office", "Online-Arbeitsplatz", "Digitaler Nomade", "Flexible Arbeitsplätze", "Arbeitsplatz überall", "Fernarbeit", "Virtueller Arbeitsplatz", "Mobiles Büro", "Roaming-Job", "Grenzenloses Büro", "Satelliten-Büro", "Remote-fähig", "Arbeit von fern", "Dezentrale Arbeit", "Heimarbeit", "Telearbeit"],
    'Danish': ["Fjern", "Hjemmekontor", "Telekommunikation", "Virtuelt kontor", "Uden for lokalerne", "Arbejd fra ethvert sted", "Fordelt hold", "Placering-uafhængig", "Mobil arbejdsstyrke", "Cloud-kontor", "Online arbejdsområde", "Digital nomade", "Fleksibel placering", "Kontor hvor som helst", "Distancearbejde", "Virtuelt arbejdssted", "Mobilkontor", "Roaming-job", "Grænseløst kontor", "Satellitkontor", "Fjern-kompatibel", "Arbejd fra lang afstand", "Eksternt arbejde", "Hjemmearbejde", "Telearbejde"],
    'Spanish':  ["Remoto", "Oficina en casa", "Teletrabajo", "Oficina virtual", "Fuera del sitio", "Trabajar desde cualquier lugar", "Equipo distribuido", "Independencia del lugar", "Fuerza laboral móvil", "Oficina en la nube", "Espacio de trabajo en línea", "Nómada digital", "Ubicación flexible", "Oficina en cualquier lugar", "Trabajo a distancia", "Lugar de trabajo virtual", "Oficina móvil", "Trabajo itinerante", "Oficina sin fronteras", "Oficina satélite", "Habilitado para trabajar de forma remota", "Trabajo desde lejos"],
    'Finnish': ["Etä-", "Kotitoimisto", "Etätyö", "Virtuaalitoimisto", "Poissa toimistolta", "Työskentely mistä tahansa", "Hajautettu tiimi", "Sijaintiriippumaton", "Mobiilityövoima", "Pilvitoimisto", "Verkkotyötila", "Digitaalinen kulkuri", "Joustava sijainti", "Missä tahansa toimisto", "Etätyöskentely", "Virtuaalityöpaikka", "Mobiilitoimisto", "Kiertävä työ", "Rajaton toimisto", "Satelliittoimisto", "Etätyö mahdollistettu", "Työskentely kaukaa"],
    'French': ["À distance", "Télétravail", "Bureau virtuel", "Télétravailler", "Hors site", "Travailler de n'importe où", "Équipe distribuée", "Indépendant de l'emplacement", "Main-d'œuvre mobile", "Bureau de nuage", "Espace de travail en ligne", "Nomade digital", "Emplacement flexible", "Bureau n'importe où", "Travail à distance", "Espace de travail virtuel", "Bureau mobile", "Emploi itinérant", "Bureau sans frontières", "Bureau satellite", "Activé à distance", "Travailler à distance"],
    'Frisian': ["Op ôfstân", "Thús kantoar", "Telekomme", "Firtueel kantoar", "Bûtenshûs", "Wurkje fan hokker plak dan ek mar", "Ferdield team", "Lokaasjefrij", "Mobile wurkforc", "Cloud kantoar", "Online wurkromte", "Digitale nomade", "Fleksibele lokaasje", "Dochs dêr kantoar", "Ofstân wurkje", "Virtuele wurkromte", "Mobile kantoar", "Roambanen", "Grenzelos kantoar", "Satellietkantoar", "Op ôfstân mooglik makke", "Wurkje fan fierren"],
    'Galician': ["Remoto", "Oficina en casa", "Teletraballo", "Oficina virtual", "Fora do lugar de traballo", "Traballar desde calquera lugar", "Equipo distribuído", "Independente da localización", "Forza de traballo móbil", "Oficina en nube", "Espazo de traballo en liña", "Nómada dixital", "Localización flexible", "Oficina en calquera lugar", "Traballo a distancia", "Espazo de traballo virtual", "Oficina móbil", "Traballo itinerante", "Oficina sen fronteiras", "Oficina satélite", "Activado remotamente", "Traballar desde lonxe"],
    'Greek': ["Απομακρυσμένο", "Γραφείο στο σπίτι", "Τηλεργασία", "Εικονικό γραφείο", "Εκτός έδρας", "Εργασία από οπουδήποτε", "Διανεμημένη ομάδα", "Ανεξάρτητος τόπος εργασίας", "Κινητό εργατικό δυναμικό", "Γραφείο στο cloud", "Διαδικτυακός χώρος εργασίας", "Ψηφιακός ταξιδιώτης", "Ευέλικτη τοποθεσία", "Γραφείο από οπουδήποτε", "Εργασία από απόσταση", "Εικονικός χώρος εργασίας", "Κινητό γραφείο", "Επαγγελματίας χωρίς σταθερή τοποθεσία", "Σατελλίτε γραφείο", "Επιτρεπόμενο απομακρυσμένο", "Εργασία από μακριά"],
    'Hebrew': ["מרוחק", "משרד ביתי", "טלקום", "משרד וירטואלי", "מחוץ למשרד", "עבודה מכל מקום", "צוות מבוזר", "ללא תלות מיקום", "כוח עבודה נייד", "משרד ענן", "מרחב עבודה מקוון", "נומד דיגיטלי", "מיקום גמיש", "משרד מכל מקום", "עבודה מרחוק", "משרד וירטואלי", "משרד נייד", "עבודה רומנטית", "משרד לא מתפקד", "משרד סטליט", "אפשרי רחוק", "עבודה מרחוק"],
    'Hungarian': ["Távoli", "Otthoni iroda", "Távmunka", "Virtuális iroda", "Távoli munkavégzés", "Elosztott csapat", "Helyfüggetlen", "Mobil munkaerő", "Felhő alapú iroda", "Online munkaterület", "Digitális nomád", "Rugalmas munkavégzés helye", "Bárholi iroda", "Távolléti munka", "Virtuális munkahely", "Mobil iroda", "Vándormunka", "Határok nélküli iroda", "Táviroda", "Távmunka engedélyezve", "Munka távolról"],
    'Italian': ["Remoto", "Ufficio a casa", "Telelavoro", "Ufficio virtuale", "Fuori sede", "Lavorare ovunque", "Team distribuito", "Indipendenza dalla posizione", "Forza lavoro mobile", "Ufficio in cloud", "Spazio di lavoro online", "Nomade digitale", "Posizione flessibile", "Ufficio ovunque", "Lavoro a distanza", "Posto di lavoro virtuale", "Ufficio mobile", "Lavoro in itineranza", "Ufficio senza confini", "Ufficio satellite", "Abilitato al lavoro remoto", "Lavoro da lontano"],
    'Kurdish': ["Dûrxistin", "Birca malê", "Telekomût", "Birca virtual", "Dîlber", "Karê ji her derê", "Tîma belavkirî", "Hînariya cîhêve", "Hêja kariyê", "Birca cloud", "Cîhê karê online", "Nomadê rûniştinê", "Cîhê karî kirinê kêfxweş", "Birca her derê", "Karê dûrve", "Cîhê karê rûniştinê", "Birca mobîl", "Kariya serderê", "Birca beşdarî nekirî", "Birca navendî", "Dabeşkirina karê dûrxistinê", "Karê ji dûrve"],
    'Dutch': ["Afstandswerk", "Thuiswerkplek", "Telewerken", "Virtueel kantoor", "Buiten de deur", "Werk vanaf elke locatie", "Verspreid team", "Locatie-onafhankelijk", "Mobiele beroepsbevolking", "Cloud-kantoor", "Online werkruimte", "Digitale nomade", "Flexibele locatie", "Kantoor op elke locatie", "Werk op afstand", "Virtuele werkplek", "Mobiel kantoor", "Zwerfbaan", "Grenzeloos kantoor", "Satellietkantoor", "Op afstand mogelijk gemaakt", "Werken vanaf afstand"],
    'Norwegian': ["Fjern", "Hjemmekontor", "Telekommunikasjon", "Virtuelt kontor", "Off-site", "Arbeid fra hvor som helst", "Distribuert team", "Stedsuavhengig", "Mobil arbeidsstyrke", "Skykontor", "Nettbasert arbeidsområde", "Digital nomade", "Fleksibelt sted", "Kontor hvor som helst", "Fjernarbeid", "Virtuelt arbeidsmiljø", "Mobilkontor", "Rundreisejobb", "Grenseløst kontor", "Satellittkontor", "Fjernaktivert", "Arbeid fra avstand"],
    'Polish':  ["Zdalny", "Praca zdalna", "Telepraca", "Wirtualne biuro", "Praca poza siedzibą", "Praca z dowolnego miejsca", "Zespoły rozproszone", "Nieuzależniony od miejsca pracy", "Mobilna siła robocza", "Biuro w chmurze", "Przestrzeń robocza online", "Cyfrowy nomada", "Elastyczne miejsce pracy", "Biuro w dowolnym miejscu", "Praca na odległość", "Wirtualne miejsce pracy", "Mobilne biuro", "Praca mobilna", "Biuro bez granic", "Biuro satelitarne", "Zdalnie zarządzany", "Praca zdalna"],
    'Portuguese': ["Remoto", "Escritório em casa", "Teletrabalho", "Escritório virtual", "Fora do local", "Trabalho em qualquer lugar", "Equipe distribuída", "Independente de localização", "Força de trabalho móvel", "Escritório em nuvem", "Espaço de trabalho online", "Nômade digital", "Localização flexível", "Escritório em qualquer lugar", "Trabalho à distância", "Local de trabalho virtual", "Escritório móvel", "Trabalho itinerante", "Escritório sem fronteiras", "Escritório satélite", "Habilitado para trabalho remoto", "Trabalho a distância"],
    'Romanian': ["La distanță", "Lucru de acasă", "Telecomutare", "Birou virtual", "În afara locului de muncă", "Lucru de oriunde", "Echipa distribuită", "Independență față de locație", "Forță de muncă mobilă", "Birou în cloud", "Spațiu de lucru online", "Nomad digital", "Locație flexibilă", "Birou oriunde", "Lucru la distanță", "Loc de muncă virtual", "Birou mobil", "Muncă itinerantă", "Birou fără granițe", "Birou satelit", "Activat pentru lucru la distanță", "Lucru de departe"],
    'Slovakian': ["Vzdialený", "Domáca kancelária", "Telekomutácia", "Virtuálna kancelária", "Mimo pracoviska", "Práca z akéhokoľvek miesta", "Distribuovaný tím", "Nezávislosť na mieste", "Mobilná pracovná sila", "Cloudová kancelária", "Online pracovný priestor", "Digitálny nomád", "Flexibilné miesto", "Kancelária kdekoľvek", "Práca na diaľku", "Virtuálna pracovná plocha", "Mobilná kancelária", "Roamingová práca", "Bezhraničná kancelária", "Satelitná kancelária", "Vzdialene zapojený", "Práca z diaľky"],
    'Slovenian': ["Oddaljeno", "Domača pisarna", "Telekomutiranje", "Virtualna pisarna", "Oddaljeno delo", "Delo od koderkoli", "Distribuirana ekipa", "Lokacijsko neodvisno", "Mobilna delovna sila", "Oblak pisarna", "Spletni delovni prostor", "Digitalni nomad", "Fleksibilna lokacija", "Pisarna kjerkoli", "Delo na daljavo", "Virtualno delovno okolje", "Mobilna pisarna", "Potujoče delo", "Brezmejna pisarna", "Satelitska pisarna", "Oddaljeno omogočeno", "Delo od daleč"],
    'Swedish': ["Distans", "Hemmakontor", "Telekommunikation", "Virtuellt kontor", "Utomhusarbete", "Arbeta från vilken plats som helst", "Distribuerat team", "Plats oberoende", "Mobil arbetsstyrka", "Molnkontor", "Online arbetsutrymme", "Digital nomad", "Flexibel plats", "Kontor var som helst", "Distansarbete", "Virtuellt arbetsområde", "Mobilt kontor", "Roaming-jobb", "Gränslöst kontor", "Satellitkontor", "Fjärrstyrt", "Arbeta från avlägsna platser"],
    'Turkish': ["Uzaktan", "Ev ofisi", "Uzaktan çalışma", "Sanal ofis", "Ofis dışı", "Herhangi bir yerden çalışma", "Dağıtılmış ekip", "Konum bağımsız", "Mobil işgücü", "Bulut ofis", "Çevrimiçi çalışma alanı", "Dijital gezgin", "Esnek konum", "Her yerde ofis", "Uzaktan çalışma", "Sanal işyeri", "Mobil ofis", "Gezici iş", "Sınır tanımayan ofis", "Uydu ofisi", "Uzaktan çalışmaya uygun", "Uzaktan çalışma"],
    'Japanese': ["在宅勤務", "テレワーク", "バーチャルオフィス", "オフサイト勤務", "どこでも勤務", "分散チーム", "場所に依存しない", "モバイルワークフォース", "クラウドオフィス", "オンラインワークスペース", "デジタルノマド", "柔軟な場所", "どこでもオフィス", "遠隔勤務", "仮想ワークプレイス", "モバイルオフィス", "ローミングジョブ", "境界のないオフィス", "サテライトオフィス", "リモート対応", "遠隔からの勤務"],
    'Korean': ["재택 근무", "원격근무", "가상사무실", "사이트 외 근무", "어디서든 근무", "분산 팀", "위치 독립적", "모바일 근무", "클라우드 사무실", "온라인 작업 공간", "디지털 노마드", "유연한 위치", "어디서든 사무실", "원격 근무", "가상 작업장", "모바일 사무실", "로밍 직업", "경계없는 사무실", "위성 사무실", "원격 작동 가능", "멀리서 근무"],
    'Chinese_TR': ["居家辦公", "遠端辦公", "虛擬辦公室", "外地辦公", "全球任務", "分散式團隊", "不受地理限制", "流動辦公", "雲端辦公室", "線上工作空間", "數位遊牧者", "靈活的工作地點", "無所不在的辦公室", "遠程工作", "虛擬工作場所", "行動辦公室", "漫遊工作", "無邊界辦公室", "衛星辦公室", "遠端啟用", "遠距工作"],
    'Chinese_SP': ["远程工作", "家庭办公室", "远程办公", "虚拟办公室", "外场办公", "随处办公", "分散团队", "无固定办公地", "移动式劳动力", "云办公室", "在线工作空间", "数字游牧者", "灵活的位置", "无处不办公", "远程工作", "虚拟工作场所", "移动办公室", "漫游工作", "无边界的办公室", "卫星办公室", "远程启用", "远程工作"],
}


In [149]:
def apply_locations(df, country, region):

    def replace_underscore(df: pd.DataFrame, columns: list) -> pd.DataFrame:
        for col in columns:
            df[col] = df[col].str.replace('_', ' ')
        return df

    def remove_leading_trailing(df: pd.DataFrame, columns: list) -> pd.DataFrame:
        for col in columns:
            df[col] = df[col].str.strip()
        return df


    def clean_columns_strings(df: pd.DataFrame, columns: list['str']):

        df = replace_underscore(df, columns)

        df = remove_leading_trailing(df, columns)

        return df

    # All those fancy, pansy names for the remote
    remote = [
        "Home office",
        "Telecommute",
        "Virtual office",
        "Off-site",
        "Work from anywhere",
        "Distributed team",
        "Location-independent",
        "Mobile workforce",
        "Cloud office",
        "Online workspace",
        "Digital nomad",
        "Flexible location",
        "Anywhere office",
        "Distance work",
        "Virtual workplace",
        "Mobile office",
        "Roaming job",
        "Borderless office",
        "Satellite office",
        "Remote-enabled",
        "Work from afar"
    ]

    country_languages = countries_languages[country]

    for language in country_languages:

        not_eng_language = remote_language_versions[language]

        remote.extend(not_eng_language)

    country_languages =  [x.lower() for x in country_languages]


    df['Location'] = df['Location'].apply(lambda location: np.nan if location is np.nan or not isinstance(location, str) \
                                          or location.strip() == "" else location)

    df = df.apply(lambda row: "Remote" \
                  if (row['Location'] is not np.nan and isinstance(row['Location'], str) \
                    and row['Location'].lower() in remote) \
                    or \
                    (row['Job_title'] is not np.nan and isinstance(row['Job_title'], str) \
                    and row['Job_title'].lower() in remote) \
                    else row 
                    , axis=1)
    

    df['City'] = df['Location'].apply(lambda location: location.split(',')[0].strip() if location is not np.nan \
                                      and isinstance(location, str) and "," in location else location)
    df['State'] = df['Location'].apply(lambda location: location.split(',')[1].strip() if location is not np.nan \
                                       and isinstance(location, str) and "," in location else np.nan)

    df['Country'] = country
    df['Region'] = region

    df = clean_columns_strings(df, ['Region', 'Country', 'State', 'City'])

    return df

def add_regions():

    for country_name, country_df in dfs.items():
        region = np.nan
        if country_name in ["United_States", "Canada"]:
            region = "North America"
        elif country_name in ["Japan", "Singapore", "Hong_Kong", "Taiwan", "South_Korea"]:
            region = "Asia"
        elif country_name in ["Australia", "New_Zealand"]:
            region = "Oceania"
        elif country_name in ["Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Canada", "Czech_Republic", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Ireland", "Israel", "Italy", "Luxembourg", "Netherlands", "Norway", "Poland", "Portugal", "Romania", "Spain", 'Sweden', "Switzerland", "Turkey", "United_Kingdom"]:
            region = "Europe"
        else:
            raise KeyError(f"\rUnknown region/continent for:\n{country_name}")

        dfs[country_name] = apply_locations(country_df, country_name, region)

add_regions()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Location'] = df['Location'].apply(lambda location: np.nan if location is np.nan or not isinstance(location, str) \


In [150]:
dfs['Austria'].iloc[: , -4:30].head()

Unnamed: 0,City,State,Country,Region
9,Vienna,,Austria,Europe
10,Vienna,,Austria,Europe
11,Vienna,,Austria,Europe
12,Vienna,,Austria,Europe
15,Vienna,,Austria,Europe


In [151]:
dfs['United_States'].iloc[: , -4:30].head()

Unnamed: 0,City,State,Country,Region
0,Long Beach,CA,United States,North America
1,Trenton,NJ,United States,North America
2,Irvine,CA,United States,North America
3,Remote,,United States,North America
4,Washington,DC,United States,North America


In [152]:
dfs['United_States']['State'].unique()

array(['CA', 'NJ', nan, 'DC', 'NY', 'MA', 'MN', 'GA', 'TX', 'VA', 'CO',
       'NC', 'DE', 'MI', 'NV', 'OH', 'FL', 'IL', 'UT', 'WI', 'WA', 'IA',
       'AZ', 'KY', 'RI', 'TN', 'MD', 'OR', 'IN', 'PA', 'MT', 'CT', 'NH',
       'HI', 'MO'], dtype=object)

In [153]:
dfs['South_Korea'].iloc[: , -4:30].head()

Unnamed: 0,City,State,Country,Region
0,,,South Korea,Asia
1,,,South Korea,Asia
2,,,South Korea,Asia
3,,,South Korea,Asia
5,,,South Korea,Asia


In [154]:
dfs['Australia'].iloc[: , -4:30].head()

Unnamed: 0,City,State,Country,Region
1,Melbourne,,Australia,Oceania
2,North Sydney,,Australia,Oceania
3,Sydney,,Australia,Oceania
4,New South Wales,,Australia,Oceania
5,Melbourne,,Australia,Oceania


In [155]:
def show_remote_stats():
    
    result = {}

    for country, df in dfs.items():
        location_col = df['Location']

        location_total = np.nan
        location_remote = np.nan
        location_onside = np.nan
        location_remote_percent = np.nan

        if location_col.dtype == 'object':
            location_total = location_col.count()
            location_remote = int(location_col.str.count('Remote').sum())
            location_onside = int(location_total - location_remote)

            if location_total != 0 and location_remote != 0:
                location_remote_percent = int(round(location_remote/location_total, 2) * 100)
            else:
                location_remote_percent = 0

        result[country] = (location_remote, location_onside, location_remote_percent)

    result

show_remote_stats()

In [156]:
dfs['United_States']['City'].value_counts()

Remote         48
New York       11
Chicago        10
Boston          8
Dallas          8
               ..
Louisville      1
Burbank         1
Chandler        1
Iowa City       1
Saint Louis     1
Name: City, Length: 106, dtype: int64

In [157]:
dfs['United_Kingdom']['City'].value_counts()

London                   39
Remote                   26
Leeds                    10
Bristol                   6
United Kingdom            6
Glasgow                   4
Edinburgh                 4
Halifax                   2
Belfast                   2
Reading                   2
Manchester                1
Birmingham                1
Whitley Bay               1
Scotland                  1
Royal Tunbridge Wells     1
Brentford                 1
Cambridge                 1
Derby                     1
Stoke Poges               1
Bracknell                 1
Milton Keynes             1
Sunbury                   1
Newcastle upon Tyne       1
Southampton               1
Chester                   1
Windsor                   1
Name: City, dtype: int64

In [158]:
del add_regions, remote_language_versions, show_remote_stats

#### 7.2 Remove `Location` column

In [159]:

for df in dfs.values():
    del df['Location']

In [160]:
dfs['Austria'].columns

Index(['Company_name', 'Rating', 'Job_title', 'Description', 'Job_age',
       'Easy_apply', 'Salary', 'Employees', 'Type_of_ownership', 'Sector',
       'Founded', 'Industry', 'Revenue_USD', 'Friend_recommend',
       'CEO_approval', 'Career_opportunities', 'Comp_&_benefits',
       'Culture_&_values', 'Senior_management', 'Work/Life_balance', 'Pros',
       'Cons', 'Benefits_rating', 'Benefits_reviews', 'City', 'State',
       'Country', 'Region'],
      dtype='object')

In [161]:
dfs['United_States'].columns

Index(['Company_name', 'Rating', 'Job_title', 'Description', 'Job_age',
       'Easy_apply', 'Salary', 'Employees', 'Type_of_ownership', 'Sector',
       'Founded', 'Industry', 'Revenue_USD', 'Friend_recommend',
       'CEO_approval', 'Career_opportunities', 'Comp_&_benefits',
       'Culture_&_values', 'Senior_management', 'Work/Life_balance', 'Pros',
       'Cons', 'Benefits_rating', 'Benefits_reviews', 'City', 'State',
       'Country', 'Region'],
      dtype='object')

In [162]:
dfs['United_States'].columns == dfs['Austria'].columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

### 8. Add job title seniority

Remember that some companies, especially FANG, have some unique seniority tiles, like: "Level I', "Level II", etc. We didn't cover those examples, because they are too specific to the particular company. And each level means something different, depending on the company.

Also it's worth to keep things simple and avoid seniority titles inflation, like "Principle Senior Executive Manager" etc.

Of course, in some cases, it is disputable if management positions are more "senior" than senior positions. But it's good enough in our case. Senior we treat here more as "Doer", "Management" more as task coordinator.

In [163]:
seniorities_translations= {
    'Arabic': {
        'Junior' : ["مبتدئ", "مبتدئ", "تدريب", "متدرب", "متدرب", "شابّ حرفي", "مبتدئ", "مبتدئ", "تجريبي"],
        'Mid' : ["وسط", "مساعد", "الإداري"],
        'Senior': ["كبير", "مسن", "زعيم", "مبدأ"], 
        'Management' : ["مدير", "رئيس", "مدير عام", "رئيس تنفيذي", "مشرف", "منسق", "تنفيذي"]
    },
    'Basque': {
        'Junior' : ["Praktikak", "Praktikante", "Lantaldean", "Lantokiari", "Hasiberri", "Hasi", "Proba"],
        'Mid' : ["Erdi", " Kide Laguntzailea"],
        'Senior': ["Zk.", "Seneur", "Lider", "Printzipio"],
        'Management' : ["Kudeatzailea", "Burua", "Zuzendaria", "Buruzagia", "Buruzagi", "Koordinatzailea", "Gobernatzailea", "Ejecutivoa"]
    },
    'Catalan': {
        'Junior' : ["Pràctiques", "Practicant", "Aprenent", "Aprendiz", "Novell", "Principiant", "Probatori"],
        'Mid' : ["Middle", "Associat", "Associada"],
        'Senior': ["Snr.", "Sènior", "Líder", "Principi"],
        'Management' : ["Gerent", "Cap de", "Director", "Cap", "Supervisor", "Coordinador", "Executiu"]
    },
    'Czech': {
        'Junior' : ["Stáž", "Stážista", "Staženík", "Učeň", "Nováček", "Začátečník", "Zkušební doba"],
        'Mid' : ["Střední", "Asociát"],
        'Senior': ["Sr.", "Starší", "Vedoucí", "Princip"],
        'Management' : ["Manažer", "Šéf", "Ředitel", "Šéf", "Dozorce", "Koordinátor", "Výkonný"]
    },
    'German': {
        'Junior' : ["Jr.", "Junior", "Praktikum", "Praktikant", "Auszubildender", "Lehrling", "Neuling", "Anfänger", "Probezeit"],
        'Mid' : ["Mittel", "Assoziierter", "Assoziierte"],
        'Senior': ["Sr.", "Senior", "Leiter", "Prinzip"],
        'Management' : ["Manager", "Leiter", "Direktor", "Chef", "Vorgesetzter", "Koordinator", "Executive"]
    },
    'Danish': {
        'Junior' : ["Praktik", "Praktikant", "Trainee", "Lærling", "Nybegynder", "Begynder", "Prøvetid"],
        'Mid' : ["Mellem", "Associeret"],
        'Senior': ["Sr.", "Senior", "Leder", "Princip"],
        'Management': ["Manager", "Hoved af", "Direktør", "Chef", "Supervisor", "Koordinator", "Executive"]
    },
    'Spanish': {
        'Junior' : ["Prácticas", "Practicante", "Practicante", "Aprendiz", "Novato", "Principiante", "De prueba"],
        'Mid' : ["Medio", "Intermedio", "Asociado", "Asociada"],
        'Senior': ["Sr.", "Senior", "Líder", "Principio"],
        'Management' : ["Gerente", "Jefe de", "Director", "Jefe", "Supervisor", "Coordinador", "Ejecutivo"]
    },
    'Finnish': {
        'Junior' : ["Nuorempi", "Harjoittelu", "Harjoittelija", "Harjoittelija", "Oppipoika", "Uusi", "Aloittelija", "Koeaika"],
        'Mid' : ["Keski-", "Keskivaiheen", "Yhdistynyt"],
        'Senior': ["Sr.", "Seniori", "Johtaja", "Periaate"],
        'Management' : ["Johtaja", "Pää", "Johtaja", "Päällikkö", "Valvoja", "Koordinaattori", "Toteuttava"]
    },
    'French': {
        'Junior' : ["Stage", "Stagiaire", "Stagiaire", "Apprenti", "Novice", "Débutant", "Période d'essai"],
        'Mid' : ["Milieu", "Associé", "Associée"],
        'Senior': ["Sr.", "Senior", "Principe"],
        'Management' : ["Directeur", "Chef de", "Directeur", "Chef", "Superviseur", "Coordinateur", "Exécutif"]
    },
    'Frisian': {
        'Junior' : ["Stazjê", "Stazjê", "Trainee", "Learling", "Nijkommer", "Nijbegjinne", "Probearperiode"],
        'Mid' : ["Mids", "Midden", "Ferbûn"],
        'Senior': ["Snr.", "Sinnior", "Lieder", "Prinsipe"],
        'Management' : ["Manager", "Holwer", "Direkteur", "Chef", "Taufersjoch", "Koördinator", "Executive"]
    },
    'Galician': {
        'Junior' : ["Novo", "Prácticas", "Estudante en prácticas", "Trainee", "Aprendiz", "Novato", "Principiante", "Probatorio"],
        'Mid' : ["Medio", "Intermedio", "Asociado", "Asociada"],
        'Senior': ["Snr.", "Sénior", "Líder", "Principio"],
        'Management' : ["Xestor", "Xefe de", "Director", "Xefe", "Supervisor", "Coordinador", "Executivo"]
    },
    'Greek': {
        'Junior' : ["Νεαρός", "Πρακτική άσκηση", "Πρακτικός", "Εκπαιδευόμενος", "Μαθητευόμενος", "Νέος", "Αρχάριος", "Δοκιμαστική περίοδος"],
        'Mid' : ["Μεσαίος", "Μεσαίας", "Συνδεδεμένος "],
        'Senior': ["Κος.", "Γερός", "Αρχηγός", "Αρχή"],
        'Management' : ["Διευθυντής", "Αρχηγός", "Διευθυντής", "Αρχηγός", "Επόπτης", "Συντονιστής", "Εκτελεστικός"]
    },
    'Hebrew': {
        'Junior' : ["תקופת הכשרה", "סטודנט לתקופת הכשרה", "מתמחה", "חניך", "חדש", "מתחיל", "תקופת ניסיון"],
        'Mid' : ["אמצעי", "אמצעיים", "שותף"],
        'Senior': ["סמסטר", "בכיר", "מנהיג", "עקרון"],
        'Management' : ["מנהל", "ראש", "מנכ״ל", "ראשי", "מפקח", "מתאם", "מנהלי"]
    },
    'Hungarian': {
        'Junior' : ["Jr.", "Junior", "Gyakornok", "Gyakornok", "Tanuló", "Apprentice", "Újonc", "Kezdő", "Próbaidős"],
        'Mid' : ["Közép", "Középső", "Társult"],
        'Senior': ["Sr.", "Idősebb", "Vezető", "Elv"],
        'Management' : ["Menedzser", "Igazgató", "Fő", "Felügyelő", "Koordinátor"]
    },
    'Italian': {
        'Junior' : ["Jr.", "Junior", "Stage", "Tirocinante", "Apprendista", "Apprendista", "Neofita", "Principiante", "In prova"],
        'Mid' : ["Middle", "Associato", "Associata"],
        'Senior': ["Sig.", "Senior", "Capo", "Principio"],
        'Management' : ["Manager", "Capo di", "Direttore", "Capo", "Supervisore", "Coordinatore", "Esecutivo"]
    },
    'Kurdish': {
        'Junior' : ["Stajyerî", "Stajyer", "Dîlmej", "Şagirt", "Nûjen", "Destpêkê", "Probasyon"],
        'Mid' : ["Navend", "Pêdivî", "Pêdivîya"],
        'Senior': ["Snr.", "Sînîor", "Serok", "Maf"],
        'Management' : ["Bazirgani", "Serek", "Dirêjor", "Sereke", "Supervîzekar", "Koordinatêr", "Xebatkari"]
    },
    'Dutch': {
        'Junior' : ["Stage", "Stagiair", "Leerling", "Leerling", "Beginner", "Beginnend", "Proef"],
        'Mid' : ["Midden", "Associé", "Associée"],
        'Senior': ["Sr.", "Senior", "Leidinggevende", "Principe"],
        'Management' : ["Manager", "Hoofd van", "Directeur", "Chef", "Toezichthouder", "Coördinator", "Uitvoerend"]
    },
    'Norwegian': {
        'Junior' : ["Internship", "Intern", "Lærling", "Lærling", "Nybegynner", "Nybegynner", "Prøvetid"],
        'Mid' : ["Midten", "Mellom", "Assosiert"],
        'Senior': ["Sr.", "Senior", "Leder", "Prinsipp"],
        'Management' : ["Leder", "Hode av", "Direktør", "Sjef", "Veileder", "Koordinator", "Utførende"]
    },
    'Polish': {
        'Junior' : ["Staż", "Stażysta", "Praktykant", "Praktykant", "Początkujący", "Nowicjusz", "Okres próbny"],
        'Mid' : ["Średni", "Średniego"],
        'Senior': ["Sr.", "Senior", "Lider"],
        'Management' : ["Manager", "Kierownik", "Dyrektor", "Szef", "Przełożony", "Koordynator", "Wykonawczy"]
    },
    'Portuguese': {
        'Junior' : ["Jr.", "Júnior", "Estágio", "Estagiário", "Estagiário", "Aprendiz", "Novato", "Iniciante", "Probatório"],
        'Mid' : ["Médio", "Média", "Associado", "Associada"],
        'Senior': ["Sr.", "Sênior", "Líder", "Princípio"],
        'Management' : ["Gerente", "Chefe de", "Diretor", "Chefe", "Supervisor", "Coordenador", "Executivo"]
    },
    'Romanian': {
        'Junior' : ["Internship", "Intern", "Stagiar", "Stagiar", "Începător", "Novice", "Perioada de probă"],
        'Mid' : ["Mijlociu", "Asociat", "Asociată"],
        'Senior': ["Sr.", "Senior", "Lider", "Principiu"],
        'Management' : ["Manager", "Șef de", "Director", "Șef", "Supraveghetor", "Coordonator", "Executiv"]
    },
    'Slovakian': {
        'Junior' : ["Stáž", "Stážista", "Učeň", "Učeň", "Nováčik", "Začiatočník", "Skúšobná doba"],
        'Mid' : ["Stred", "Asociat", "Asociátka"],
        'Senior': ["Srk.", "Starší", "Vedúci", "Princíp"],
        'Management' : ["Manažér", "Vedúci", "Riaditeľ", "Šéf", "Dozorca", "Koordinátor", "Výkonný"]
    },
    'Slovenian': {
        'Junior' : ["Staž", "Stažist", "Vajenec", "Vajenec", "Novinec", "Novinec", "Poskusno"],
        'Mid' : ["Srednji", "Povezan", "Povezana"],
        'Senior': ["G. g.", "Starejši", "Vodja", "Načelo"],
        'Management' : ["Vodja", "Vodja", "Direktor", "Šef", "Nadzornik", "Koordinator", "Izvršni"]
    },
    'Swedish': {
        'Junior' : ["Praktik", "Praktikant", "Lärling", "Lärling", "Nykomling", "Nybörjare", "Prövotid"],
        'Mid' : ["Mellan", "Associerad"],
        'Senior': ["Sr.", "Senior", "Ledare", "Princip"],
        'Management' : ["Chef", "Huvud av", "Direktör", "Chef", "Övervakare", "Koordinator", "Utförande"]
    },
    'Turkish': {
        'Junior' : ["Stajyerlik", "Stajyer", "Stajyer", "Çırak", "Acemi", "Yeni başlayan", "Deneme süresi"],
        'Mid' : ["Orta", "Ortak", "Orta düzeyli"],
        'Senior': ["Sn.", "Kıdemli", "Lider", "Prensip"],
        'Management' : ["Yönetici", "Başkanı", "Direktör", "Şef", "Denetçi", "Koordinatör", "Yürütücü"]
    },
    'Japanese': {
        'Junior' : ["ジュニア", "インターンシップ", "インターン", "トレーニー", "見習い", "初心者", "ビギナー", "試用期間"],
        'Mid' : ["ミッドレベル", "アソシエイト", "ミドル"],
        'Senior': ["Sr.", "シニア", "リーダー", "原則"],
        'Management' : ["マネージャー", "ヘッドオブ", "ディレクター", "チーフ", "スーパーバイザー", "コーディネーター", "エグゼクティブ"]
    },
    'Korean': {
        'Junior' : ["주니어", "인턴십", "인턴", "연수생", "견습생", "초보자", "비전문가", "수습기간"],
        'Mid' : ["중급", "어소시에이트"],
        'Senior': ["선배", "시니어", "리더", "원칙"],
        'Management' : ["매니저", "대표", "디렉터", "최고", "감독자", "코디네이터", "집행"]
    },
    'Chinese_TR': {
        'Junior' : ["初級", "實習", "實習生", "見習生", "學徒", "新手", "初學者", "試用期"],
        'Mid' : ["中級", "聯合"],
        'Senior': ["高級", "資深", "領導", "原則"],
        'Management' : ["經理", "負責人", "董事", "主管", "監督者", "協調人", "執行"]
    },
    'Chinese_SP': {
        'Junior' : ["Jr.", "初级", "实习", "实习生", "见习生", "学徒", "新手", "初学者", "试用期"],
        'Mid' : ["中级", "联合"],
        'Senior': ["高级", "高级", "领导", "原则"],
        'Management' : ["经理", "负责人", "董事", "主管", "监督者", "协调人", "执行"]
    }
}

In [164]:
countries_languages

{'Austria': ['German'],
 'Belgium': ['French', 'Dutch', 'German'],
 'Canada': ['French'],
 'Czech_Republic': ['Czech', 'Slovakian', 'Hungarian'],
 'Denmark': ['Danish'],
 'Finland': ['Finnish', 'Swedish'],
 'France': ['French', 'Catalan', 'Italian', 'Basque'],
 'Germany': ['German'],
 'Greece': ['Greek'],
 'Hungary': ['Hungarian', 'Romanian'],
 'Ireland': [],
 'Israel': ['Hebrew', 'Arabic'],
 'Italy': ['Italian', 'German', 'French', 'Catalan', 'Greek', 'Slovenian'],
 'Luxembourg': ['German', 'French'],
 'Netherlands': ['Dutch', 'Frisian'],
 'Norway': ['Norwegian'],
 'Poland': ['Polish'],
 'Portugal': ['Portuguese'],
 'Romania': ['Romanian'],
 'Spain': ['Spanish', 'Basque', 'Catalan', 'Galician'],
 'Sweden': ['Swedish', 'Finnish'],
 'Switzerland': ['German', 'French', 'Italian'],
 'Turkey': ['Turkish', 'Kurdish'],
 'United_States': ['Spanish'],
 'United_Kingdom': [],
 'Japan': ['Japanese'],
 'South_Korea': ['Korean'],
 'Taiwan': ['Chinese_TR'],
 'Singapore': ['Chinese_SP'],
 'New_Zealan

In [165]:
def get_seniority(job_title:str, seniority_variants: list[dict] = []):
    
    def is_substring_in_string(substrings: list[str], string: str):
        """
        Check if any substrings is present in the given string
        """
        pattern = r"\b{}\b"
        for substring in substrings:
            if re.search(pattern.format(substring), string, re.IGNORECASE):
                return True
        return False
    
    seniority_foreign = {
        'Junior' : [],
        'Mid' : [],
        'Senior': [],
        'Management' : []
    }

    if seniority_variants:
        for language in seniority_variants:
            for seniority, value in language.items():
                seniority_foreign[seniority] += value


    seniorities_EN = {
        # "Internship", "Intern", "Trainee", "Apprentice", etc. are basically the same as Junior but usually without pay
        'Junior' : ["Jr.", "Junior", "Internship", "Intern", "Trainee", "Apprentice", "Novice", "Beginner", "Probationary"],
        'Mid' : ["Mid", "Associate", "Regular"],
        'Senior': ["Sr.", "Senior", "Lead", "Principle"],
        'Management' : ["Manager", "Head of", "Director", "Chief", "Supervisor", "Coordinator", "Executive"]
    }

    for seniority in seniorities_EN.keys():

        seniorities_all = seniorities_EN[seniority] + seniority_foreign[seniority]

        if is_substring_in_string(seniorities_all, job_title):
            return seniority
    else:
        return np.nan
    
        
def get_all_countries_seniority(countries_languages, seniorities_langs):

    summary = {}

    for country, country_languages in countries_languages.items():

        df = dfs[country]

        languages = []

        for language in country_languages:

            not_eng_language = seniorities_langs[language]

            languages.append(not_eng_language)


        df['Seniority'] = df['Job_title'].apply(lambda job: get_seniority(job, languages))

        summary[country] = df['Seniority'].value_counts()

    print(summary)

get_all_countries_seniority(countries_languages, seniorities_translations)

del get_all_countries_seniority, get_seniority, seniorities_translations

{'Austria': Senior        28
Junior        12
Management     4
Name: Seniority, dtype: int64, 'Belgium': Management    5
Senior        2
Junior        2
Name: Seniority, dtype: int64, 'Canada': Series([], Name: Seniority, dtype: int64), 'Czech_Republic': Senior    13
Junior     2
Name: Seniority, dtype: int64, 'Denmark': Senior        14
Management     5
Mid            2
Junior         1
Name: Seniority, dtype: int64, 'Finland': Senior    13
Mid        1
Name: Seniority, dtype: int64, 'France': Senior        31
Junior         8
Management     2
Name: Seniority, dtype: int64, 'Germany': Senior        36
Junior         5
Management     1
Mid            1
Name: Seniority, dtype: int64, 'Greece': Senior        9
Junior        5
Management    3
Mid           3
Name: Seniority, dtype: int64, 'Hungary': Senior    26
Junior     3
Mid        1
Name: Seniority, dtype: int64, 'Ireland': Senior        13
Junior         1
Mid            1
Management     1
Name: Seniority, dtype: int64, 'Israel': Se

Add non-standard seniority

### 8. Parse salary

In [166]:
def show_results(selector_target: str, dfs: dict[str, pd.DataFrame], dropna: bool=True):
    results = {}

    for country, df in dfs.items():
        results[country] = df[selector_target].value_counts(dropna=dropna)

    print(results)


In [167]:
dfs['United_States']['Salary'].value_counts(dropna=False)

NaN                                        30
$55.00 - $60.00 Per Hour(Employer est.)     6
$50.00 - $55.00 Per Hour(Employer est.)     5
$110K (Employer est.)                       4
$65.00 - $75.00 Per Hour(Employer est.)     4
                                           ..
$95K - $133K (Glassdoor est.)               1
$60.00 - $75.00 Per Hour(Employer est.)     1
$143K - $178K (Employer est.)               1
$57.63 - $65.00 Per Hour(Employer est.)     1
$120K - $150K (Employer est.)               1
Name: Salary, Length: 171, dtype: int64

#### 8.1 Employer provided salary

In [168]:
for country, df in dfs.items():
    
    df['Salary_employer_provided'] = df['Salary'].apply(lambda salary : True if isinstance(salary, str) and "(Employer est.)" in salary else False)
    dfs[country] = df

show_results('Salary_employer_provided', dfs)

{'Australia': False    48
True     19
Name: Salary_employer_provided, dtype: int64, 'Austria': False    137
True       2
Name: Salary_employer_provided, dtype: int64, 'Belgium': False    58
True      6
Name: Salary_employer_provided, dtype: int64, 'Canada': False    4
Name: Salary_employer_provided, dtype: int64, 'Czech_Republic': False    66
True      2
Name: Salary_employer_provided, dtype: int64, 'Denmark': False    90
True      1
Name: Salary_employer_provided, dtype: int64, 'Finland': False    57
True      1
Name: Salary_employer_provided, dtype: int64, 'France': False    126
True     101
Name: Salary_employer_provided, dtype: int64, 'Germany': False    150
True      14
Name: Salary_employer_provided, dtype: int64, 'Greece': False    56
Name: Salary_employer_provided, dtype: int64, 'Hong_Kong': False    71
True     34
Name: Salary_employer_provided, dtype: int64, 'Hungary': False    86
Name: Salary_employer_provided, dtype: int64, 'Ireland': False    57
True     18
Name: Salary_em

#### 8.2 Salary is hourly

In [169]:
from typing import Union

def is_hourly(salary: Union[str, np.nan]):

    if isinstance(salary, str) and len(salary.strip()) > 0:

        return bool("Per Hour" in salary)
        
    elif np.isnan(salary):

        return np.nan
    
    else:

        raise ValueError("Salary must be a string or numpy.nan")

selector_target = 'Salary_hourly'

for country, df in dfs.items():

    df[selector_target] = df['Salary'].apply(is_hourly)
    dfs[country] = df

show_results(selector_target, dfs, dropna=False)

{'Australia': False    51
NaN      12
True      4
Name: Salary_hourly, dtype: int64, 'Austria': NaN      137
False      2
Name: Salary_hourly, dtype: int64, 'Belgium': NaN      58
False     5
True      1
Name: Salary_hourly, dtype: int64, 'Canada': False    3
NaN      1
Name: Salary_hourly, dtype: int64, 'Czech_Republic': NaN      66
False     1
True      1
Name: Salary_hourly, dtype: int64, 'Denmark': NaN     90
True     1
Name: Salary_hourly, dtype: int64, 'Finland': NaN      57
False     1
Name: Salary_hourly, dtype: int64, 'France': NaN      126
True      71
False     30
Name: Salary_hourly, dtype: int64, 'Germany': NaN      150
False     12
True       2
Name: Salary_hourly, dtype: int64, 'Greece': NaN    56
Name: Salary_hourly, dtype: int64, 'Hong_Kong': NaN      65
False    40
Name: Salary_hourly, dtype: int64, 'Hungary': NaN    86
Name: Salary_hourly, dtype: int64, 'Ireland': False    63
NaN       8
True      4
Name: Salary_hourly, dtype: int64, 'Israel': NaN    252
Name: Salary

#### 8.3 Salary currency 

In [170]:
def get_currency(salary: str):

    if isinstance(salary, str):

        pattern_currency = r"(.+?(?=\d))"

        matched = re.search(pattern_currency, salary)

        currency = matched.group(1).strip().replace(":", "")

        return currency

    else:

        return salary

for country, df in dfs.items():

    df['Salary_currency'] = df['Salary'].apply(get_currency)
    dfs[country] = df
 
show_results('Salary_currency', dfs)

{'Australia': A$    55
Name: Salary_currency, dtype: int64, 'Austria': €    2
Name: Salary_currency, dtype: int64, 'Belgium': €    6
Name: Salary_currency, dtype: int64, 'Canada': CA$    3
Name: Salary_currency, dtype: int64, 'Czech_Republic': CZK    2
Name: Salary_currency, dtype: int64, 'Denmark': DKK    1
Name: Salary_currency, dtype: int64, 'Finland': €    1
Name: Salary_currency, dtype: int64, 'France': €    101
Name: Salary_currency, dtype: int64, 'Germany': €    14
Name: Salary_currency, dtype: int64, 'Greece': Series([], Name: Salary_currency, dtype: int64), 'Hong_Kong': HK$    40
Name: Salary_currency, dtype: int64, 'Hungary': Series([], Name: Salary_currency, dtype: int64), 'Ireland': €    67
Name: Salary_currency, dtype: int64, 'Israel': Series([], Name: Salary_currency, dtype: int64), 'Italy': €    12
Name: Salary_currency, dtype: int64, 'Japan': Series([], Name: Salary_currency, dtype: int64), 'Luxembourg': Series([], Name: Salary_currency, dtype: int64), 'Netherlands': € 

Convert currencies to ISO standard

In [171]:
def convert_to_ISO(salary: str):
    
    if isinstance(salary, str):

        ISO_standard = {
            '€': "EUR",
            '$': "USD",
            'CA$': "CAD",
            'HK$': "HKD",
            'NZ$': "NZD",
            'A$': "AUD",
            '£': "GBP",
        }

        for currency, ISO in ISO_standard.items():
            if salary.strip() == currency:
                return ISO
            
    return salary

In [172]:
for country, df in dfs.items():

    df['Salary_currency'] = df['Salary_currency'].apply(convert_to_ISO)
    dfs[country] = df

show_results('Salary_currency', dfs)

{'Australia': AUD    55
Name: Salary_currency, dtype: int64, 'Austria': EUR    2
Name: Salary_currency, dtype: int64, 'Belgium': EUR    6
Name: Salary_currency, dtype: int64, 'Canada': CAD    3
Name: Salary_currency, dtype: int64, 'Czech_Republic': CZK    2
Name: Salary_currency, dtype: int64, 'Denmark': DKK    1
Name: Salary_currency, dtype: int64, 'Finland': EUR    1
Name: Salary_currency, dtype: int64, 'France': EUR    101
Name: Salary_currency, dtype: int64, 'Germany': EUR    14
Name: Salary_currency, dtype: int64, 'Greece': Series([], Name: Salary_currency, dtype: int64), 'Hong_Kong': HKD    40
Name: Salary_currency, dtype: int64, 'Hungary': Series([], Name: Salary_currency, dtype: int64), 'Ireland': EUR    67
Name: Salary_currency, dtype: int64, 'Israel': Series([], Name: Salary_currency, dtype: int64), 'Italy': EUR    12
Name: Salary_currency, dtype: int64, 'Japan': Series([], Name: Salary_currency, dtype: int64), 'Luxembourg': Series([], Name: Salary_currency, dtype: int64), 'N

In [173]:
currencies = set()

for country, df in dfs.items():

    new_currency = df['Salary_currency'].unique().tolist()

    currencies.update(new_currency)

currencies

{'AUD',
 'CAD',
 'CHF',
 'CZK',
 'DKK',
 'EUR',
 'GBP',
 'HKD',
 'NOK',
 'NZD',
 'PLN',
 'RON',
 'SEK',
 'SGD',
 'USD',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan}

In [174]:
del get_currency, convert_to_ISO, currencies

#### 8.4 Salary min

In [175]:
def change_metric_prefixes_numbers(match_max:str) -> float:

    if "K" in match_max:
        match_max = float(match_max.replace("K", ""))
        match_max *= 1.0e+3

    elif "M" in match_max:
        match_max = float(match_max.replace("M", ""))
        match_max *= 1.0e+6

    elif "G" in match_max:
        match_max = float(match_max.replace("G", ""))
        match_max *= 1.0e+9

    return match_max

In [176]:
def get_salary_min(salary):

    if isinstance(salary, str):

        pattern_salary = r"(\d+(\.\d+)?[KMG]?)"
        match_min: str = re.findall(pattern_salary, salary)[0][0]

        match_min = change_metric_prefixes_numbers(match_min)

        return float(match_min)

    else:

        return salary


In [177]:

for country, df in dfs.items():

    df['Salary_min'] = df['Salary'].apply(get_salary_min)
    dfs[country] = df

show_results('Salary_min', dfs)

{'Australia': 109000.00    6
90000.00     6
100000.00    4
120000.00    4
80000.00     3
112000.00    2
96000.00     2
91000.00     2
99000.00     2
110000.00    2
75000.00     2
92000.00     2
75.00        2
137.50       1
113000.00    1
116000.00    1
101000.00    1
138000.00    1
104000.00    1
200000.00    1
85000.00     1
93.75        1
102000.00    1
95000.00     1
63000.00     1
140000.00    1
126000.00    1
130000.00    1
133000.00    1
Name: Salary_min, dtype: int64, 'Austria': 3000.0    2
Name: Salary_min, dtype: int64, 'Belgium': 45000.00    2
42000.00    1
40000.00    1
68.75       1
55000.00    1
Name: Salary_min, dtype: int64, 'Canada': 65000.0    1
88000.0    1
57000.0    1
Name: Salary_min, dtype: int64, 'Czech_Republic': 8000.0    1
400.0     1
Name: Salary_min, dtype: int64, 'Denmark': 187.5    1
Name: Salary_min, dtype: int64, 'Finland': 4000.0    1
Name: Salary_min, dtype: int64, 'France': 50.00       28
62.50        8
56.25        7
68.75        6
60000.00     5
40

In [178]:
del get_salary_min

#### 8.5 Salary max

In [179]:
def get_salary_max(salary):

    if isinstance(salary, str):

        pattern_salary = r"(\d+(\.\d+)?[KMG]?)"
        match_max: str = re.findall(pattern_salary, salary)[-1][0]

        match_max = change_metric_prefixes_numbers(match_max)

        return float(match_max)

    else:

        return salary

for country, df in dfs.items():

    df['Salary_max'] = df['Salary'].apply(get_salary_max)
    
    dfs[country] = df

show_results('Salary_max', dfs)

{'Australia': 120000.00    5
150000.00    5
128000.00    5
105000.00    3
122000.00    2
100000.00    2
90000.00     2
180000.00    2
177000.00    2
130000.00    2
155000.00    2
137.50       1
111000.00    1
106000.00    1
108000.00    1
139000.00    1
126000.00    1
141000.00    1
200000.00    1
125000.00    1
135000.00    1
75000.00     1
121000.00    1
144000.00    1
106.25       1
76000.00     1
164000.00    1
118000.00    1
90.00        1
165000.00    1
181000.00    1
100.00       1
174000.00    1
158000.00    1
Name: Salary_max, dtype: int64, 'Austria': 3000.0    1
4000.0    1
Name: Salary_max, dtype: int64, 'Belgium': 65000.0    2
70000.0    2
42000.0    1
87.5       1
Name: Salary_max, dtype: int64, 'Canada': 75000.0    1
94000.0    1
71000.0    1
Name: Salary_max, dtype: int64, 'Czech_Republic': 8000.0    1
800.0     1
Name: Salary_max, dtype: int64, 'Denmark': 312.5    1
Name: Salary_max, dtype: int64, 'Finland': 6000.0    1
Name: Salary_max, dtype: int64, 'France': 68.75   

In [180]:
# Cleanup

del get_salary_max, change_metric_prefixes_numbers

#### 8.6 Is salary

In [181]:
selector_target = 'Is_salary'

for country, df in dfs.items():

    df[selector_target] = df['Salary_min'].notnull() | df['Salary_min'].notnull()
    
    dfs[country] = df

show_results(selector_target, dfs)

{'Australia': True     55
False    12
Name: Is_salary, dtype: int64, 'Austria': False    137
True       2
Name: Is_salary, dtype: int64, 'Belgium': False    58
True      6
Name: Is_salary, dtype: int64, 'Canada': True     3
False    1
Name: Is_salary, dtype: int64, 'Czech_Republic': False    66
True      2
Name: Is_salary, dtype: int64, 'Denmark': False    90
True      1
Name: Is_salary, dtype: int64, 'Finland': False    57
True      1
Name: Is_salary, dtype: int64, 'France': False    126
True     101
Name: Is_salary, dtype: int64, 'Germany': False    150
True      14
Name: Is_salary, dtype: int64, 'Greece': False    56
Name: Is_salary, dtype: int64, 'Hong_Kong': False    65
True     40
Name: Is_salary, dtype: int64, 'Hungary': False    86
Name: Is_salary, dtype: int64, 'Ireland': True     67
False     8
Name: Is_salary, dtype: int64, 'Israel': False    252
Name: Is_salary, dtype: int64, 'Italy': False    68
True     12
Name: Is_salary, dtype: int64, 'Japan': False    42
Name: Is_salar

#### 8.7 Convert hourly salaries to yearly

In [182]:
def calculate_yearly_income(hourly_rate):

    hours_per_week = 40
    WEEKS_PER_YEAR = 52
    HOURS_PER_YEAR = WEEKS_PER_YEAR * hours_per_week
    gross_income = hourly_rate * HOURS_PER_YEAR
    return gross_income

Salary Min

In [183]:
for country, df in dfs.items():

    df['Salary_min'] = df.apply(
            lambda row: calculate_yearly_income(row['Salary_min']) if row['Salary_hourly'] == True else row['Salary_min'],
            axis=1
        )
    
    dfs[country] = df

show_results('Salary_min', dfs)

{'Australia': 109000.0    6
90000.0     6
100000.0    4
120000.0    4
80000.0     3
112000.0    2
96000.0     2
91000.0     2
99000.0     2
110000.0    2
75000.0     2
92000.0     2
156000.0    2
286000.0    1
113000.0    1
116000.0    1
101000.0    1
138000.0    1
104000.0    1
200000.0    1
85000.0     1
195000.0    1
102000.0    1
95000.0     1
63000.0     1
140000.0    1
126000.0    1
130000.0    1
133000.0    1
Name: Salary_min, dtype: int64, 'Austria': 3000.0    2
Name: Salary_min, dtype: int64, 'Belgium': 45000.0     2
42000.0     1
40000.0     1
143000.0    1
55000.0     1
Name: Salary_min, dtype: int64, 'Canada': 65000.0    1
88000.0    1
57000.0    1
Name: Salary_min, dtype: int64, 'Czech_Republic': 8000.0      1
832000.0    1
Name: Salary_min, dtype: int64, 'Denmark': 390000.0    1
Name: Salary_min, dtype: int64, 'Finland': 4000.0    1
Name: Salary_min, dtype: int64, 'France': 104000.0    28
130000.0     8
117000.0     7
143000.0     6
60000.0      5
40000.0      4
52000.0  

Salary max

In [184]:
for country, df in dfs.items():

    df['Salary_max'] = df.apply(
            lambda row: calculate_yearly_income(row['Salary_max']) if row['Salary_hourly'] == True else row['Salary_max'],
            axis=1
        )
    
    dfs[country] = df

show_results('Salary_max', dfs)

{'Australia': 120000.0    5
150000.0    5
128000.0    5
105000.0    3
122000.0    2
100000.0    2
90000.0     2
180000.0    2
177000.0    2
130000.0    2
155000.0    2
286000.0    1
111000.0    1
106000.0    1
108000.0    1
139000.0    1
126000.0    1
141000.0    1
200000.0    1
125000.0    1
135000.0    1
75000.0     1
121000.0    1
144000.0    1
221000.0    1
76000.0     1
164000.0    1
118000.0    1
187200.0    1
165000.0    1
181000.0    1
208000.0    1
174000.0    1
158000.0    1
Name: Salary_max, dtype: int64, 'Austria': 3000.0    1
4000.0    1
Name: Salary_max, dtype: int64, 'Belgium': 65000.0     2
70000.0     2
42000.0     1
182000.0    1
Name: Salary_max, dtype: int64, 'Canada': 75000.0    1
94000.0    1
71000.0    1
Name: Salary_max, dtype: int64, 'Czech_Republic': 8000.0       1
1664000.0    1
Name: Salary_max, dtype: int64, 'Denmark': 650000.0    1
Name: Salary_max, dtype: int64, 'Finland': 6000.0    1
Name: Salary_max, dtype: int64, 'France': 143000.0    16
60000.0      9

In [185]:
del calculate_yearly_income

#### 8.8 Convert all salaries to USD

In [186]:
df_dollar_rates = pd.read_csv("data\clean\_Socioeconomic data\dollar_rates_04_14_2023.csv", index_col='Currency')
df_dollar_rates

Unnamed: 0_level_0,Rate
Currency,Unnamed: 1_level_1
AUD,1.474993
BGN,1.768834
BRL,4.920865
CAD,1.331736
CHF,0.888758
CNY,6.851859
CZK,21.109704
DKK,6.738718
EUR,0.904404
GBP,0.799855


In [187]:
salary_type = 'Salary_max'

for country, df in dfs.items():

    df[salary_type] = df.apply(
            lambda row: int(row[salary_type] / df_dollar_rates.loc[row['Salary_currency']]) if isinstance(row['Salary_currency'], str) else row[salary_type],
            axis=1
        )
    
    dfs[country] = df


In [188]:

show_results(salary_type, dfs)

{'Australia': 81356.0     5
101695.0    5
86780.0     5
71186.0     3
82712.0     2
67796.0     2
61017.0     2
122034.0    2
120000.0    2
88135.0     2
105085.0    2
193899.0    1
75254.0     1
71864.0     1
73220.0     1
94237.0     1
85424.0     1
95593.0     1
135593.0    1
84746.0     1
91525.0     1
50847.0     1
82034.0     1
97627.0     1
149831.0    1
51525.0     1
111186.0    1
80000.0     1
126915.0    1
111864.0    1
122712.0    1
141017.0    1
117966.0    1
107119.0    1
Name: Salary_max, dtype: int64, 'Austria': 3317.0    1
4422.0    1
Name: Salary_max, dtype: int64, 'Belgium': 71870.0     2
77399.0     2
46439.0     1
201237.0    1
Name: Salary_max, dtype: int64, 'Canada': 56317.0    1
70584.0    1
53313.0    1
Name: Salary_max, dtype: int64, 'Czech_Republic': 378.0      1
78826.0    1
Name: Salary_max, dtype: int64, 'Denmark': 96457.0    1
Name: Salary_max, dtype: int64, 'Finland': 6634.0    1
Name: Salary_max, dtype: int64, 'France': 158115.0    16
66342.0      9
1724

In [189]:
salary_type = 'Salary_min'

for country, df in dfs.items():

    df[salary_type] = df.apply(
            lambda row: int(row[salary_type] / df_dollar_rates.loc[row['Salary_currency']]) if isinstance(row['Salary_currency'], str) else row[salary_type],
            axis=1
        )
    
    dfs[country] = df

In [190]:
results = {}

for country, df in dfs.items():
    results[country] = df[salary_type].value_counts().sort_index()

results

{'Australia': 42712.0     1
 50847.0     2
 54237.0     3
 57627.0     1
 61017.0     6
 61695.0     2
 62373.0     2
 64407.0     1
 65085.0     2
 67118.0     2
 67796.0     4
 68474.0     1
 69152.0     1
 70508.0     1
 73898.0     6
 74576.0     2
 75932.0     2
 76610.0     1
 78644.0     1
 81356.0     4
 85424.0     1
 88135.0     1
 90169.0     1
 93559.0     1
 94915.0     1
 105763.0    2
 132203.0    1
 135593.0    1
 193899.0    1
 Name: Salary_min, dtype: int64,
 'Austria': 3317.0    2
 Name: Salary_min, dtype: int64,
 'Belgium': 44228.0     1
 46439.0     1
 49756.0     2
 60813.0     1
 158115.0    1
 Name: Salary_min, dtype: int64,
 'Canada': 42801.0    1
 48808.0    1
 66079.0    1
 Name: Salary_min, dtype: int64,
 'Czech_Republic': 378.0      1
 39413.0    1
 Name: Salary_min, dtype: int64,
 'Denmark': 57874.0    1
 Name: Salary_min, dtype: int64,
 'Finland': 4422.0    1
 Name: Salary_min, dtype: int64,
 'France': 773.0        1
 36488.0      1
 37593.0      3
 38699

#### 8.9 Convert monthly salaries to yearly

Job offers outside of the US, particularly in Europe, often provide the salary as a monthly figure rather than yearly. To determine the monthly salary, it should generally not be lower than the monthly poverty line or higher than the yearly poverty line. Typically, the average salary for the country is used, and sometimes the median is used instead. However, in some cases, the median salary may not be officially available due to political reasons, such as in Poland. In countries with significant income inequality, the average salary can differ greatly from the median.

To address this issue, we use a rule of thumb where the poverty line is half of the median salary in the selected region. We then take the average or median salary and divide it by a factor of 2 to avoid confusion between daily, weekly, and monthly salaries. Any undefined fields are left empty to ensure accurate results.

Download poverty line

In [191]:
file_name = "OECD_poverty_line_monthly.csv"
file_path = f"data\clean\_Socioeconomic data\{file_name}"
df_poverty_line = pd.read_csv(file_path, index_col='LOCATION')
df_poverty_line.head()

Unnamed: 0_level_0,USD
LOCATION,Unnamed: 1_level_1
Australia,2358
Austria,2424
Belgium,2462
Canada,2333
Czech_Republic,1321


In [192]:
from typing import Literal

def monthly_to_yearly_salary(
        row: pd.Series, 
        salary_type: Literal['Salary_min', 'Salary_max'],
        df_poverty_line: pd.DataFrame,
        country: str,
    ) -> float:

    salary_value = row[salary_type]

    if not np.isnan(salary_value):

        monthly_poverty_line = df_poverty_line.loc[country]
        yearly_poverty_line = monthly_poverty_line * 12


        if (salary_value <= 0 or salary_value < monthly_poverty_line).bool():
            return np.nan
        
        elif (salary_value >= monthly_poverty_line).bool() and (salary_value < yearly_poverty_line).bool():
            return salary_value * 12

    return salary_value


Salary min

In [193]:
selector_target = 'Salary_min'

for country, df in dfs.items():

    df[selector_target] = df.apply(
            lambda row: monthly_to_yearly_salary(row, selector_target, df_poverty_line, country),
            axis=1
        )
    
    dfs[country] = df

In [194]:
def show_results_salary(selector_target: str, dfs:dict[pd.DataFrame]):

    results = {}

    for country, df in dfs.items():
        results[country] = df[selector_target].value_counts().sort_index(ascending=False)

    print(results)

show_results_salary(selector_target, dfs)

{'Australia': 193899.0    1
135593.0    1
132203.0    1
105763.0    2
94915.0     1
93559.0     1
90169.0     1
88135.0     1
85424.0     1
81356.0     4
78644.0     1
76610.0     1
75932.0     2
74576.0     2
73898.0     6
70508.0     1
69152.0     1
68474.0     1
67796.0     4
67118.0     2
65085.0     2
64407.0     1
62373.0     2
61695.0     2
61017.0     6
57627.0     1
54237.0     3
50847.0     2
42712.0     1
Name: Salary_min, dtype: int64, 'Austria': 39804.0    2
Name: Salary_min, dtype: int64, 'Belgium': 158115.0    1
60813.0     1
49756.0     2
46439.0     1
44228.0     1
Name: Salary_min, dtype: int64, 'Canada': 66079.0    1
48808.0    1
42801.0    1
Name: Salary_min, dtype: int64, 'Czech_Republic': 39413.0    1
Name: Salary_min, dtype: int64, 'Denmark': 57874.0    1
Name: Salary_min, dtype: int64, 'Finland': 53064.0    1
Name: Salary_min, dtype: int64, 'France': 201237.0     2
186863.0     2
172489.0     2
158115.0     6
149490.0     1
143741.0     8
137991.0     2
129366.0

Salary max

In [195]:
selector_target = 'Salary_max'

for country, df in dfs.items():

    df[selector_target] = df.apply(
            lambda row: monthly_to_yearly_salary(row, selector_target, df_poverty_line, country),
            axis=1
        )
    
    dfs[country] = df

show_results_salary(selector_target, dfs)

{'Australia': 193899.0    1
149831.0    1
141017.0    1
135593.0    1
126915.0    1
122712.0    1
122034.0    2
120000.0    2
117966.0    1
111864.0    1
111186.0    1
107119.0    1
105085.0    2
101695.0    5
97627.0     1
95593.0     1
94237.0     1
91525.0     1
88135.0     2
86780.0     5
85424.0     1
84746.0     1
82712.0     2
82034.0     1
81356.0     5
80000.0     1
75254.0     1
73220.0     1
71864.0     1
71186.0     3
67796.0     2
61017.0     2
51525.0     1
50847.0     1
Name: Salary_max, dtype: int64, 'Austria': 53064.0    1
39804.0    1
Name: Salary_max, dtype: int64, 'Belgium': 201237.0    1
77399.0     2
71870.0     2
46439.0     1
Name: Salary_max, dtype: int64, 'Canada': 70584.0    1
56317.0    1
53313.0    1
Name: Salary_max, dtype: int64, 'Czech_Republic': 78826.0    1
Name: Salary_max, dtype: int64, 'Denmark': 96457.0    1
Name: Salary_max, dtype: int64, 'Finland': 79608.0    1
Name: Salary_max, dtype: int64, 'France': 229985.0     1
227110.0     1
215611.0     2

In [196]:
del show_results_salary

#### 8.10 Salary average

In [197]:
for country, df in dfs.items():

    df['Salary_avg'] = (df['Salary_max']+df['Salary_min'])/2
    dfs[country] = df

show_results('Salary_avg', dfs)

{'Australia': 80339.0     5
91525.5     3
81356.0     3
93559.0     2
61017.0     2
66779.5     2
73220.5     2
86440.5     2
76949.0     1
62711.5     1
78644.0     1
72203.5     1
72203.0     1
78983.0     1
50847.0     1
76271.0     1
107796.5    1
71186.5     1
135593.0    1
141017.0    1
98644.0     1
83050.5     1
63728.5     1
82033.5     1
61016.5     1
47118.5     1
123390.0    1
73898.0     1
116339.0    1
92542.5     1
101695.0    1
86779.5     1
193899.0    1
67796.5     1
103050.5    1
96610.0     1
89830.0     1
97966.0     1
66440.5     1
77966.0     1
67796.0     1
74576.0     1
Name: Salary_avg, dtype: int64, 'Austria': 39804.0    1
46434.0    1
Name: Salary_avg, dtype: int64, 'Belgium': 60813.0     2
46439.0     1
60813.5     1
179676.0    1
69106.0     1
Name: Salary_avg, dtype: int64, 'Canada': 52562.5    1
68331.5    1
48057.0    1
Name: Salary_avg, dtype: int64, 'Czech_Republic': 59119.5    1
Name: Salary_avg, dtype: int64, 'Denmark': 77165.5    1
Name: Salary_avg

In [198]:
for country, df in dfs.items():

    del df['Salary']

### 9. Employees

In [199]:
show_results('Employees', dfs)

{'Australia': 10000+           27
51 to 200         7
1 to 50           7
501 to 1000       5
1001 to 5000      4
201 to 500        3
5001 to 10000     1
Name: Employees, dtype: int64, 'Austria': 10000+           44
1001 to 5000     21
1 to 50          14
51 to 200        12
501 to 1000      11
5001 to 10000     9
201 to 500        1
Name: Employees, dtype: int64, 'Belgium': 1001 to 5000     17
1 to 50          10
201 to 500        8
501 to 1000       6
5001 to 10000     5
51 to 200         5
10000+            3
Name: Employees, dtype: int64, 'Canada': 51 to 200      1
501 to 1000    1
10000+         1
1 to 50        1
Name: Employees, dtype: int64, 'Czech_Republic': 10000+           25
1001 to 5000     11
501 to 1000       7
201 to 500        6
1 to 50           5
5001 to 10000     5
51 to 200         4
Name: Employees, dtype: int64, 'Denmark': 10000+           28
1001 to 5000     13
1 to 50           8
201 to 500        8
501 to 1000       8
51 to 200         7
5001 to 10000     3
Na

### 10. Type of ownership

In [200]:
show_results('Type_of_ownership', dfs)

{'Australia': Company - Public     30
Company - Private    27
Self-employed         1
Government            1
Name: Type_of_ownership, dtype: int64, 'Austria': Company - Private                 65
Company - Public                  41
Subsidiary or Business Segment     3
Government                         2
Self-employed                      2
College / University               1
Nonprofit Organization             1
Name: Type_of_ownership, dtype: int64, 'Belgium': Company - Private                 27
Company - Public                  16
Self-employed                      5
Nonprofit Organization             3
Subsidiary or Business Segment     2
Private Practice / Firm            1
Name: Type_of_ownership, dtype: int64, 'Canada': Company - Private    3
Company - Public     1
Name: Type_of_ownership, dtype: int64, 'Czech_Republic': Company - Private    36
Company - Public     26
Self-employed         1
Name: Type_of_ownership, dtype: int64, 'Denmark': Company - Private                 4

### 11. Sector

In [201]:
show_results('Sector', dfs)

{'Australia': Information Technology        15
Financial Services            12
Human Resources & Staffing     4
Media & Communication          4
Energy, Mining & Utilities     3
Retail & Wholesale             2
Telecommunications             1
Real Estate                    1
Aerospace & Defense            1
Transportation & Logistics     1
Management & Consulting        1
Name: Sector, dtype: int64, 'Austria': Information Technology                         26
Manufacturing                                  20
Human Resources & Staffing                      9
Financial Services                              7
Management & Consulting                         5
Energy, Mining & Utilities                      4
Transportation & Logistics                      3
Construction, Repair & Maintenance Services     3
Education                                       3
Retail & Wholesale                              2
Insurance                                       2
Telecommunications                

### 12. Industry

In [202]:
show_results('Industry', dfs)

{'Australia': Investment & Asset Management              8
Enterprise Software & Network Solutions    4
Information Technology Support Services    4
Internet & Web Services                    3
Banking & Lending                          3
HR Consulting                              3
Advertising & Public Relations             3
Computer Hardware Development              2
Software Development                       2
Energy & Utilities                         2
Cable, Internet & Telephone Providers      1
Real Estate                                1
Aerospace & Defense                        1
Staffing & Subcontracting                  1
Accounting & Tax                           1
Video Game Publishing                      1
Food & Beverage Stores                     1
Taxi & Car Services                        1
Business Consulting                        1
Drug & Health Stores                       1
Mining & Metals                            1
Name: Industry, dtype: int64, 'Austria': 

### 13. Company age

In [203]:
import datetime

year = datetime.date.today().year

for country, df in dfs.items():

    df['Company_age'] = df['Founded'].apply(lambda x: x if np.isnan(x) else int(year - x))
    del df['Founded']

    dfs[country] = df

df['Company_age'] = df['Company_age']

del year

show_results('Company_age', dfs)

{'Australia': 54.0     8
188.0    3
29.0     3
21.0     2
10.0     2
20.0     2
56.0     2
97.0     2
25.0     2
7.0      1
6.0      1
150.0    1
39.0     1
13.0     1
22.0     1
16.0     1
117.0    1
3.0      1
9.0      1
5.0      1
38.0     1
18.0     1
96.0     1
19.0     1
Name: Company_age, dtype: int64, 'Austria': 57.0     5
32.0     5
17.0     4
23.0     4
75.0     4
55.0     4
18.0     4
19.0     3
27.0     3
11.0     2
7.0      2
22.0     2
67.0     2
76.0     2
68.0     2
10.0     2
71.0     2
91.0     2
74.0     2
137.0    2
66.0     2
131.0    1
53.0     1
142.0    1
56.0     1
29.0     1
60.0     1
88.0     1
149.0    1
40.0     1
39.0     1
30.0     1
204.0    1
9.0      1
54.0     1
34.0     1
28.0     1
24.0     1
109.0    1
36.0     1
112.0    1
144.0    1
140.0    1
96.0     1
15.0     1
78.0     1
13.0     1
124.0    1
64.0     1
41.0     1
Name: Company_age, dtype: int64, 'Belgium': 50.0     7
55.0     5
21.0     3
84.0     3
32.0     3
104.0    2
35.0     2
26.0   

### 14. Job age

In [204]:
np.sort(dfs['United_States']['Job_age'].unique())

array(['1 day ago', '10d', '12d', '13d', '14d', '15d', '16d', '17d',
       '19d', '20d', '21d', '22d', '23d', '24h', '26d', '27d', '28d',
       '29d', '2d', '30d', '30d+', '3d', '5d', '6d', '7d', '8d', '9d'],
      dtype=object)

In [205]:
def clean_job_age(job_age):

    if job_age == "24h" or job_age == "1 day ago":
        job_age = "1d"
    elif job_age == "30d+":
        job_age = "31d"

    if isinstance(job_age, str):
        return int(job_age.replace("d", ""))
    else:
        return job_age

for country, df in dfs.items():

    df['Job_age'] = df['Job_age'].apply(clean_job_age)

    dfs[country] = df

del clean_job_age

show_results('Job_age', dfs)


{'Australia': 31    27
1     11
7      4
2      4
24     4
17     3
14     2
15     2
22     2
21     2
13     1
16     1
29     1
28     1
20     1
8      1
Name: Job_age, dtype: int64, 'Austria': 31    92
11     8
6      6
4      5
5      5
14     3
27     2
10     2
20     2
18     2
13     2
26     1
3      1
21     1
8      1
22     1
25     1
24     1
2      1
1      1
17     1
Name: Job_age, dtype: int64, 'Belgium': 31    33
18     8
5      5
12     4
10     4
7      2
19     2
6      2
11     2
25     1
27     1
Name: Job_age, dtype: int64, 'Canada': 31    4
Name: Job_age, dtype: int64, 'Czech_Republic': 31    49
14     2
11     2
7      2
21     2
20     1
8      1
12     1
28     1
13     1
1      1
27     1
22     1
15     1
26     1
4      1
Name: Job_age, dtype: int64, 'Denmark': 31    45
18     7
6      6
8      4
11     4
1      3
27     3
26     3
13     3
19     2
20     2
7      2
12     1
22     1
10     1
15     1
25     1
17     1
21     1
Name: Job_age, dtype: int

### 15. Revenue

In [206]:
show_results('Revenue_USD', dfs)

{'Australia': $5 to $10 billion             10
$10+ billion                   9
$1 to $5 billion               6
$25 to $100 million            2
$500 million to $1 billion     1
Name: Revenue_USD, dtype: int64, 'Austria': $1 to $5 billion              21
$10+ billion                  15
$5 to $25 million             10
$5 to $10 billion             10
$100 to $500 million          10
$25 to $100 million            5
$1 to $5 million               4
Less than $1 million           1
$500 million to $1 billion     1
Name: Revenue_USD, dtype: int64, 'Belgium': $100 to $500 million    16
$1 to $5 billion         8
$10+ billion             3
$25 to $100 million      3
$1 to $5 million         1
$5 to $25 million        1
Name: Revenue_USD, dtype: int64, 'Canada': $100 to $500 million    1
$10+ billion            1
Name: Revenue_USD, dtype: int64, 'Czech_Republic': $10+ billion            11
$100 to $500 million     6
$1 to $5 billion         6
$5 to $10 billion        5
$25 to $100 million 

### 16. Preview columns so far

In [207]:
dfs['Austria'].dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           float64
Pros                         object
Cons                         object
Benefits_rating             float64
Benefits_reviews             object
City                         object
State                       float64
Country                      object
Region                       object
Seniority                    object
Salary_employer_provided    

### 17. Change columns order

In [208]:
def move_column_to_index(df: pd.DataFrame, column_name: str, index: int):

    df.insert(index, column_name, df.pop(column_name))

    return df


def move_columns_to_index(df: pd.DataFrame, column_names: list[str], index: int):

    for col in column_names:
        df.insert(index, col, df.pop(col))
        index += 1

    return df

##### 17.1 move salary values

In [209]:
for country, df in dfs.items():

    dfs[country] = move_columns_to_index(
            df, 
            [
                'Salary_min', 
                'Salary_max', 
                'Salary_avg', 
                'Salary_currency',
                'Salary_employer_provided', 
                'Salary_hourly',
                'Is_salary'
            ], 3
        )

dfs['Austria'].dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                object
Is_salary                      bool
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           float64
Pros                         object
Cons                         object
Benefits_rating             

##### 17.2 Move Seniority

In [210]:
for country, df in dfs.items():


    try:
        df = move_column_to_index(df, 'Seniority', 3)
    except:
        print(f"{country}\n{df.dtypes}")

    dfs[country] = df
    
dfs['Austria'].dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                object
Is_salary                      bool
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           float64
Pros                         object
Cons                        

##### 17.3 Move City, State

In [211]:
for country, df in dfs.items():

    try:
        df = move_columns_to_index(df, ['City', 'State'], 11)
    except:
        print(f"{country}\n{df.dtypes}")

    dfs[country] = df

dfs['Austria'].dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                object
Is_salary                      bool
City                         object
State                       float64
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           

##### 17.4 Move Company age

In [212]:
for country, df in dfs.items():

    try:
        df = move_column_to_index(df, 'Company_age', 19)
    except:
        print(f"{country}\n{df.dtypes}")

    dfs[country] = df

dfs['Austria'].dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                object
Is_salary                      bool
City                         object
State                       float64
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Company_age                 float64
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           

##### 17.5 Move Work/Life_balance 

In [213]:
for country, df in dfs.items():

    try:
        df = move_columns_to_index(df, ['Senior_management', 'Work/Life_balance'], 25)
    except:
        print(f"{country}\n{df.dtypes}")

    dfs[country] = df

dfs['Austria'].dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                object
Is_salary                      bool
City                         object
State                       float64
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Company_age                 float64
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Senior_management           float64
Work/Life_balance           float64
Comp_&_benefits             

## 18. Technology requirements - parsing the job description

In [214]:
def make_is_tech(cloud_names: list[str]):

    def is_tech(job_description: str):
        
        for cloud in cloud_names:
            if re.search((r"\b" + cloud + r"\b"), job_description, re.IGNORECASE) or \
               re.search((r"\b" + cloud + r"\b"), job_description, re.IGNORECASE):
                return True
            
        return False
    
    return is_tech

def add_is_needed_column_to_df(df: pd.DataFrame, column_name: str, tech_names: list[str]):

    df[column_name] = df['Description'].apply(make_is_tech(tech_names))

    return df

def add_tech_to_dfs(dfs, cloud_names, column_name):

    for country, df in dfs.items():
        add_is_needed_column_to_df(df, column_name, cloud_names)

        dfs[country] = df

    show_results(column_name, dfs)
    return dfs

##### 19. Git and code repositories

In [215]:
def check_repo(job_description: str):

    git_platforms = [
        r"Github", 
        r"GitLab", 
        r"Bitbucket", 
        r"SourceForge", 
        r"Launchpad", 
        r"Google Cloud Source Repositories",
        r"AWS CodeCommit",
        r"GitBucket",
        r"Gogs",
        r"Gitea",
        r"Apache Allura",
        r"RhodeCode",
        r"ONEDEV",
        r"Codeberg",
        r"Git" # IMPORTANT, it has to be last!
        ]
    
    for platform in git_platforms:
        if re.search((r"\b" + platform + r"\b"), job_description, re.IGNORECASE):
            return platform
        
    return np.nan

for country, df in dfs.items():
        
    df['Git'] = df['Description'].apply(check_repo)

    dfs[country] = df

del check_repo

show_results('Git', dfs)

{'Australia': Series([], Name: Git, dtype: int64), 'Austria': Git    3
Name: Git, dtype: int64, 'Belgium': GitLab    1
Git       1
Name: Git, dtype: int64, 'Canada': Series([], Name: Git, dtype: int64), 'Czech_Republic': Series([], Name: Git, dtype: int64), 'Denmark': Git    1
Name: Git, dtype: int64, 'Finland': Series([], Name: Git, dtype: int64), 'France': Git       11
GitLab    10
Github     4
Name: Git, dtype: int64, 'Germany': GitLab    2
Name: Git, dtype: int64, 'Greece': Series([], Name: Git, dtype: int64), 'Hong_Kong': GitLab    1
Git       1
Name: Git, dtype: int64, 'Hungary': Series([], Name: Git, dtype: int64), 'Ireland': Series([], Name: Git, dtype: int64), 'Israel': Series([], Name: Git, dtype: int64), 'Italy': Series([], Name: Git, dtype: int64), 'Japan': Series([], Name: Git, dtype: int64), 'Luxembourg': Git    1
Name: Git, dtype: int64, 'Netherlands': Series([], Name: Git, dtype: int64), 'New_Zealand': Series([], Name: Git, dtype: int64), 'Norway': Series([], Name: Git,

#### 20. Cloud Platforms

##### 20.1 AWS


Provides on-demand cloud computing platforms and APIs to individuals, companies, and governments, on a metered, pay-as-you-go basis. Often times, clients will use this in combination with autoscaling.

In [216]:
cloud_names = [
    r"Amazon Web Services", 
    r"AWS",
    ]

column_name = 'AWS'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    59
True      8
Name: AWS, dtype: int64, 'Austria': False    136
True       3
Name: AWS, dtype: int64, 'Belgium': False    60
True      4
Name: AWS, dtype: int64, 'Canada': False    4
Name: AWS, dtype: int64, 'Czech_Republic': False    64
True      4
Name: AWS, dtype: int64, 'Denmark': False    88
True      3
Name: AWS, dtype: int64, 'Finland': False    52
True      6
Name: AWS, dtype: int64, 'France': False    192
True      35
Name: AWS, dtype: int64, 'Germany': False    158
True       6
Name: AWS, dtype: int64, 'Greece': False    55
True      1
Name: AWS, dtype: int64, 'Hong_Kong': False    93
True     12
Name: AWS, dtype: int64, 'Hungary': False    83
True      3
Name: AWS, dtype: int64, 'Ireland': False    67
True      8
Name: AWS, dtype: int64, 'Israel': False    198
True      54
Name: AWS, dtype: int64, 'Italy': False    78
True      2
Name: AWS, dtype: int64, 'Japan': False    42
Name: AWS, dtype: int64, 'Luxembourg': False    39
True      1
Name: AWS, dtyp

##### 20.2 Microsoft Azure

A cloud computing platform operated by Microsoft that provides access, management, and development of applications and services via around the world-distributed data centers.

In [217]:
cloud_names = [
    r"Microsoft Azure", 
    r"Azure",
    ]

column_name = 'Microsoft_Azure'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    60
True      7
Name: Microsoft_Azure, dtype: int64, 'Austria': False    133
True       6
Name: Microsoft_Azure, dtype: int64, 'Belgium': False    54
True     10
Name: Microsoft_Azure, dtype: int64, 'Canada': False    4
Name: Microsoft_Azure, dtype: int64, 'Czech_Republic': False    61
True      7
Name: Microsoft_Azure, dtype: int64, 'Denmark': False    77
True     14
Name: Microsoft_Azure, dtype: int64, 'Finland': False    41
True     17
Name: Microsoft_Azure, dtype: int64, 'France': False    186
True      41
Name: Microsoft_Azure, dtype: int64, 'Germany': False    148
True      16
Name: Microsoft_Azure, dtype: int64, 'Greece': False    52
True      4
Name: Microsoft_Azure, dtype: int64, 'Hong_Kong': False    93
True     12
Name: Microsoft_Azure, dtype: int64, 'Hungary': False    78
True      8
Name: Microsoft_Azure, dtype: int64, 'Ireland': False    72
True      3
Name: Microsoft_Azure, dtype: int64, 'Israel': False    244
True       8
Name: Microsoft_Azure, dty

##### 20.3 GCP

A suite of cloud computing services that runs on the same infrastructure that Google uses internally for its end-user products, such as Google Search, Gmail, Google Drive, and YouTube.

In [218]:
cloud_names = [
    r"Google Cloud Platform", 
    r"GCP",
    ]

column_name = 'GPC'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    62
True      5
Name: GPC, dtype: int64, 'Austria': False    138
True       1
Name: GPC, dtype: int64, 'Belgium': False    63
True      1
Name: GPC, dtype: int64, 'Canada': False    4
Name: GPC, dtype: int64, 'Czech_Republic': False    66
True      2
Name: GPC, dtype: int64, 'Denmark': False    91
Name: GPC, dtype: int64, 'Finland': False    53
True      5
Name: GPC, dtype: int64, 'France': False    192
True      35
Name: GPC, dtype: int64, 'Germany': False    164
Name: GPC, dtype: int64, 'Greece': False    55
True      1
Name: GPC, dtype: int64, 'Hong_Kong': False    101
True       4
Name: GPC, dtype: int64, 'Hungary': False    86
Name: GPC, dtype: int64, 'Ireland': False    70
True      5
Name: GPC, dtype: int64, 'Israel': False    243
True       9
Name: GPC, dtype: int64, 'Italy': False    79
True      1
Name: GPC, dtype: int64, 'Japan': False    39
True      3
Name: GPC, dtype: int64, 'Luxembourg': False    39
True      1
Name: GPC, dtype: int64, 'Netherlands'

##### 20.4 Alibaba Cloud

Alibaba Cloud provides cloud computing services to online businesses and Alibaba's own e-commerce ecosystem.

In [219]:
cloud_names = [
    r"Alibaba Cloud", 
    r"Aliyun",
    ]

column_name = 'Alibaba_Cloud'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    67
Name: Alibaba_Cloud, dtype: int64, 'Austria': False    139
Name: Alibaba_Cloud, dtype: int64, 'Belgium': False    64
Name: Alibaba_Cloud, dtype: int64, 'Canada': False    4
Name: Alibaba_Cloud, dtype: int64, 'Czech_Republic': False    68
Name: Alibaba_Cloud, dtype: int64, 'Denmark': False    91
Name: Alibaba_Cloud, dtype: int64, 'Finland': False    58
Name: Alibaba_Cloud, dtype: int64, 'France': False    227
Name: Alibaba_Cloud, dtype: int64, 'Germany': False    164
Name: Alibaba_Cloud, dtype: int64, 'Greece': False    56
Name: Alibaba_Cloud, dtype: int64, 'Hong_Kong': False    105
Name: Alibaba_Cloud, dtype: int64, 'Hungary': False    86
Name: Alibaba_Cloud, dtype: int64, 'Ireland': False    75
Name: Alibaba_Cloud, dtype: int64, 'Israel': False    252
Name: Alibaba_Cloud, dtype: int64, 'Italy': False    80
Name: Alibaba_Cloud, dtype: int64, 'Japan': False    42
Name: Alibaba_Cloud, dtype: int64, 'Luxembourg': False    40
Name: Alibaba_Cloud, dtype: int64, 'Ne

##### 20.4 Oracle Cloud

Providing servers, storage, network, applications and services through a global network of Oracle Corporation managed data centers

In [220]:
cloud_names = [
    r"Oracle Cloud", 
    r"OCI",
    ]

column_name = 'Oracle_Cloud'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    67
Name: Oracle_Cloud, dtype: int64, 'Austria': False    139
Name: Oracle_Cloud, dtype: int64, 'Belgium': False    64
Name: Oracle_Cloud, dtype: int64, 'Canada': False    4
Name: Oracle_Cloud, dtype: int64, 'Czech_Republic': False    68
Name: Oracle_Cloud, dtype: int64, 'Denmark': False    91
Name: Oracle_Cloud, dtype: int64, 'Finland': False    58
Name: Oracle_Cloud, dtype: int64, 'France': False    227
Name: Oracle_Cloud, dtype: int64, 'Germany': False    164
Name: Oracle_Cloud, dtype: int64, 'Greece': False    56
Name: Oracle_Cloud, dtype: int64, 'Hong_Kong': False    105
Name: Oracle_Cloud, dtype: int64, 'Hungary': False    86
Name: Oracle_Cloud, dtype: int64, 'Ireland': False    75
Name: Oracle_Cloud, dtype: int64, 'Israel': False    251
True       1
Name: Oracle_Cloud, dtype: int64, 'Italy': False    80
Name: Oracle_Cloud, dtype: int64, 'Japan': False    42
Name: Oracle_Cloud, dtype: int64, 'Luxembourg': False    40
Name: Oracle_Cloud, dtype: int64, 'Nether

##### 20.5 IBM Cloud

A set of cloud computing services for business

In [221]:
cloud_names = [
    r"IBM Cloud", 
    r"Kyndryl",
    r"Bluemix"
    ]

column_name = 'IBM_cloud'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    67
Name: IBM_cloud, dtype: int64, 'Austria': False    139
Name: IBM_cloud, dtype: int64, 'Belgium': False    64
Name: IBM_cloud, dtype: int64, 'Canada': False    4
Name: IBM_cloud, dtype: int64, 'Czech_Republic': False    67
True      1
Name: IBM_cloud, dtype: int64, 'Denmark': False    91
Name: IBM_cloud, dtype: int64, 'Finland': False    58
Name: IBM_cloud, dtype: int64, 'France': False    225
True       2
Name: IBM_cloud, dtype: int64, 'Germany': False    164
Name: IBM_cloud, dtype: int64, 'Greece': False    56
Name: IBM_cloud, dtype: int64, 'Hong_Kong': False    105
Name: IBM_cloud, dtype: int64, 'Hungary': False    86
Name: IBM_cloud, dtype: int64, 'Ireland': False    75
Name: IBM_cloud, dtype: int64, 'Israel': False    252
Name: IBM_cloud, dtype: int64, 'Italy': False    80
Name: IBM_cloud, dtype: int64, 'Japan': False    42
Name: IBM_cloud, dtype: int64, 'Luxembourg': False    40
Name: IBM_cloud, dtype: int64, 'Netherlands': False    40
Name: IBM_cloud, dt

##### 20.6 Tencent Cloud

Tencent Cloud provides businesses across the globe with stable and secure industry-leading cloud products and services, leveraging technological advancements such as cloud computing, Big Data, AI, IoT and network security.

In [222]:
cloud_names = [
    r"Tencent Cloud",
    ]

column_name = 'Tencent_cloud'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    67
Name: Tencent_cloud, dtype: int64, 'Austria': False    139
Name: Tencent_cloud, dtype: int64, 'Belgium': False    64
Name: Tencent_cloud, dtype: int64, 'Canada': False    4
Name: Tencent_cloud, dtype: int64, 'Czech_Republic': False    68
Name: Tencent_cloud, dtype: int64, 'Denmark': False    91
Name: Tencent_cloud, dtype: int64, 'Finland': False    58
Name: Tencent_cloud, dtype: int64, 'France': False    227
Name: Tencent_cloud, dtype: int64, 'Germany': False    164
Name: Tencent_cloud, dtype: int64, 'Greece': False    56
Name: Tencent_cloud, dtype: int64, 'Hong_Kong': False    105
Name: Tencent_cloud, dtype: int64, 'Hungary': False    86
Name: Tencent_cloud, dtype: int64, 'Ireland': False    75
Name: Tencent_cloud, dtype: int64, 'Israel': False    252
Name: Tencent_cloud, dtype: int64, 'Italy': False    80
Name: Tencent_cloud, dtype: int64, 'Japan': False    42
Name: Tencent_cloud, dtype: int64, 'Luxembourg': False    40
Name: Tencent_cloud, dtype: int64, 'Ne

##### 20.8 OVHcloud

A French cloud computing company which offers VPS, dedicated servers and other web services

In [223]:
cloud_names = [
    r"OVHcloud",
    r"OVH"
    ]

column_name = 'OVHcloud'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    67
Name: OVHcloud, dtype: int64, 'Austria': False    139
Name: OVHcloud, dtype: int64, 'Belgium': False    64
Name: OVHcloud, dtype: int64, 'Canada': False    4
Name: OVHcloud, dtype: int64, 'Czech_Republic': False    68
Name: OVHcloud, dtype: int64, 'Denmark': False    91
Name: OVHcloud, dtype: int64, 'Finland': False    58
Name: OVHcloud, dtype: int64, 'France': False    227
Name: OVHcloud, dtype: int64, 'Germany': False    164
Name: OVHcloud, dtype: int64, 'Greece': False    56
Name: OVHcloud, dtype: int64, 'Hong_Kong': False    105
Name: OVHcloud, dtype: int64, 'Hungary': False    86
Name: OVHcloud, dtype: int64, 'Ireland': False    75
Name: OVHcloud, dtype: int64, 'Israel': False    252
Name: OVHcloud, dtype: int64, 'Italy': False    80
Name: OVHcloud, dtype: int64, 'Japan': False    42
Name: OVHcloud, dtype: int64, 'Luxembourg': False    40
Name: OVHcloud, dtype: int64, 'Netherlands': False    40
Name: OVHcloud, dtype: int64, 'New_Zealand': False    52
Name

##### 20.9 DigitalOcean

A cloud hosting provider that offers cloud computing services and Infrastructure as a Service (IaaS). Known for pricing and scalability

In [224]:
cloud_names = [
    r"DigitalOcean"
    ]

column_name = 'DigitalOcean_cloud'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    67
Name: DigitalOcean_cloud, dtype: int64, 'Austria': False    139
Name: DigitalOcean_cloud, dtype: int64, 'Belgium': False    64
Name: DigitalOcean_cloud, dtype: int64, 'Canada': False    4
Name: DigitalOcean_cloud, dtype: int64, 'Czech_Republic': False    68
Name: DigitalOcean_cloud, dtype: int64, 'Denmark': False    91
Name: DigitalOcean_cloud, dtype: int64, 'Finland': False    58
Name: DigitalOcean_cloud, dtype: int64, 'France': False    227
Name: DigitalOcean_cloud, dtype: int64, 'Germany': False    164
Name: DigitalOcean_cloud, dtype: int64, 'Greece': False    56
Name: DigitalOcean_cloud, dtype: int64, 'Hong_Kong': False    105
Name: DigitalOcean_cloud, dtype: int64, 'Hungary': False    86
Name: DigitalOcean_cloud, dtype: int64, 'Ireland': False    75
Name: DigitalOcean_cloud, dtype: int64, 'Israel': False    252
Name: DigitalOcean_cloud, dtype: int64, 'Italy': False    80
Name: DigitalOcean_cloud, dtype: int64, 'Japan': False    42
Name: DigitalOcean_cloud

##### 20.10 Linode

An American cloud hosting provider that focused on providing Linux-based virtual machines, cloud infrastructure, and managed services.

In [225]:
cloud_names = [
    r"Linode",
    r"Akamai"
    ]

column_name = 'Lincode_cloud'

dfs = add_tech_to_dfs(dfs, cloud_names, column_name)

{'Australia': False    67
Name: Lincode_cloud, dtype: int64, 'Austria': False    139
Name: Lincode_cloud, dtype: int64, 'Belgium': False    64
Name: Lincode_cloud, dtype: int64, 'Canada': False    4
Name: Lincode_cloud, dtype: int64, 'Czech_Republic': False    68
Name: Lincode_cloud, dtype: int64, 'Denmark': False    91
Name: Lincode_cloud, dtype: int64, 'Finland': False    58
Name: Lincode_cloud, dtype: int64, 'France': False    227
Name: Lincode_cloud, dtype: int64, 'Germany': False    164
Name: Lincode_cloud, dtype: int64, 'Greece': False    56
Name: Lincode_cloud, dtype: int64, 'Hong_Kong': False    105
Name: Lincode_cloud, dtype: int64, 'Hungary': False    86
Name: Lincode_cloud, dtype: int64, 'Ireland': False    75
Name: Lincode_cloud, dtype: int64, 'Israel': False    252
Name: Lincode_cloud, dtype: int64, 'Italy': False    80
Name: Lincode_cloud, dtype: int64, 'Japan': False    42
Name: Lincode_cloud, dtype: int64, 'Luxembourg': False    40
Name: Lincode_cloud, dtype: int64, 'Ne

In [226]:
del cloud_names

#### 21. Relational Database Management Systems (RDBMS)

##### 21.1 PostgreSQL
Can be used as a data store for big data solutions.
Postgres, is a free and open-source relational database management system (RDBMS) emphasizing extensibility and SQL compliance. <br>
PostgreSQL features transactions with Atomicity, Consistency, Isolation, Durability (ACID) properties, automatically updatable views, materialized views, triggers, foreign keys, and stored procedures. <br> It is designed to handle a range of workloads, from single machines to data warehouses or Web services with many concurrent users. 

In [227]:
tool_names = [
    r"PostgreSQL",
    r"Postgres"
    ]

column_name = 'PostgreSQL'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: PostgreSQL, dtype: int64, 'Austria': False    136
True       3
Name: PostgreSQL, dtype: int64, 'Belgium': False    63
True      1
Name: PostgreSQL, dtype: int64, 'Canada': False    4
Name: PostgreSQL, dtype: int64, 'Czech_Republic': False    66
True      2
Name: PostgreSQL, dtype: int64, 'Denmark': False    90
True      1
Name: PostgreSQL, dtype: int64, 'Finland': False    57
True      1
Name: PostgreSQL, dtype: int64, 'France': False    215
True      12
Name: PostgreSQL, dtype: int64, 'Germany': False    164
Name: PostgreSQL, dtype: int64, 'Greece': False    55
True      1
Name: PostgreSQL, dtype: int64, 'Hong_Kong': False    104
True       1
Name: PostgreSQL, dtype: int64, 'Hungary': False    85
True      1
Name: PostgreSQL, dtype: int64, 'Ireland': False    74
True      1
Name: PostgreSQL, dtype: int64, 'Israel': False    247
True       5
Name: PostgreSQL, dtype: int64, 'Italy': False    80
Name: PostgreSQL, dtype: int64, 'Japan': False    42
Name: Po

##### 21.2 Microsoft SQL Server


A software product with the primary function of storing and retrieving data as requested by other software applications—which may run either on the same computer or on another computer across a network (including the Internet).

In [228]:
tool_names = [
    r"Microsoft SQL",
    r"SQL Server"
    ]

column_name = 'Microsoft_SQL_Server'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Microsoft_SQL_Server, dtype: int64, 'Austria': False    137
True       2
Name: Microsoft_SQL_Server, dtype: int64, 'Belgium': False    58
True      6
Name: Microsoft_SQL_Server, dtype: int64, 'Canada': False    4
Name: Microsoft_SQL_Server, dtype: int64, 'Czech_Republic': False    68
Name: Microsoft_SQL_Server, dtype: int64, 'Denmark': False    90
True      1
Name: Microsoft_SQL_Server, dtype: int64, 'Finland': False    58
Name: Microsoft_SQL_Server, dtype: int64, 'France': False    221
True       6
Name: Microsoft_SQL_Server, dtype: int64, 'Germany': False    163
True       1
Name: Microsoft_SQL_Server, dtype: int64, 'Greece': False    56
Name: Microsoft_SQL_Server, dtype: int64, 'Hong_Kong': False    104
True       1
Name: Microsoft_SQL_Server, dtype: int64, 'Hungary': False    86
Name: Microsoft_SQL_Server, dtype: int64, 'Ireland': False    75
Name: Microsoft_SQL_Server, dtype: int64, 'Israel': False    249
True       3
Name: Microsoft_SQL_Server, dty

##### 21.3 MySQL

An open-source relational database management system.

In [229]:
tool_names = [
    r"MySQL"
    ]

column_name = 'MySQL'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: MySQL, dtype: int64, 'Austria': False    137
True       2
Name: MySQL, dtype: int64, 'Belgium': False    61
True      3
Name: MySQL, dtype: int64, 'Canada': False    4
Name: MySQL, dtype: int64, 'Czech_Republic': False    66
True      2
Name: MySQL, dtype: int64, 'Denmark': False    91
Name: MySQL, dtype: int64, 'Finland': False    57
True      1
Name: MySQL, dtype: int64, 'France': False    222
True       5
Name: MySQL, dtype: int64, 'Germany': False    164
Name: MySQL, dtype: int64, 'Greece': False    55
True      1
Name: MySQL, dtype: int64, 'Hong_Kong': False    102
True       3
Name: MySQL, dtype: int64, 'Hungary': False    85
True      1
Name: MySQL, dtype: int64, 'Ireland': False    75
Name: MySQL, dtype: int64, 'Israel': False    247
True       5
Name: MySQL, dtype: int64, 'Italy': False    78
True      2
Name: MySQL, dtype: int64, 'Japan': False    39
True      3
Name: MySQL, dtype: int64, 'Luxembourg': False    40
Name: MySQL, dtype: int64, 'Ne

##### 21.4 IBM Db2 warehouse

A family of data management products, including database servers, developed by IBM. It initially supported the relational model, but was extended to support object–relational features and non-relational structures like JSON and XML.

In [230]:
tool_names = [
    r"Db2",
    r"IBMDb2"
    ]

column_name = 'IBM_Db2'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: IBM_Db2, dtype: int64, 'Austria': False    139
Name: IBM_Db2, dtype: int64, 'Belgium': False    64
Name: IBM_Db2, dtype: int64, 'Canada': False    4
Name: IBM_Db2, dtype: int64, 'Czech_Republic': False    68
Name: IBM_Db2, dtype: int64, 'Denmark': False    91
Name: IBM_Db2, dtype: int64, 'Finland': False    58
Name: IBM_Db2, dtype: int64, 'France': False    227
Name: IBM_Db2, dtype: int64, 'Germany': False    164
Name: IBM_Db2, dtype: int64, 'Greece': False    56
Name: IBM_Db2, dtype: int64, 'Hong_Kong': False    105
Name: IBM_Db2, dtype: int64, 'Hungary': False    86
Name: IBM_Db2, dtype: int64, 'Ireland': False    75
Name: IBM_Db2, dtype: int64, 'Israel': False    252
Name: IBM_Db2, dtype: int64, 'Italy': False    80
Name: IBM_Db2, dtype: int64, 'Japan': False    42
Name: IBM_Db2, dtype: int64, 'Luxembourg': False    39
True      1
Name: IBM_Db2, dtype: int64, 'Netherlands': False    40
Name: IBM_Db2, dtype: int64, 'New_Zealand': False    52
Name: IBM_

##### 21.5. Oracle PL/SQL

 A procedural language designed specifically to embrace SQL statements within its syntax. PL/SQL program units are compiled by the Oracle Database server and stored inside the database. And at run-time, both PL/SQL and SQL run within the same server process, bringing optimal efficiency

In [231]:
tool_names = [
    r"PL/SQL",
    r"PL / SQL",
    r"Procedural Language for SQL"
    ]

column_name = 'Oracle_PL_SQL'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Oracle_PL_SQL, dtype: int64, 'Austria': False    139
Name: Oracle_PL_SQL, dtype: int64, 'Belgium': False    64
Name: Oracle_PL_SQL, dtype: int64, 'Canada': False    4
Name: Oracle_PL_SQL, dtype: int64, 'Czech_Republic': False    68
Name: Oracle_PL_SQL, dtype: int64, 'Denmark': False    91
Name: Oracle_PL_SQL, dtype: int64, 'Finland': False    58
Name: Oracle_PL_SQL, dtype: int64, 'France': False    226
True       1
Name: Oracle_PL_SQL, dtype: int64, 'Germany': False    163
True       1
Name: Oracle_PL_SQL, dtype: int64, 'Greece': False    56
Name: Oracle_PL_SQL, dtype: int64, 'Hong_Kong': False    105
Name: Oracle_PL_SQL, dtype: int64, 'Hungary': False    86
Name: Oracle_PL_SQL, dtype: int64, 'Ireland': False    75
Name: Oracle_PL_SQL, dtype: int64, 'Israel': False    252
Name: Oracle_PL_SQL, dtype: int64, 'Italy': False    79
True      1
Name: Oracle_PL_SQL, dtype: int64, 'Japan': False    42
Name: Oracle_PL_SQL, dtype: int64, 'Luxembourg': False    38


#### 22. NoSQL Database Management Systems

##### 22.1 MongoDB

A source-available cross-platform document-oriented database program. Classified as a NoSQL database program, MongoDB uses JSON-like documents with optional schemas

In [232]:
tool_names = [
    r"MongoDB",
    r"Mongo DB",
    ]

column_name = 'MongoDB'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: MongoDB, dtype: int64, 'Austria': False    137
True       2
Name: MongoDB, dtype: int64, 'Belgium': False    64
Name: MongoDB, dtype: int64, 'Canada': False    4
Name: MongoDB, dtype: int64, 'Czech_Republic': False    66
True      2
Name: MongoDB, dtype: int64, 'Denmark': False    91
Name: MongoDB, dtype: int64, 'Finland': False    57
True      1
Name: MongoDB, dtype: int64, 'France': False    218
True       9
Name: MongoDB, dtype: int64, 'Germany': False    164
Name: MongoDB, dtype: int64, 'Greece': False    55
True      1
Name: MongoDB, dtype: int64, 'Hong_Kong': False    103
True       2
Name: MongoDB, dtype: int64, 'Hungary': False    86
Name: MongoDB, dtype: int64, 'Ireland': False    75
Name: MongoDB, dtype: int64, 'Israel': False    240
True      12
Name: MongoDB, dtype: int64, 'Italy': False    78
True      2
Name: MongoDB, dtype: int64, 'Japan': False    42
Name: MongoDB, dtype: int64, 'Luxembourg': False    40
Name: MongoDB, dtype: int64, 'Neth

##### 22.2 Cassandra

A free and open-source, distributed, wide-column store, NoSQL database management system designed to handle large amounts of data across many commodity servers, providing high availability with no single point of failure

In [233]:
tool_names = [
    r"Cassandra",
    ]

column_name = 'Cassandra'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Cassandra, dtype: int64, 'Austria': False    138
True       1
Name: Cassandra, dtype: int64, 'Belgium': False    64
Name: Cassandra, dtype: int64, 'Canada': False    4
Name: Cassandra, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Cassandra, dtype: int64, 'Denmark': False    91
Name: Cassandra, dtype: int64, 'Finland': False    58
Name: Cassandra, dtype: int64, 'France': False    222
True       5
Name: Cassandra, dtype: int64, 'Germany': False    164
Name: Cassandra, dtype: int64, 'Greece': False    56
Name: Cassandra, dtype: int64, 'Hong_Kong': False    105
Name: Cassandra, dtype: int64, 'Hungary': False    85
True      1
Name: Cassandra, dtype: int64, 'Ireland': False    75
Name: Cassandra, dtype: int64, 'Israel': False    246
True       6
Name: Cassandra, dtype: int64, 'Italy': False    79
True      1
Name: Cassandra, dtype: int64, 'Japan': False    42
Name: Cassandra, dtype: int64, 'Luxembourg': False    40
Name: Cassandra, dtype: int

##### 22.3 Amazon DynamoDB

A proprietary NoSQL database service that supports key–value and document data structures and is offered by Amazon.com as part of the Amazon Web Services portfolio.

In [234]:
tool_names = [
    r"DynamoDB",
    r"Dynamo DB",
    r"SimpleDB"
    ]

column_name = 'Amazon_DynamoDB'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Amazon_DynamoDB, dtype: int64, 'Austria': False    139
Name: Amazon_DynamoDB, dtype: int64, 'Belgium': False    64
Name: Amazon_DynamoDB, dtype: int64, 'Canada': False    4
Name: Amazon_DynamoDB, dtype: int64, 'Czech_Republic': False    68
Name: Amazon_DynamoDB, dtype: int64, 'Denmark': False    91
Name: Amazon_DynamoDB, dtype: int64, 'Finland': False    58
Name: Amazon_DynamoDB, dtype: int64, 'France': False    225
True       2
Name: Amazon_DynamoDB, dtype: int64, 'Germany': False    164
Name: Amazon_DynamoDB, dtype: int64, 'Greece': False    56
Name: Amazon_DynamoDB, dtype: int64, 'Hong_Kong': False    105
Name: Amazon_DynamoDB, dtype: int64, 'Hungary': False    86
Name: Amazon_DynamoDB, dtype: int64, 'Ireland': False    73
True      2
Name: Amazon_DynamoDB, dtype: int64, 'Israel': False    250
True       2
Name: Amazon_DynamoDB, dtype: int64, 'Italy': False    80
Name: Amazon_DynamoDB, dtype: int64, 'Japan': False    41
True      1
Name: Amazon_Dynamo

#### 22.4 Neo4j

A graph database management system developed by Neo4j, Inc. Described by its developers as an ACID-compliant transactional database with native graph storage and processing

In [235]:
tool_names = [
    r"Neo4j"
    ]

column_name = 'Neo4j'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Neo4j, dtype: int64, 'Austria': False    139
Name: Neo4j, dtype: int64, 'Belgium': False    64
Name: Neo4j, dtype: int64, 'Canada': False    4
Name: Neo4j, dtype: int64, 'Czech_Republic': False    68
Name: Neo4j, dtype: int64, 'Denmark': False    91
Name: Neo4j, dtype: int64, 'Finland': False    58
Name: Neo4j, dtype: int64, 'France': False    227
Name: Neo4j, dtype: int64, 'Germany': False    164
Name: Neo4j, dtype: int64, 'Greece': False    56
Name: Neo4j, dtype: int64, 'Hong_Kong': False    105
Name: Neo4j, dtype: int64, 'Hungary': False    86
Name: Neo4j, dtype: int64, 'Ireland': False    75
Name: Neo4j, dtype: int64, 'Israel': False    250
True       2
Name: Neo4j, dtype: int64, 'Italy': False    80
Name: Neo4j, dtype: int64, 'Japan': False    42
Name: Neo4j, dtype: int64, 'Luxembourg': False    40
Name: Neo4j, dtype: int64, 'Netherlands': False    40
Name: Neo4j, dtype: int64, 'New_Zealand': False    52
Name: Neo4j, dtype: int64, 'Norway': False   

##### 22.5 Apache Solr

An open-source enterprise-search platform, written in Java. Its major features include full-text search, hit highlighting, faceted search, real-time indexing, dynamic clustering, database integration, NoSQL features[2] and rich document (e.g., Word, PDF) handling.

In [236]:
tool_names = [
    r"Solr"
    ]

column_name = 'Apache_Solr'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Apache_Solr, dtype: int64, 'Austria': False    138
True       1
Name: Apache_Solr, dtype: int64, 'Belgium': False    64
Name: Apache_Solr, dtype: int64, 'Canada': False    4
Name: Apache_Solr, dtype: int64, 'Czech_Republic': False    68
Name: Apache_Solr, dtype: int64, 'Denmark': False    91
Name: Apache_Solr, dtype: int64, 'Finland': False    58
Name: Apache_Solr, dtype: int64, 'France': False    227
Name: Apache_Solr, dtype: int64, 'Germany': False    164
Name: Apache_Solr, dtype: int64, 'Greece': False    56
Name: Apache_Solr, dtype: int64, 'Hong_Kong': False    105
Name: Apache_Solr, dtype: int64, 'Hungary': False    86
Name: Apache_Solr, dtype: int64, 'Ireland': False    75
Name: Apache_Solr, dtype: int64, 'Israel': False    251
True       1
Name: Apache_Solr, dtype: int64, 'Italy': False    80
Name: Apache_Solr, dtype: int64, 'Japan': False    42
Name: Apache_Solr, dtype: int64, 'Luxembourg': False    40
Name: Apache_Solr, dtype: int64, 'Netherland

#### 22. Data warehousing and Analytics

##### 22.1 Amazon Redshift

A data warehouse product which forms part of the larger cloud-computing platform Amazon Web Services. It is built on top of technology from the massive parallel processing data warehouse company ParAccel, to handle large scale data sets and database migrations.

In [237]:
tool_names = [
    r"Redshift",
    ]

column_name = 'Amazon_Redshift'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: Amazon_Redshift, dtype: int64, 'Austria': False    139
Name: Amazon_Redshift, dtype: int64, 'Belgium': False    64
Name: Amazon_Redshift, dtype: int64, 'Canada': False    4
Name: Amazon_Redshift, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Amazon_Redshift, dtype: int64, 'Denmark': False    91
Name: Amazon_Redshift, dtype: int64, 'Finland': False    58
Name: Amazon_Redshift, dtype: int64, 'France': False    220
True       7
Name: Amazon_Redshift, dtype: int64, 'Germany': False    161
True       3
Name: Amazon_Redshift, dtype: int64, 'Greece': False    56
Name: Amazon_Redshift, dtype: int64, 'Hong_Kong': False    103
True       2
Name: Amazon_Redshift, dtype: int64, 'Hungary': False    86
Name: Amazon_Redshift, dtype: int64, 'Ireland': False    74
True      1
Name: Amazon_Redshift, dtype: int64, 'Israel': False    244
True       8
Name: Amazon_Redshift, dtype: int64, 'Italy': False    79
True      1
Name: Amazon_Redshift, dtyp

##### 22.2 Google BigQuery

A serverless data warehouse that enables scalable analysis over petabytes of data. It is a Platform as a Service that supports querying using ANSI SQL. It also has built-in machine learning capabilities.

In [238]:
tool_names = [
    r"BigQuery",
    ]

column_name = 'Google_BigQuery'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: Google_BigQuery, dtype: int64, 'Austria': False    139
Name: Google_BigQuery, dtype: int64, 'Belgium': False    64
Name: Google_BigQuery, dtype: int64, 'Canada': False    4
Name: Google_BigQuery, dtype: int64, 'Czech_Republic': False    68
Name: Google_BigQuery, dtype: int64, 'Denmark': False    91
Name: Google_BigQuery, dtype: int64, 'Finland': False    58
Name: Google_BigQuery, dtype: int64, 'France': False    213
True      14
Name: Google_BigQuery, dtype: int64, 'Germany': False    163
True       1
Name: Google_BigQuery, dtype: int64, 'Greece': False    56
Name: Google_BigQuery, dtype: int64, 'Hong_Kong': False    105
Name: Google_BigQuery, dtype: int64, 'Hungary': False    86
Name: Google_BigQuery, dtype: int64, 'Ireland': False    73
True      2
Name: Google_BigQuery, dtype: int64, 'Israel': False    245
True       7
Name: Google_BigQuery, dtype: int64, 'Italy': False    80
Name: Google_BigQuery, dtype: int64, 'Japan': False    38
True  

##### 22.3 Snowflake

Snowflake enables data storage, processing, and analytic solutions.

In [239]:
tool_names = [
    r"Snowflake"
    ]

column_name = 'Snowflake'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: Snowflake, dtype: int64, 'Austria': False    139
Name: Snowflake, dtype: int64, 'Belgium': False    64
Name: Snowflake, dtype: int64, 'Canada': False    4
Name: Snowflake, dtype: int64, 'Czech_Republic': False    68
Name: Snowflake, dtype: int64, 'Denmark': False    88
True      3
Name: Snowflake, dtype: int64, 'Finland': False    57
True      1
Name: Snowflake, dtype: int64, 'France': False    215
True      12
Name: Snowflake, dtype: int64, 'Germany': False    164
Name: Snowflake, dtype: int64, 'Greece': False    55
True      1
Name: Snowflake, dtype: int64, 'Hong_Kong': False    105
Name: Snowflake, dtype: int64, 'Hungary': False    85
True      1
Name: Snowflake, dtype: int64, 'Ireland': False    73
True      2
Name: Snowflake, dtype: int64, 'Israel': False    244
True       8
Name: Snowflake, dtype: int64, 'Italy': False    80
Name: Snowflake, dtype: int64, 'Japan': False    41
True      1
Name: Snowflake, dtype: int64, 'Luxembourg': Fals

##### 22.4 Oracle Exadata

Designed to run Oracle Database workloads, such as an OLTP application running simultaneously with Analytics processing. Historically, specialized database computing platforms were designed for a particular workload, such as Data Warehousing, and poor or unusable for other workloads, such as OLTP. 

In [240]:
tool_names = [
    r"Exadata"
    ]

column_name = 'Oracle_Exadata'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Oracle_Exadata, dtype: int64, 'Austria': False    139
Name: Oracle_Exadata, dtype: int64, 'Belgium': False    64
Name: Oracle_Exadata, dtype: int64, 'Canada': False    4
Name: Oracle_Exadata, dtype: int64, 'Czech_Republic': False    68
Name: Oracle_Exadata, dtype: int64, 'Denmark': False    91
Name: Oracle_Exadata, dtype: int64, 'Finland': False    58
Name: Oracle_Exadata, dtype: int64, 'France': False    227
Name: Oracle_Exadata, dtype: int64, 'Germany': False    164
Name: Oracle_Exadata, dtype: int64, 'Greece': False    56
Name: Oracle_Exadata, dtype: int64, 'Hong_Kong': False    105
Name: Oracle_Exadata, dtype: int64, 'Hungary': False    86
Name: Oracle_Exadata, dtype: int64, 'Ireland': False    75
Name: Oracle_Exadata, dtype: int64, 'Israel': False    252
Name: Oracle_Exadata, dtype: int64, 'Italy': False    80
Name: Oracle_Exadata, dtype: int64, 'Japan': False    42
Name: Oracle_Exadata, dtype: int64, 'Luxembourg': False    40
Name: Oracle_Exadata, 

##### 22.5 SAP HANA

A multi-model database that stores data in its memory instead of keeping it on a disk.

In [241]:
tool_names = [
    r"HANA"
    ]

column_name = 'SAP_HANA'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: SAP_HANA, dtype: int64, 'Austria': False    138
True       1
Name: SAP_HANA, dtype: int64, 'Belgium': False    64
Name: SAP_HANA, dtype: int64, 'Canada': False    4
Name: SAP_HANA, dtype: int64, 'Czech_Republic': False    68
Name: SAP_HANA, dtype: int64, 'Denmark': False    90
True      1
Name: SAP_HANA, dtype: int64, 'Finland': False    58
Name: SAP_HANA, dtype: int64, 'France': False    227
Name: SAP_HANA, dtype: int64, 'Germany': False    163
True       1
Name: SAP_HANA, dtype: int64, 'Greece': False    56
Name: SAP_HANA, dtype: int64, 'Hong_Kong': False    105
Name: SAP_HANA, dtype: int64, 'Hungary': False    85
True      1
Name: SAP_HANA, dtype: int64, 'Ireland': False    75
Name: SAP_HANA, dtype: int64, 'Israel': False    252
Name: SAP_HANA, dtype: int64, 'Italy': False    80
Name: SAP_HANA, dtype: int64, 'Japan': False    42
Name: SAP_HANA, dtype: int64, 'Luxembourg': False    40
Name: SAP_HANA, dtype: int64, 'Netherlands': False    40
Name: SAP_H

##### 22.6 Teradata

It is mainly suitable for building large scale data warehousing applications.

In [242]:
tool_names = [
    r"Teradata"
    ]

column_name = 'Teradata'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    65
True      2
Name: Teradata, dtype: int64, 'Austria': False    139
Name: Teradata, dtype: int64, 'Belgium': False    64
Name: Teradata, dtype: int64, 'Canada': False    4
Name: Teradata, dtype: int64, 'Czech_Republic': False    68
Name: Teradata, dtype: int64, 'Denmark': False    91
Name: Teradata, dtype: int64, 'Finland': False    58
Name: Teradata, dtype: int64, 'France': False    220
True       7
Name: Teradata, dtype: int64, 'Germany': False    164
Name: Teradata, dtype: int64, 'Greece': False    56
Name: Teradata, dtype: int64, 'Hong_Kong': False    105
Name: Teradata, dtype: int64, 'Hungary': False    86
Name: Teradata, dtype: int64, 'Ireland': False    75
Name: Teradata, dtype: int64, 'Israel': False    248
True       4
Name: Teradata, dtype: int64, 'Italy': False    80
Name: Teradata, dtype: int64, 'Japan': False    42
Name: Teradata, dtype: int64, 'Luxembourg': False    40
Name: Teradata, dtype: int64, 'Netherlands': False    38
True      2
Name: Terad

#### 23. Data Integration and Processing

##### 23.1 Informatica PowerCenter - Data integration tool


Used extensively for ETL operations, data quality, data masking, data replication, data virtualization, and master data management services.

In [243]:
tool_names = [
    r"PowerCenter",
    r"Power Center",
    ]

column_name = 'Informatica_PowerCenter'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Informatica_PowerCenter, dtype: int64, 'Austria': False    139
Name: Informatica_PowerCenter, dtype: int64, 'Belgium': False    64
Name: Informatica_PowerCenter, dtype: int64, 'Canada': False    4
Name: Informatica_PowerCenter, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Informatica_PowerCenter, dtype: int64, 'Denmark': False    91
Name: Informatica_PowerCenter, dtype: int64, 'Finland': False    58
Name: Informatica_PowerCenter, dtype: int64, 'France': False    224
True       3
Name: Informatica_PowerCenter, dtype: int64, 'Germany': False    164
Name: Informatica_PowerCenter, dtype: int64, 'Greece': False    56
Name: Informatica_PowerCenter, dtype: int64, 'Hong_Kong': False    105
Name: Informatica_PowerCenter, dtype: int64, 'Hungary': False    86
Name: Informatica_PowerCenter, dtype: int64, 'Ireland': False    75
Name: Informatica_PowerCenter, dtype: int64, 'Israel': False    252
Name: Informatica_PowerCenter, dtype: int64, 'Italy': Fa

##### 23.2 DataBricks - Data processing and analytics platform

A unified set of tools for building, deploying, sharing, and maintaining enterprise-grade data solutions at scale. 

In [244]:
tool_names = [
    r"Data Bricks",
    r"Databricks"
    ]

column_name = 'Databricks'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    63
True      4
Name: Databricks, dtype: int64, 'Austria': False    139
Name: Databricks, dtype: int64, 'Belgium': False    61
True      3
Name: Databricks, dtype: int64, 'Canada': False    4
Name: Databricks, dtype: int64, 'Czech_Republic': False    65
True      3
Name: Databricks, dtype: int64, 'Denmark': False    87
True      4
Name: Databricks, dtype: int64, 'Finland': False    54
True      4
Name: Databricks, dtype: int64, 'France': False    208
True      19
Name: Databricks, dtype: int64, 'Germany': False    164
Name: Databricks, dtype: int64, 'Greece': False    54
True      2
Name: Databricks, dtype: int64, 'Hong_Kong': False    102
True       3
Name: Databricks, dtype: int64, 'Hungary': False    85
True      1
Name: Databricks, dtype: int64, 'Ireland': False    73
True      2
Name: Databricks, dtype: int64, 'Israel': False    249
True       3
Name: Databricks, dtype: int64, 'Italy': False    79
True      1
Name: Databricks, dtype: int64, 'Japan': False    

##### 23.3 Presto - Query engine

 A distributed query engine for big data using the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows use of multiple data sources

In [245]:
tool_names = [
    r"Presto",
    r"PrestoDB",
    r"PrestoSQL"
    ]

column_name = 'Presto'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Presto, dtype: int64, 'Austria': False    139
Name: Presto, dtype: int64, 'Belgium': False    64
Name: Presto, dtype: int64, 'Canada': False    4
Name: Presto, dtype: int64, 'Czech_Republic': False    68
Name: Presto, dtype: int64, 'Denmark': False    91
Name: Presto, dtype: int64, 'Finland': False    58
Name: Presto, dtype: int64, 'France': False    227
Name: Presto, dtype: int64, 'Germany': False    164
Name: Presto, dtype: int64, 'Greece': False    56
Name: Presto, dtype: int64, 'Hong_Kong': False    105
Name: Presto, dtype: int64, 'Hungary': False    86
Name: Presto, dtype: int64, 'Ireland': False    75
Name: Presto, dtype: int64, 'Israel': False    247
True       5
Name: Presto, dtype: int64, 'Italy': False    80
Name: Presto, dtype: int64, 'Japan': False    42
Name: Presto, dtype: int64, 'Luxembourg': False    40
Name: Presto, dtype: int64, 'Netherlands': False    40
Name: Presto, dtype: int64, 'New_Zealand': False    52
Name: Presto, dtype: int64,

#### 24. Stream processing tools

##### 24.1 Apache Kafka

An open-source system, distributed event store and stream-processing platform. The project aims to provide a unified, high-throughput, low-latency platform for handling real-time data feeds.

In [246]:
tool_names = [
    r"Kafka",
    ]

column_name = 'Apache_Kafka'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: Apache_Kafka, dtype: int64, 'Austria': False    136
True       3
Name: Apache_Kafka, dtype: int64, 'Belgium': False    64
Name: Apache_Kafka, dtype: int64, 'Canada': False    4
Name: Apache_Kafka, dtype: int64, 'Czech_Republic': False    62
True      6
Name: Apache_Kafka, dtype: int64, 'Denmark': False    90
True      1
Name: Apache_Kafka, dtype: int64, 'Finland': False    56
True      2
Name: Apache_Kafka, dtype: int64, 'France': False    204
True      23
Name: Apache_Kafka, dtype: int64, 'Germany': False    164
Name: Apache_Kafka, dtype: int64, 'Greece': False    55
True      1
Name: Apache_Kafka, dtype: int64, 'Hong_Kong': False    105
Name: Apache_Kafka, dtype: int64, 'Hungary': False    84
True      2
Name: Apache_Kafka, dtype: int64, 'Ireland': False    73
True      2
Name: Apache_Kafka, dtype: int64, 'Israel': False    220
True      32
Name: Apache_Kafka, dtype: int64, 'Italy': False    78
True      2
Name: Apache_Kafka, dtype: int64, 

##### 24.2 Apache Flink

Process data streams at a large scale and to deliver real-time analytical insights about your processed data with your streaming application.

In [247]:
tool_names = [
    r"Flink",
    ]

column_name = 'Apache_Flink'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Apache_Flink, dtype: int64, 'Austria': False    139
Name: Apache_Flink, dtype: int64, 'Belgium': False    64
Name: Apache_Flink, dtype: int64, 'Canada': False    4
Name: Apache_Flink, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Apache_Flink, dtype: int64, 'Denmark': False    91
Name: Apache_Flink, dtype: int64, 'Finland': False    58
Name: Apache_Flink, dtype: int64, 'France': False    226
True       1
Name: Apache_Flink, dtype: int64, 'Germany': False    164
Name: Apache_Flink, dtype: int64, 'Greece': False    56
Name: Apache_Flink, dtype: int64, 'Hong_Kong': False    105
Name: Apache_Flink, dtype: int64, 'Hungary': False    86
Name: Apache_Flink, dtype: int64, 'Ireland': False    75
Name: Apache_Flink, dtype: int64, 'Israel': False    244
True       8
Name: Apache_Flink, dtype: int64, 'Italy': False    80
Name: Apache_Flink, dtype: int64, 'Japan': False    42
Name: Apache_Flink, dtype: int64, 'Luxembourg': False    40
Name: Apache_Fli

##### 24.3 Dataflow


Dataflow is a managed service provided by Google Cloud for building and executing data processing pipelines. It enables developers to create scalable and efficient batch and streaming data pipelines using a simple programming model.

In [248]:
tool_names = [
    r"Dataflow",
    ]

column_name = 'Dataflow'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Dataflow, dtype: int64, 'Austria': False    139
Name: Dataflow, dtype: int64, 'Belgium': False    64
Name: Dataflow, dtype: int64, 'Canada': False    4
Name: Dataflow, dtype: int64, 'Czech_Republic': False    68
Name: Dataflow, dtype: int64, 'Denmark': False    91
Name: Dataflow, dtype: int64, 'Finland': False    58
Name: Dataflow, dtype: int64, 'France': False    225
True       2
Name: Dataflow, dtype: int64, 'Germany': False    164
Name: Dataflow, dtype: int64, 'Greece': False    56
Name: Dataflow, dtype: int64, 'Hong_Kong': False    105
Name: Dataflow, dtype: int64, 'Hungary': False    86
Name: Dataflow, dtype: int64, 'Ireland': False    75
Name: Dataflow, dtype: int64, 'Israel': False    252
Name: Dataflow, dtype: int64, 'Italy': False    80
Name: Dataflow, dtype: int64, 'Japan': False    42
Name: Dataflow, dtype: int64, 'Luxembourg': False    40
Name: Dataflow, dtype: int64, 'Netherlands': False    40
Name: Dataflow, dtype: int64, 'New_Zealand': Fal

#### 25 Workflow orchestration tools

##### 25.1 Apache Airflow

Apache Airflow is an open-source platform used for programmatically creating, scheduling, and monitoring complex workflows or data pipelines. It allows users to define and execute a sequence of tasks or operations, while providing tools for tracking and troubleshooting workflow executions.

In [249]:
tool_names = [
    r"Airflow",
    ]

column_name = 'Apache_Airflow'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Apache_Airflow, dtype: int64, 'Austria': False    136
True       3
Name: Apache_Airflow, dtype: int64, 'Belgium': False    62
True      2
Name: Apache_Airflow, dtype: int64, 'Canada': False    4
Name: Apache_Airflow, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Apache_Airflow, dtype: int64, 'Denmark': False    89
True      2
Name: Apache_Airflow, dtype: int64, 'Finland': False    58
Name: Apache_Airflow, dtype: int64, 'France': False    206
True      21
Name: Apache_Airflow, dtype: int64, 'Germany': False    163
True       1
Name: Apache_Airflow, dtype: int64, 'Greece': False    56
Name: Apache_Airflow, dtype: int64, 'Hong_Kong': False    105
Name: Apache_Airflow, dtype: int64, 'Hungary': False    86
Name: Apache_Airflow, dtype: int64, 'Ireland': False    73
True      2
Name: Apache_Airflow, dtype: int64, 'Israel': False    240
True      12
Name: Apache_Airflow, dtype: int64, 'Italy': False    80
Name: Apache_Airflow, dtype: int64, 'Japa

##### 25.2 Luigi

Luigi is a Python-based open-source workflow management system that helps to build complex pipelines of batch jobs. It provides a flexible and extensible architecture to create and manage complex data workflows.

In [250]:
tool_names = [
    r"Luigi",
    ]

column_name = 'Luigi'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Luigi, dtype: int64, 'Austria': False    139
Name: Luigi, dtype: int64, 'Belgium': False    64
Name: Luigi, dtype: int64, 'Canada': False    4
Name: Luigi, dtype: int64, 'Czech_Republic': False    68
Name: Luigi, dtype: int64, 'Denmark': False    91
Name: Luigi, dtype: int64, 'Finland': False    58
Name: Luigi, dtype: int64, 'France': False    227
Name: Luigi, dtype: int64, 'Germany': False    164
Name: Luigi, dtype: int64, 'Greece': False    56
Name: Luigi, dtype: int64, 'Hong_Kong': False    105
Name: Luigi, dtype: int64, 'Hungary': False    86
Name: Luigi, dtype: int64, 'Ireland': False    75
Name: Luigi, dtype: int64, 'Israel': False    252
Name: Luigi, dtype: int64, 'Italy': False    80
Name: Luigi, dtype: int64, 'Japan': False    42
Name: Luigi, dtype: int64, 'Luxembourg': False    40
Name: Luigi, dtype: int64, 'Netherlands': False    40
Name: Luigi, dtype: int64, 'New_Zealand': False    52
Name: Luigi, dtype: int64, 'Norway': False    31
Name: Lui

##### 25.3 SSIS

SQL Server Integration Services (SSIS) is a Microsoft tool used for building data integration and ETL (extract, transform, load) workflows. It allows users to perform a range of tasks such as data extraction, transformation, and loading from various sources to different destinations.

In [251]:
tool_names = [
    r"SSIS",
    r"SQL Server Integration Services"
    ]

column_name = 'SSIS'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: SSIS, dtype: int64, 'Austria': False    138
True       1
Name: SSIS, dtype: int64, 'Belgium': False    61
True      3
Name: SSIS, dtype: int64, 'Canada': False    4
Name: SSIS, dtype: int64, 'Czech_Republic': False    68
Name: SSIS, dtype: int64, 'Denmark': False    91
Name: SSIS, dtype: int64, 'Finland': False    58
Name: SSIS, dtype: int64, 'France': False    222
True       5
Name: SSIS, dtype: int64, 'Germany': False    164
Name: SSIS, dtype: int64, 'Greece': False    56
Name: SSIS, dtype: int64, 'Hong_Kong': False    105
Name: SSIS, dtype: int64, 'Hungary': False    86
Name: SSIS, dtype: int64, 'Ireland': False    75
Name: SSIS, dtype: int64, 'Israel': False    249
True       3
Name: SSIS, dtype: int64, 'Italy': False    79
True      1
Name: SSIS, dtype: int64, 'Japan': False    42
Name: SSIS, dtype: int64, 'Luxembourg': False    39
True      1
Name: SSIS, dtype: int64, 'Netherlands': False    38
True      2
Name: SSIS, dtype: int64, 'New_Zealand': F

#### 26. Big Data processing

##### 24.1 Apache Hadoop

Apache Hadoop is an open-source framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It provides a distributed file system and supports various distributed computing models, such as MapReduce and Spark, for processing and analyzing large data sets.

In [252]:
tool_names = [
    r"Hadoop",
    ]

column_name = 'Apache_Hadoop'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: Apache_Hadoop, dtype: int64, 'Austria': False    138
True       1
Name: Apache_Hadoop, dtype: int64, 'Belgium': False    64
Name: Apache_Hadoop, dtype: int64, 'Canada': False    4
Name: Apache_Hadoop, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Apache_Hadoop, dtype: int64, 'Denmark': False    91
Name: Apache_Hadoop, dtype: int64, 'Finland': False    58
Name: Apache_Hadoop, dtype: int64, 'France': False    217
True      10
Name: Apache_Hadoop, dtype: int64, 'Germany': False    164
Name: Apache_Hadoop, dtype: int64, 'Greece': False    56
Name: Apache_Hadoop, dtype: int64, 'Hong_Kong': False    103
True       2
Name: Apache_Hadoop, dtype: int64, 'Hungary': False    85
True      1
Name: Apache_Hadoop, dtype: int64, 'Ireland': False    73
True      2
Name: Apache_Hadoop, dtype: int64, 'Israel': False    234
True      18
Name: Apache_Hadoop, dtype: int64, 'Italy': False    78
True      2
Name: Apache_Hadoop, dtype: int64, 'Japan':

##### 24.2 Apache Hive


Apache Hive is a data warehouse software that facilitates querying and managing large datasets stored in Hadoop file systems using a SQL-like language called HiveQL. It provides a high-level interface for data analysts and developers to analyze, transform, and summarize data stored in Hadoop Distributed File System (HDFS) and other compatible storage systems.

In [253]:
tool_names = [
    r"Hive",
    ]

column_name = 'Apache_Hive'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Apache_Hive, dtype: int64, 'Austria': False    139
Name: Apache_Hive, dtype: int64, 'Belgium': False    64
Name: Apache_Hive, dtype: int64, 'Canada': False    4
Name: Apache_Hive, dtype: int64, 'Czech_Republic': False    65
True      3
Name: Apache_Hive, dtype: int64, 'Denmark': False    91
Name: Apache_Hive, dtype: int64, 'Finland': False    58
Name: Apache_Hive, dtype: int64, 'France': False    218
True       9
Name: Apache_Hive, dtype: int64, 'Germany': False    164
Name: Apache_Hive, dtype: int64, 'Greece': False    56
Name: Apache_Hive, dtype: int64, 'Hong_Kong': False    103
True       2
Name: Apache_Hive, dtype: int64, 'Hungary': False    86
Name: Apache_Hive, dtype: int64, 'Ireland': False    73
True      2
Name: Apache_Hive, dtype: int64, 'Israel': False    239
True      13
Name: Apache_Hive, dtype: int64, 'Italy': False    78
True      2
Name: Apache_Hive, dtype: int64, 'Japan': False    42
Name: Apache_Hive, dtype: int64, 'Luxembourg': False  

##### 24.3 Apache Spark

Apache Spark is a distributed computing framework designed to process large-scale data processing and analysis workloads in parallel. It can be used for batch processing, real-time stream processing, machine learning, and graph processing, among other things.

In [254]:
tool_names = [
    r"Spark",
    r"PySpark"
    ]

column_name = 'Apache_Spark'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    63
True      4
Name: Apache_Spark, dtype: int64, 'Austria': False    134
True       5
Name: Apache_Spark, dtype: int64, 'Belgium': False    62
True      2
Name: Apache_Spark, dtype: int64, 'Canada': False    4
Name: Apache_Spark, dtype: int64, 'Czech_Republic': False    60
True      8
Name: Apache_Spark, dtype: int64, 'Denmark': False    90
True      1
Name: Apache_Spark, dtype: int64, 'Finland': False    54
True      4
Name: Apache_Spark, dtype: int64, 'France': False    174
True      53
Name: Apache_Spark, dtype: int64, 'Germany': False    163
True       1
Name: Apache_Spark, dtype: int64, 'Greece': False    53
True      3
Name: Apache_Spark, dtype: int64, 'Hong_Kong': False    101
True       4
Name: Apache_Spark, dtype: int64, 'Hungary': False    85
True      1
Name: Apache_Spark, dtype: int64, 'Ireland': False    73
True      2
Name: Apache_Spark, dtype: int64, 'Israel': False    197
True      55
Name: Apache_Spark, dtype: int64, 'Italy': False    75
True    

#### 25. OS

##### 25.1 Linux

A free and open-source operating system based on the Unix system.

In [255]:
tool_names = [
    r"Linux", r"Ubuntu", r"CentOS", r"Red Hat", r"Debian", r"Fedora", r"openSUSE", r"RHEL", r"Gentoo", r"Kali"
    ]

column_name = 'Linux'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: Linux, dtype: int64, 'Austria': False    138
True       1
Name: Linux, dtype: int64, 'Belgium': False    64
Name: Linux, dtype: int64, 'Canada': False    4
Name: Linux, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Linux, dtype: int64, 'Denmark': False    89
True      2
Name: Linux, dtype: int64, 'Finland': False    58
Name: Linux, dtype: int64, 'France': False    220
True       7
Name: Linux, dtype: int64, 'Germany': False    163
True       1
Name: Linux, dtype: int64, 'Greece': False    54
True      2
Name: Linux, dtype: int64, 'Hong_Kong': False    100
True       5
Name: Linux, dtype: int64, 'Hungary': False    85
True      1
Name: Linux, dtype: int64, 'Ireland': False    74
True      1
Name: Linux, dtype: int64, 'Israel': False    237
True      15
Name: Linux, dtype: int64, 'Italy': False    79
True      1
Name: Linux, dtype: int64, 'Japan': False    41
True      1
Name: Linux, dtype: int64, 'Luxembourg': False    40
Name:

##### 25.2 Unix

Family of multitasking, multiuser computer operating systems that derive from the original AT&T Unix.

In [256]:
tool_names = [
    r"Solaris", r"AIX", r"HP-UX", r"BSD", r"IRIX", r"SCO Unix", r"Xenix", r"OpenServer", r"Unix"
    ]

column_name = 'Unix'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Unix, dtype: int64, 'Austria': False    139
Name: Unix, dtype: int64, 'Belgium': False    64
Name: Unix, dtype: int64, 'Canada': False    4
Name: Unix, dtype: int64, 'Czech_Republic': False    68
Name: Unix, dtype: int64, 'Denmark': False    91
Name: Unix, dtype: int64, 'Finland': False    58
Name: Unix, dtype: int64, 'France': False    223
True       4
Name: Unix, dtype: int64, 'Germany': False    164
Name: Unix, dtype: int64, 'Greece': False    55
True      1
Name: Unix, dtype: int64, 'Hong_Kong': False    103
True       2
Name: Unix, dtype: int64, 'Hungary': False    86
Name: Unix, dtype: int64, 'Ireland': False    75
Name: Unix, dtype: int64, 'Israel': False    251
True       1
Name: Unix, dtype: int64, 'Italy': False    79
True      1
Name: Unix, dtype: int64, 'Japan': False    42
Name: Unix, dtype: int64, 'Luxembourg': False    39
True      1
Name: Unix, dtype: int64, 'Netherlands': False    40
Name: Unix, dtype: int64, 'New_Zealand': False    52
N

In [257]:
for country, df in dfs.items():
    dfs[country]['Unix'] = df.apply(lambda row: False if row['Unix'] and row['Linux'] else row['Unix'], axis=1)

show_results(column_name, dfs)

{'Australia': False    67
Name: Unix, dtype: int64, 'Austria': False    139
Name: Unix, dtype: int64, 'Belgium': False    64
Name: Unix, dtype: int64, 'Canada': False    4
Name: Unix, dtype: int64, 'Czech_Republic': False    68
Name: Unix, dtype: int64, 'Denmark': False    91
Name: Unix, dtype: int64, 'Finland': False    58
Name: Unix, dtype: int64, 'France': False    223
True       4
Name: Unix, dtype: int64, 'Germany': False    164
Name: Unix, dtype: int64, 'Greece': False    56
Name: Unix, dtype: int64, 'Hong_Kong': False    104
True       1
Name: Unix, dtype: int64, 'Hungary': False    86
Name: Unix, dtype: int64, 'Ireland': False    75
Name: Unix, dtype: int64, 'Israel': False    252
Name: Unix, dtype: int64, 'Italy': False    79
True      1
Name: Unix, dtype: int64, 'Japan': False    42
Name: Unix, dtype: int64, 'Luxembourg': False    39
True      1
Name: Unix, dtype: int64, 'Netherlands': False    40
Name: Unix, dtype: int64, 'New_Zealand': False    52
Name: Unix, dtype: int64, 

##### 25.3 Windows

A family of operating systems developed by Microsoft Corporation primarily for personal computers.

In [258]:
# If you have some soft like "Windows Remote Desktop", it's logical that it has to run on Windows, and there the system is required.

tool_names = [
    r"Windows", r"WinNT"
    ]

column_name = 'Windows'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Windows, dtype: int64, 'Austria': False    139
Name: Windows, dtype: int64, 'Belgium': False    64
Name: Windows, dtype: int64, 'Canada': False    4
Name: Windows, dtype: int64, 'Czech_Republic': False    68
Name: Windows, dtype: int64, 'Denmark': False    91
Name: Windows, dtype: int64, 'Finland': False    58
Name: Windows, dtype: int64, 'France': False    224
True       3
Name: Windows, dtype: int64, 'Germany': False    162
True       2
Name: Windows, dtype: int64, 'Greece': False    56
Name: Windows, dtype: int64, 'Hong_Kong': False    102
True       3
Name: Windows, dtype: int64, 'Hungary': False    86
Name: Windows, dtype: int64, 'Ireland': False    75
Name: Windows, dtype: int64, 'Israel': False    251
True       1
Name: Windows, dtype: int64, 'Italy': False    80
Name: Windows, dtype: int64, 'Japan': False    42
Name: Windows, dtype: int64, 'Luxembourg': False    40
Name: Windows, dtype: int64, 'Netherlands': False    39
True      1
Name: Windows,

##### 25.4 macOS

A proprietary operating system developed by Apple Inc. for its Macintosh line of computers.

In [259]:
tool_names = [
    r"macOS"
    ]

column_name = 'macOS'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: macOS, dtype: int64, 'Austria': False    139
Name: macOS, dtype: int64, 'Belgium': False    64
Name: macOS, dtype: int64, 'Canada': False    4
Name: macOS, dtype: int64, 'Czech_Republic': False    68
Name: macOS, dtype: int64, 'Denmark': False    91
Name: macOS, dtype: int64, 'Finland': False    58
Name: macOS, dtype: int64, 'France': False    227
Name: macOS, dtype: int64, 'Germany': False    164
Name: macOS, dtype: int64, 'Greece': False    56
Name: macOS, dtype: int64, 'Hong_Kong': False    105
Name: macOS, dtype: int64, 'Hungary': False    86
Name: macOS, dtype: int64, 'Ireland': False    75
Name: macOS, dtype: int64, 'Israel': False    252
Name: macOS, dtype: int64, 'Italy': False    80
Name: macOS, dtype: int64, 'Japan': False    42
Name: macOS, dtype: int64, 'Luxembourg': False    40
Name: macOS, dtype: int64, 'Netherlands': False    40
Name: macOS, dtype: int64, 'New_Zealand': False    52
Name: macOS, dtype: int64, 'Norway': False    31
Name: mac

#### 26. Programming languages

##### 26.1 Python 🐍

Python is a high-level, interpreted programming language used for various purposes such as web development, data analysis, artificial intelligence, and more.

In [260]:
tool_names = [
    r"Python",
    ]

column_name = 'Python'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    63
True      4
Name: Python, dtype: int64, 'Austria': False    133
True       6
Name: Python, dtype: int64, 'Belgium': False    56
True      8
Name: Python, dtype: int64, 'Canada': False    4
Name: Python, dtype: int64, 'Czech_Republic': False    62
True      6
Name: Python, dtype: int64, 'Denmark': False    88
True      3
Name: Python, dtype: int64, 'Finland': False    54
True      4
Name: Python, dtype: int64, 'France': False    168
True      59
Name: Python, dtype: int64, 'Germany': False    153
True      11
Name: Python, dtype: int64, 'Greece': False    51
True      5
Name: Python, dtype: int64, 'Hong_Kong': False    87
True     18
Name: Python, dtype: int64, 'Hungary': False    82
True      4
Name: Python, dtype: int64, 'Ireland': False    71
True      4
Name: Python, dtype: int64, 'Israel': False    137
True     115
Name: Python, dtype: int64, 'Italy': False    75
True      5
Name: Python, dtype: int64, 'Japan': False    38
True      4
Name: Python, dtype: 

##### 26.2 R 👴🏻

A programming language and environment for statistical graphics and computing.

In [261]:
tool_names = [
    # Avoid They'r etc.
    r"(?<!')[rR]",
    r"RStudio"
    ]

column_name = 'R'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: R, dtype: int64, 'Austria': False    137
True       2
Name: R, dtype: int64, 'Belgium': False    60
True      4
Name: R, dtype: int64, 'Canada': False    4
Name: R, dtype: int64, 'Czech_Republic': False    65
True      3
Name: R, dtype: int64, 'Denmark': False    87
True      4
Name: R, dtype: int64, 'Finland': False    58
Name: R, dtype: int64, 'France': False    220
True       7
Name: R, dtype: int64, 'Germany': False    159
True       5
Name: R, dtype: int64, 'Greece': False    56
Name: R, dtype: int64, 'Hong_Kong': False    102
True       3
Name: R, dtype: int64, 'Hungary': False    86
Name: R, dtype: int64, 'Ireland': False    73
True      2
Name: R, dtype: int64, 'Israel': False    239
True      13
Name: R, dtype: int64, 'Italy': False    70
True     10
Name: R, dtype: int64, 'Japan': False    41
True      1
Name: R, dtype: int64, 'Luxembourg': False    40
Name: R, dtype: int64, 'Netherlands': False    40
Name: R, dtype: int64, 'New_Zea

##### 26.3 Scala

Scala is a high-level, statically typed programming language designed for functional programming and scalable, concurrent applications.

In [262]:
tool_names = [
    r"Scala"
    ]

column_name = 'Scala'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    64
True      3
Name: Scala, dtype: int64, 'Austria': False    139
Name: Scala, dtype: int64, 'Belgium': False    63
True      1
Name: Scala, dtype: int64, 'Canada': False    4
Name: Scala, dtype: int64, 'Czech_Republic': False    67
True      1
Name: Scala, dtype: int64, 'Denmark': False    90
True      1
Name: Scala, dtype: int64, 'Finland': False    58
Name: Scala, dtype: int64, 'France': False    206
True      21
Name: Scala, dtype: int64, 'Germany': False    164
Name: Scala, dtype: int64, 'Greece': False    55
True      1
Name: Scala, dtype: int64, 'Hong_Kong': False    104
True       1
Name: Scala, dtype: int64, 'Hungary': False    85
True      1
Name: Scala, dtype: int64, 'Ireland': False    75
Name: Scala, dtype: int64, 'Israel': False    211
True      41
Name: Scala, dtype: int64, 'Italy': False    76
True      4
Name: Scala, dtype: int64, 'Japan': False    42
Name: Scala, dtype: int64, 'Luxembourg': False    38
True      2
Name: Scala, dtype: int64, 'Net

##### 26.4 Julia

Julia is a high-level, high-performance programming language that is particularly suited for scientific computing, numerical analysis, and data science.

In [263]:
tool_names = [
    r"Julia",
    r"JuliaLang ",
    ]

column_name = 'Julia'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Julia, dtype: int64, 'Austria': False    139
Name: Julia, dtype: int64, 'Belgium': False    64
Name: Julia, dtype: int64, 'Canada': False    4
Name: Julia, dtype: int64, 'Czech_Republic': False    68
Name: Julia, dtype: int64, 'Denmark': False    91
Name: Julia, dtype: int64, 'Finland': False    58
Name: Julia, dtype: int64, 'France': False    227
Name: Julia, dtype: int64, 'Germany': False    164
Name: Julia, dtype: int64, 'Greece': False    56
Name: Julia, dtype: int64, 'Hong_Kong': False    105
Name: Julia, dtype: int64, 'Hungary': False    86
Name: Julia, dtype: int64, 'Ireland': False    75
Name: Julia, dtype: int64, 'Israel': False    252
Name: Julia, dtype: int64, 'Italy': False    80
Name: Julia, dtype: int64, 'Japan': False    42
Name: Julia, dtype: int64, 'Luxembourg': False    40
Name: Julia, dtype: int64, 'Netherlands': False    40
Name: Julia, dtype: int64, 'New_Zealand': False    52
Name: Julia, dtype: int64, 'Norway': False    31
Name: Jul

##### 26.4 SQL

A programming language used to manage and manipulate relational databases.

In [264]:
# You should know SQL and its flavors if anyone below is required
tool_names = [
    r"SQL",
    r"MySQL",
    r"PostgreSQL",
    r"Postgres",
    r"SQLite",
    r"MariaDB",
    r"IBM DB2",
    r"Oracle Database",
    r"Db2",
    ]

column_name = 'SQL'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    63
True      4
Name: SQL, dtype: int64, 'Austria': False    125
True      14
Name: SQL, dtype: int64, 'Belgium': False    52
True     12
Name: SQL, dtype: int64, 'Canada': False    4
Name: SQL, dtype: int64, 'Czech_Republic': False    62
True      6
Name: SQL, dtype: int64, 'Denmark': False    86
True      5
Name: SQL, dtype: int64, 'Finland': False    54
True      4
Name: SQL, dtype: int64, 'France': False    173
True      54
Name: SQL, dtype: int64, 'Germany': False    154
True      10
Name: SQL, dtype: int64, 'Greece': False    52
True      4
Name: SQL, dtype: int64, 'Hong_Kong': False    91
True     14
Name: SQL, dtype: int64, 'Hungary': False    81
True      5
Name: SQL, dtype: int64, 'Ireland': False    70
True      5
Name: SQL, dtype: int64, 'Israel': False    187
True      65
Name: SQL, dtype: int64, 'Italy': False    71
True      9
Name: SQL, dtype: int64, 'Japan': False    38
True      4
Name: SQL, dtype: int64, 'Luxembourg': False    34
True      6
Nam

##### 26.5 Java

Java is a high-level, object-oriented programming language widely used for developing robust and scalable enterprise applications.

In Data Science, Java can be used for developing machine learning models, data analysis, and data processing applications, as well as for building large-scale distributed systems for big data processing and management.

In [265]:
tool_names = [
    r"Java",
    ]

column_name = 'Java'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    65
True      2
Name: Java, dtype: int64, 'Austria': False    135
True       4
Name: Java, dtype: int64, 'Belgium': False    64
Name: Java, dtype: int64, 'Canada': False    4
Name: Java, dtype: int64, 'Czech_Republic': False    65
True      3
Name: Java, dtype: int64, 'Denmark': False    90
True      1
Name: Java, dtype: int64, 'Finland': False    58
Name: Java, dtype: int64, 'France': False    206
True      21
Name: Java, dtype: int64, 'Germany': False    160
True       4
Name: Java, dtype: int64, 'Greece': False    54
True      2
Name: Java, dtype: int64, 'Hong_Kong': False    100
True       5
Name: Java, dtype: int64, 'Hungary': False    85
True      1
Name: Java, dtype: int64, 'Ireland': False    75
Name: Java, dtype: int64, 'Israel': False    200
True      52
Name: Java, dtype: int64, 'Italy': False    76
True      4
Name: Java, dtype: int64, 'Japan': False    41
True      1
Name: Java, dtype: int64, 'Luxembourg': False    39
True      1
Name: Java, dtype: in

##### 26.6 C++

A general-purpose programming language designed for systems and application programming, and it is used in Data Science for building high-performance libraries and applications that require intensive computational tasks.

In [266]:
tool_names = [
    r"C\+\+",
    ]

column_name = 'C++'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: C++, dtype: int64, 'Austria': False    139
Name: C++, dtype: int64, 'Belgium': False    64
Name: C++, dtype: int64, 'Canada': False    4
Name: C++, dtype: int64, 'Czech_Republic': False    68
Name: C++, dtype: int64, 'Denmark': False    91
Name: C++, dtype: int64, 'Finland': False    58
Name: C++, dtype: int64, 'France': False    227
Name: C++, dtype: int64, 'Germany': False    164
Name: C++, dtype: int64, 'Greece': False    56
Name: C++, dtype: int64, 'Hong_Kong': False    105
Name: C++, dtype: int64, 'Hungary': False    86
Name: C++, dtype: int64, 'Ireland': False    75
Name: C++, dtype: int64, 'Israel': False    252
Name: C++, dtype: int64, 'Italy': False    80
Name: C++, dtype: int64, 'Japan': False    42
Name: C++, dtype: int64, 'Luxembourg': False    40
Name: C++, dtype: int64, 'Netherlands': False    40
Name: C++, dtype: int64, 'New_Zealand': False    52
Name: C++, dtype: int64, 'Norway': False    31
Name: C++, dtype: int64, 'Poland': False    109

##### 26.7 Go

A statically typed programming language designed for building simple, efficient, and reliable software, and it can be used in data engineering for building scalable, distributed systems for data processing and analysis.

In [267]:
tool_names = [
    r"Go language", # Go as separate word is too common in English
    r"Golang",
    ]

column_name = 'Go'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Go, dtype: int64, 'Austria': False    139
Name: Go, dtype: int64, 'Belgium': False    64
Name: Go, dtype: int64, 'Canada': False    4
Name: Go, dtype: int64, 'Czech_Republic': False    68
Name: Go, dtype: int64, 'Denmark': False    91
Name: Go, dtype: int64, 'Finland': False    58
Name: Go, dtype: int64, 'France': False    227
Name: Go, dtype: int64, 'Germany': False    164
Name: Go, dtype: int64, 'Greece': False    56
Name: Go, dtype: int64, 'Hong_Kong': False    105
Name: Go, dtype: int64, 'Hungary': False    86
Name: Go, dtype: int64, 'Ireland': False    75
Name: Go, dtype: int64, 'Israel': False    251
True       1
Name: Go, dtype: int64, 'Italy': False    80
Name: Go, dtype: int64, 'Japan': False    42
Name: Go, dtype: int64, 'Luxembourg': False    40
Name: Go, dtype: int64, 'Netherlands': False    40
Name: Go, dtype: int64, 'New_Zealand': False    52
Name: Go, dtype: int64, 'Norway': False    31
Name: Go, dtype: int64, 'Poland': False    109
Name: 

##### 26.8 Rust 🦀

Although Rust is not yet as widely used as Python or R for data science, it is gaining popularity due to its ability to handle large-scale, computationally intensive tasks with high efficiency and safety, making it a promising language for data scientists and researchers alike. Additionally, Rust's rich set of libraries and tools, such as ndarray and RustDataScience, provide a solid foundation for building data-driven applications.

In [268]:
tool_names = [
    r"Rust",
    ]

column_name = 'Rust'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Rust, dtype: int64, 'Austria': False    139
Name: Rust, dtype: int64, 'Belgium': False    64
Name: Rust, dtype: int64, 'Canada': False    4
Name: Rust, dtype: int64, 'Czech_Republic': False    68
Name: Rust, dtype: int64, 'Denmark': False    91
Name: Rust, dtype: int64, 'Finland': False    58
Name: Rust, dtype: int64, 'France': False    227
Name: Rust, dtype: int64, 'Germany': False    164
Name: Rust, dtype: int64, 'Greece': False    56
Name: Rust, dtype: int64, 'Hong_Kong': False    105
Name: Rust, dtype: int64, 'Hungary': False    86
Name: Rust, dtype: int64, 'Ireland': False    75
Name: Rust, dtype: int64, 'Israel': False    252
Name: Rust, dtype: int64, 'Italy': False    80
Name: Rust, dtype: int64, 'Japan': False    42
Name: Rust, dtype: int64, 'Luxembourg': False    40
Name: Rust, dtype: int64, 'Netherlands': False    40
Name: Rust, dtype: int64, 'New_Zealand': False    52
Name: Rust, dtype: int64, 'Norway': False    31
Name: Rust, dtype: int64, 'P

##### 26.8 Bash

A shell scripting language used for automating repetitive tasks and managing the operating system, including data processing tasks, in the command-line interface (CLI) on Unix and Unix-like systems.

In [269]:
tool_names = [
    r"Bash",
    ]

column_name = 'Bash'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Bash, dtype: int64, 'Austria': False    139
Name: Bash, dtype: int64, 'Belgium': False    64
Name: Bash, dtype: int64, 'Canada': False    4
Name: Bash, dtype: int64, 'Czech_Republic': False    68
Name: Bash, dtype: int64, 'Denmark': False    91
Name: Bash, dtype: int64, 'Finland': False    58
Name: Bash, dtype: int64, 'France': False    227
Name: Bash, dtype: int64, 'Germany': False    164
Name: Bash, dtype: int64, 'Greece': False    56
Name: Bash, dtype: int64, 'Hong_Kong': False    104
True       1
Name: Bash, dtype: int64, 'Hungary': False    86
Name: Bash, dtype: int64, 'Ireland': False    75
Name: Bash, dtype: int64, 'Israel': False    250
True       2
Name: Bash, dtype: int64, 'Italy': False    80
Name: Bash, dtype: int64, 'Japan': False    42
Name: Bash, dtype: int64, 'Luxembourg': False    40
Name: Bash, dtype: int64, 'Netherlands': False    39
True      1
Name: Bash, dtype: int64, 'New_Zealand': False    52
Name: Bash, dtype: int64, 'Norway': Fa

##### 26.9 Powershell

A task automation and configuration management framework from Microsoft, which can be used in Data Science for automating various data processing tasks on Windows machines in the command-line interface (CLI).

In [270]:
tool_names = [
    r"PowerShell",
    r"DOS Shell"
    ]

column_name = 'PowerShell'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: PowerShell, dtype: int64, 'Austria': False    139
Name: PowerShell, dtype: int64, 'Belgium': False    63
True      1
Name: PowerShell, dtype: int64, 'Canada': False    4
Name: PowerShell, dtype: int64, 'Czech_Republic': False    68
Name: PowerShell, dtype: int64, 'Denmark': False    91
Name: PowerShell, dtype: int64, 'Finland': False    58
Name: PowerShell, dtype: int64, 'France': False    226
True       1
Name: PowerShell, dtype: int64, 'Germany': False    164
Name: PowerShell, dtype: int64, 'Greece': False    56
Name: PowerShell, dtype: int64, 'Hong_Kong': False    104
True       1
Name: PowerShell, dtype: int64, 'Hungary': False    86
Name: PowerShell, dtype: int64, 'Ireland': False    75
Name: PowerShell, dtype: int64, 'Israel': False    251
True       1
Name: PowerShell, dtype: int64, 'Italy': False    80
Name: PowerShell, dtype: int64, 'Japan': False    42
Name: PowerShell, dtype: int64, 'Luxembourg': False    40
Name: PowerShell, dtype

##### 26.10 CLI

CLI stands for Command Line Interface, which is a way to interact with a computer program through text commands, and it is commonly used in Data Science for running scripts, automating tasks, and managing software packages.

In [271]:
tool_names = [
    r"CLI",
    r"Command Line Interface"
    ]

column_name = 'CLI'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: CLI, dtype: int64, 'Austria': False    139
Name: CLI, dtype: int64, 'Belgium': False    64
Name: CLI, dtype: int64, 'Canada': False    4
Name: CLI, dtype: int64, 'Czech_Republic': False    68
Name: CLI, dtype: int64, 'Denmark': False    91
Name: CLI, dtype: int64, 'Finland': False    58
Name: CLI, dtype: int64, 'France': False    227
Name: CLI, dtype: int64, 'Germany': False    164
Name: CLI, dtype: int64, 'Greece': False    56
Name: CLI, dtype: int64, 'Hong_Kong': False    105
Name: CLI, dtype: int64, 'Hungary': False    86
Name: CLI, dtype: int64, 'Ireland': False    74
True      1
Name: CLI, dtype: int64, 'Israel': False    252
Name: CLI, dtype: int64, 'Italy': False    80
Name: CLI, dtype: int64, 'Japan': False    42
Name: CLI, dtype: int64, 'Luxembourg': False    40
Name: CLI, dtype: int64, 'Netherlands': False    40
Name: CLI, dtype: int64, 'New_Zealand': False    52
Name: CLI, dtype: int64, 'Norway': False    31
Name: CLI, dtype: int64

#### 27. Virtualization Tools

Business intelligence and data visualization tools used for analyzing and visualizing data.

##### 27.1 Tableau

In [272]:
tool_names = [
    r"Tableau"
    ]

column_name = 'Tableau'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Tableau, dtype: int64, 'Austria': False    139
Name: Tableau, dtype: int64, 'Belgium': False    64
Name: Tableau, dtype: int64, 'Canada': False    4
Name: Tableau, dtype: int64, 'Czech_Republic': False    68
Name: Tableau, dtype: int64, 'Denmark': False    91
Name: Tableau, dtype: int64, 'Finland': False    58
Name: Tableau, dtype: int64, 'France': False    220
True       7
Name: Tableau, dtype: int64, 'Germany': False    163
True       1
Name: Tableau, dtype: int64, 'Greece': False    56
Name: Tableau, dtype: int64, 'Hong_Kong': False    102
True       3
Name: Tableau, dtype: int64, 'Hungary': False    86
Name: Tableau, dtype: int64, 'Ireland': False    74
True      1
Name: Tableau, dtype: int64, 'Israel': False    247
True       5
Name: Tableau, dtype: int64, 'Italy': False    79
True      1
Name: Tableau, dtype: int64, 'Japan': False    41
True      1
Name: Tableau, dtype: int64, 'Luxembourg': False    39
True      1
Name: Tableau, dtype: int64, 'Neth

##### 27.2 Power BI

In [273]:
tool_names = [
    r"Power BI"
    ]

column_name = 'Power_BI'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    65
True      2
Name: Power_BI, dtype: int64, 'Austria': False    138
True       1
Name: Power_BI, dtype: int64, 'Belgium': False    58
True      6
Name: Power_BI, dtype: int64, 'Canada': False    4
Name: Power_BI, dtype: int64, 'Czech_Republic': False    68
Name: Power_BI, dtype: int64, 'Denmark': False    88
True      3
Name: Power_BI, dtype: int64, 'Finland': False    58
Name: Power_BI, dtype: int64, 'France': False    220
True       7
Name: Power_BI, dtype: int64, 'Germany': False    157
True       7
Name: Power_BI, dtype: int64, 'Greece': False    55
True      1
Name: Power_BI, dtype: int64, 'Hong_Kong': False    102
True       3
Name: Power_BI, dtype: int64, 'Hungary': False    85
True      1
Name: Power_BI, dtype: int64, 'Ireland': False    74
True      1
Name: Power_BI, dtype: int64, 'Israel': False    250
True       2
Name: Power_BI, dtype: int64, 'Italy': False    78
True      2
Name: Power_BI, dtype: int64, 'Japan': False    41
True      1
Name: Power_B

##### 27.3 Google Analytics

In [274]:
tool_names = [
    r"Google Analytics"
    ]

column_name = 'Google_Analytics'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Google_Analytics, dtype: int64, 'Austria': False    138
True       1
Name: Google_Analytics, dtype: int64, 'Belgium': False    64
Name: Google_Analytics, dtype: int64, 'Canada': False    4
Name: Google_Analytics, dtype: int64, 'Czech_Republic': False    68
Name: Google_Analytics, dtype: int64, 'Denmark': False    91
Name: Google_Analytics, dtype: int64, 'Finland': False    58
Name: Google_Analytics, dtype: int64, 'France': False    227
Name: Google_Analytics, dtype: int64, 'Germany': False    164
Name: Google_Analytics, dtype: int64, 'Greece': False    55
True      1
Name: Google_Analytics, dtype: int64, 'Hong_Kong': False    105
Name: Google_Analytics, dtype: int64, 'Hungary': False    86
Name: Google_Analytics, dtype: int64, 'Ireland': False    75
Name: Google_Analytics, dtype: int64, 'Israel': False    251
True       1
Name: Google_Analytics, dtype: int64, 'Italy': False    80
Name: Google_Analytics, dtype: int64, 'Japan': False    42
Name: Google_Ana

##### 27.4 QlikView

In [275]:
tool_names = [
    r"QlikView",
    r"Qlik"
    ]

column_name = 'QlikView'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: QlikView, dtype: int64, 'Austria': False    139
Name: QlikView, dtype: int64, 'Belgium': False    64
Name: QlikView, dtype: int64, 'Canada': False    4
Name: QlikView, dtype: int64, 'Czech_Republic': False    68
Name: QlikView, dtype: int64, 'Denmark': False    91
Name: QlikView, dtype: int64, 'Finland': False    58
Name: QlikView, dtype: int64, 'France': False    225
True       2
Name: QlikView, dtype: int64, 'Germany': False    164
Name: QlikView, dtype: int64, 'Greece': False    56
Name: QlikView, dtype: int64, 'Hong_Kong': False    104
True       1
Name: QlikView, dtype: int64, 'Hungary': False    86
Name: QlikView, dtype: int64, 'Ireland': False    75
Name: QlikView, dtype: int64, 'Israel': False    251
True       1
Name: QlikView, dtype: int64, 'Italy': False    78
True      2
Name: QlikView, dtype: int64, 'Japan': False    42
Name: QlikView, dtype: int64, 'Luxembourg': False    40
Name: QlikView, dtype: int64, 'Netherlands': False    39
True      

##### 27.5 Oracle BI server

In [276]:
tool_names = [
    r"Oracle Business Intelligence Enterprise Edition",
    r"OBIEE",
    r"Oracle BI server",
    ]

column_name = 'Oracle_BI_server'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Oracle_BI_server, dtype: int64, 'Austria': False    139
Name: Oracle_BI_server, dtype: int64, 'Belgium': False    64
Name: Oracle_BI_server, dtype: int64, 'Canada': False    4
Name: Oracle_BI_server, dtype: int64, 'Czech_Republic': False    68
Name: Oracle_BI_server, dtype: int64, 'Denmark': False    91
Name: Oracle_BI_server, dtype: int64, 'Finland': False    58
Name: Oracle_BI_server, dtype: int64, 'France': False    227
Name: Oracle_BI_server, dtype: int64, 'Germany': False    164
Name: Oracle_BI_server, dtype: int64, 'Greece': False    56
Name: Oracle_BI_server, dtype: int64, 'Hong_Kong': False    105
Name: Oracle_BI_server, dtype: int64, 'Hungary': False    86
Name: Oracle_BI_server, dtype: int64, 'Ireland': False    75
Name: Oracle_BI_server, dtype: int64, 'Israel': False    252
Name: Oracle_BI_server, dtype: int64, 'Italy': False    80
Name: Oracle_BI_server, dtype: int64, 'Japan': False    42
Name: Oracle_BI_server, dtype: int64, 'Luxembourg': Fa

##### 27.6 SAS Analytics

In [277]:
tool_names = [
    r"SAS Analytics",
    r"Statistical Analysis System",
    ]

column_name = 'SAS_Analytics'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: SAS_Analytics, dtype: int64, 'Austria': False    139
Name: SAS_Analytics, dtype: int64, 'Belgium': False    64
Name: SAS_Analytics, dtype: int64, 'Canada': False    4
Name: SAS_Analytics, dtype: int64, 'Czech_Republic': False    68
Name: SAS_Analytics, dtype: int64, 'Denmark': False    91
Name: SAS_Analytics, dtype: int64, 'Finland': False    58
Name: SAS_Analytics, dtype: int64, 'France': False    227
Name: SAS_Analytics, dtype: int64, 'Germany': False    164
Name: SAS_Analytics, dtype: int64, 'Greece': False    56
Name: SAS_Analytics, dtype: int64, 'Hong_Kong': False    105
Name: SAS_Analytics, dtype: int64, 'Hungary': False    86
Name: SAS_Analytics, dtype: int64, 'Ireland': False    75
Name: SAS_Analytics, dtype: int64, 'Israel': False    252
Name: SAS_Analytics, dtype: int64, 'Italy': False    80
Name: SAS_Analytics, dtype: int64, 'Japan': False    42
Name: SAS_Analytics, dtype: int64, 'Luxembourg': False    40
Name: SAS_Analytics, dtype: int64, 'Ne

##### 27.7 Lumira

In [278]:
tool_names = [
    r"Lumira",
    ]

column_name = 'Lumira'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Lumira, dtype: int64, 'Austria': False    139
Name: Lumira, dtype: int64, 'Belgium': False    64
Name: Lumira, dtype: int64, 'Canada': False    4
Name: Lumira, dtype: int64, 'Czech_Republic': False    68
Name: Lumira, dtype: int64, 'Denmark': False    91
Name: Lumira, dtype: int64, 'Finland': False    58
Name: Lumira, dtype: int64, 'France': False    227
Name: Lumira, dtype: int64, 'Germany': False    164
Name: Lumira, dtype: int64, 'Greece': False    56
Name: Lumira, dtype: int64, 'Hong_Kong': False    105
Name: Lumira, dtype: int64, 'Hungary': False    86
Name: Lumira, dtype: int64, 'Ireland': False    75
Name: Lumira, dtype: int64, 'Israel': False    252
Name: Lumira, dtype: int64, 'Italy': False    80
Name: Lumira, dtype: int64, 'Japan': False    42
Name: Lumira, dtype: int64, 'Luxembourg': False    40
Name: Lumira, dtype: int64, 'Netherlands': False    40
Name: Lumira, dtype: int64, 'New_Zealand': False    52
Name: Lumira, dtype: int64, 'Norway': Fa

##### 27.8 IBM Cognos Impromptu

In [279]:
tool_names = [
    r"Cognos Impromptu",
    ]

column_name = 'Cognos_Impromptu'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Cognos_Impromptu, dtype: int64, 'Austria': False    139
Name: Cognos_Impromptu, dtype: int64, 'Belgium': False    64
Name: Cognos_Impromptu, dtype: int64, 'Canada': False    4
Name: Cognos_Impromptu, dtype: int64, 'Czech_Republic': False    68
Name: Cognos_Impromptu, dtype: int64, 'Denmark': False    91
Name: Cognos_Impromptu, dtype: int64, 'Finland': False    58
Name: Cognos_Impromptu, dtype: int64, 'France': False    227
Name: Cognos_Impromptu, dtype: int64, 'Germany': False    164
Name: Cognos_Impromptu, dtype: int64, 'Greece': False    56
Name: Cognos_Impromptu, dtype: int64, 'Hong_Kong': False    105
Name: Cognos_Impromptu, dtype: int64, 'Hungary': False    86
Name: Cognos_Impromptu, dtype: int64, 'Ireland': False    75
Name: Cognos_Impromptu, dtype: int64, 'Israel': False    252
Name: Cognos_Impromptu, dtype: int64, 'Italy': False    80
Name: Cognos_Impromptu, dtype: int64, 'Japan': False    42
Name: Cognos_Impromptu, dtype: int64, 'Luxembourg': Fa

##### 27.9 MicroStrategy

In [280]:
tool_names = [
    r"MicroStrategy",
    ]

column_name = 'MicroStrategy'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: MicroStrategy, dtype: int64, 'Austria': False    139
Name: MicroStrategy, dtype: int64, 'Belgium': False    64
Name: MicroStrategy, dtype: int64, 'Canada': False    4
Name: MicroStrategy, dtype: int64, 'Czech_Republic': False    68
Name: MicroStrategy, dtype: int64, 'Denmark': False    91
Name: MicroStrategy, dtype: int64, 'Finland': False    58
Name: MicroStrategy, dtype: int64, 'France': False    226
True       1
Name: MicroStrategy, dtype: int64, 'Germany': False    164
Name: MicroStrategy, dtype: int64, 'Greece': False    56
Name: MicroStrategy, dtype: int64, 'Hong_Kong': False    105
Name: MicroStrategy, dtype: int64, 'Hungary': False    86
Name: MicroStrategy, dtype: int64, 'Ireland': False    75
Name: MicroStrategy, dtype: int64, 'Israel': False    252
Name: MicroStrategy, dtype: int64, 'Italy': False    80
Name: MicroStrategy, dtype: int64, 'Japan': False    41
True      1
Name: MicroStrategy, dtype: int64, 'Luxembourg': False    39
True      1
N

##### 27.10 InsightSquared

In [281]:
tool_names = [
    r"InsightSquared",
    ]

column_name = 'InsightSquared'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: InsightSquared, dtype: int64, 'Austria': False    139
Name: InsightSquared, dtype: int64, 'Belgium': False    64
Name: InsightSquared, dtype: int64, 'Canada': False    4
Name: InsightSquared, dtype: int64, 'Czech_Republic': False    68
Name: InsightSquared, dtype: int64, 'Denmark': False    91
Name: InsightSquared, dtype: int64, 'Finland': False    58
Name: InsightSquared, dtype: int64, 'France': False    227
Name: InsightSquared, dtype: int64, 'Germany': False    164
Name: InsightSquared, dtype: int64, 'Greece': False    56
Name: InsightSquared, dtype: int64, 'Hong_Kong': False    105
Name: InsightSquared, dtype: int64, 'Hungary': False    86
Name: InsightSquared, dtype: int64, 'Ireland': False    75
Name: InsightSquared, dtype: int64, 'Israel': False    252
Name: InsightSquared, dtype: int64, 'Italy': False    80
Name: InsightSquared, dtype: int64, 'Japan': False    42
Name: InsightSquared, dtype: int64, 'Luxembourg': False    40
Name: InsightSquared, 

##### 27.11 Sisense

In [282]:
tool_names = [
    r"Sisense",
    ]

column_name = 'Sisense'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Sisense, dtype: int64, 'Austria': False    139
Name: Sisense, dtype: int64, 'Belgium': False    64
Name: Sisense, dtype: int64, 'Canada': False    4
Name: Sisense, dtype: int64, 'Czech_Republic': False    68
Name: Sisense, dtype: int64, 'Denmark': False    91
Name: Sisense, dtype: int64, 'Finland': False    58
Name: Sisense, dtype: int64, 'France': False    227
Name: Sisense, dtype: int64, 'Germany': False    164
Name: Sisense, dtype: int64, 'Greece': False    56
Name: Sisense, dtype: int64, 'Hong_Kong': False    105
Name: Sisense, dtype: int64, 'Hungary': False    86
Name: Sisense, dtype: int64, 'Ireland': False    75
Name: Sisense, dtype: int64, 'Israel': False    252
Name: Sisense, dtype: int64, 'Italy': False    80
Name: Sisense, dtype: int64, 'Japan': False    42
Name: Sisense, dtype: int64, 'Luxembourg': False    40
Name: Sisense, dtype: int64, 'Netherlands': False    40
Name: Sisense, dtype: int64, 'New_Zealand': False    52
Name: Sisense, dtype: 

##### 27.12 Dundas BI

In [283]:
tool_names = [
    r"Dundas BI",
    ]

column_name = 'Dundas_BI'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Dundas_BI, dtype: int64, 'Austria': False    139
Name: Dundas_BI, dtype: int64, 'Belgium': False    64
Name: Dundas_BI, dtype: int64, 'Canada': False    4
Name: Dundas_BI, dtype: int64, 'Czech_Republic': False    68
Name: Dundas_BI, dtype: int64, 'Denmark': False    91
Name: Dundas_BI, dtype: int64, 'Finland': False    58
Name: Dundas_BI, dtype: int64, 'France': False    227
Name: Dundas_BI, dtype: int64, 'Germany': False    164
Name: Dundas_BI, dtype: int64, 'Greece': False    56
Name: Dundas_BI, dtype: int64, 'Hong_Kong': False    105
Name: Dundas_BI, dtype: int64, 'Hungary': False    86
Name: Dundas_BI, dtype: int64, 'Ireland': False    75
Name: Dundas_BI, dtype: int64, 'Israel': False    252
Name: Dundas_BI, dtype: int64, 'Italy': False    80
Name: Dundas_BI, dtype: int64, 'Japan': False    42
Name: Dundas_BI, dtype: int64, 'Luxembourg': False    40
Name: Dundas_BI, dtype: int64, 'Netherlands': False    40
Name: Dundas_BI, dtype: int64, 'New_Zealand'

##### 27.13 Domo

In [284]:
tool_names = [
    r"Domo",
    ]

column_name = 'Domo'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Domo, dtype: int64, 'Austria': False    139
Name: Domo, dtype: int64, 'Belgium': False    64
Name: Domo, dtype: int64, 'Canada': False    4
Name: Domo, dtype: int64, 'Czech_Republic': False    68
Name: Domo, dtype: int64, 'Denmark': False    91
Name: Domo, dtype: int64, 'Finland': False    58
Name: Domo, dtype: int64, 'France': False    227
Name: Domo, dtype: int64, 'Germany': False    164
Name: Domo, dtype: int64, 'Greece': False    56
Name: Domo, dtype: int64, 'Hong_Kong': False    105
Name: Domo, dtype: int64, 'Hungary': False    86
Name: Domo, dtype: int64, 'Ireland': False    75
Name: Domo, dtype: int64, 'Israel': False    252
Name: Domo, dtype: int64, 'Italy': False    80
Name: Domo, dtype: int64, 'Japan': False    42
Name: Domo, dtype: int64, 'Luxembourg': False    40
Name: Domo, dtype: int64, 'Netherlands': False    40
Name: Domo, dtype: int64, 'New_Zealand': False    52
Name: Domo, dtype: int64, 'Norway': False    31
Name: Domo, dtype: int64, 'P

##### 27.14 Looker

In [285]:
tool_names = [
    r"Looker",
    ]

column_name = 'Looker'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    67
Name: Looker, dtype: int64, 'Austria': False    139
Name: Looker, dtype: int64, 'Belgium': False    64
Name: Looker, dtype: int64, 'Canada': False    4
Name: Looker, dtype: int64, 'Czech_Republic': False    68
Name: Looker, dtype: int64, 'Denmark': False    91
Name: Looker, dtype: int64, 'Finland': False    58
Name: Looker, dtype: int64, 'France': False    223
True       4
Name: Looker, dtype: int64, 'Germany': False    164
Name: Looker, dtype: int64, 'Greece': False    56
Name: Looker, dtype: int64, 'Hong_Kong': False    105
Name: Looker, dtype: int64, 'Hungary': False    86
Name: Looker, dtype: int64, 'Ireland': False    74
True      1
Name: Looker, dtype: int64, 'Israel': False    249
True       3
Name: Looker, dtype: int64, 'Italy': False    79
True      1
Name: Looker, dtype: int64, 'Japan': False    42
Name: Looker, dtype: int64, 'Luxembourg': False    40
Name: Looker, dtype: int64, 'Netherlands': False    40
Name: Looker, dtype: int64, 'New_Zealand': Fa

#### 28. Microsoft Excel

In [286]:
tool_names = [
    r"Excel",
    ]

column_name = 'Excel'

dfs = add_tech_to_dfs(dfs, tool_names, column_name)

{'Australia': False    66
True      1
Name: Excel, dtype: int64, 'Austria': False    139
Name: Excel, dtype: int64, 'Belgium': False    62
True      2
Name: Excel, dtype: int64, 'Canada': False    4
Name: Excel, dtype: int64, 'Czech_Republic': False    68
Name: Excel, dtype: int64, 'Denmark': False    91
Name: Excel, dtype: int64, 'Finland': False    57
True      1
Name: Excel, dtype: int64, 'France': False    227
Name: Excel, dtype: int64, 'Germany': False    164
Name: Excel, dtype: int64, 'Greece': False    56
Name: Excel, dtype: int64, 'Hong_Kong': False    105
Name: Excel, dtype: int64, 'Hungary': False    86
Name: Excel, dtype: int64, 'Ireland': False    74
True      1
Name: Excel, dtype: int64, 'Israel': False    252
Name: Excel, dtype: int64, 'Italy': False    80
Name: Excel, dtype: int64, 'Japan': False    42
Name: Excel, dtype: int64, 'Luxembourg': False    39
True      1
Name: Excel, dtype: int64, 'Netherlands': False    40
Name: Excel, dtype: int64, 'New_Zealand': False    5

#### 29. Certifications

In [287]:
def add_edu_to_dfs(dfs, education_names, education_non_eng, column_name):

    for country, df in dfs.items():

        country_languages = countries_languages[country]

        for language in country_languages:

            names_to_add = education_non_eng[language]

            education_names.extend(names_to_add)

        add_is_needed_column_to_df(df, column_name, education_names)

        dfs[country] = df

    show_results(column_name, dfs)
    
    return dfs

Checking if there is a need for any certification.

In [288]:
certificates_non_eng = {
    'Arabic': [r"الشهادات", r"شهادة"],
    'Basque': [r"ziurtagiriak", r"ziurtagiri"],
    'Catalan': [r"certificats", r"certificat"],
    'Czech': [r"certifikáty", r"certifikát"],
    'German': [r"Zertifikate", r"Zertifikat"],
    'Danish': [r"certifikater", r"certifikat"],
    'Spanish': [r"certificados", r"certificado"],
    'Finnish': [r"todistukset", r"todistus"],
    'French': [r"certificats", r"certificat"],
    'Frisian': [r"sertifikaten", r"sertifikaat"],
    'Galician': [r"certificados", r"certificado"],
    'Greek': [r"πιστοποιητικά", r"πιστοποιητικό"],
    'Hebrew': [r"תעודות", r"תעודה"],
    'Hungarian': [r"tanúsítványok", r"tanúsítvány"],
    'Italian': [r"certificati", r"certificato"],
    'Kurdish': [r"belge", r"bername"],
    'Dutch': [r"certificaten", r"certificaat"],
    'Norwegian': [r"sertifikater", r"sertifikat"],
    'Polish': [r"certyfikat", r"certyfikaty"],
    'Portuguese': [r"certificados", r"certificado"],
    'Romanian': [r"certificate", r"certificat"],
    'Slovakian': [r"certifikáty", r"certifikát"],
    'Slovenian': [r"certifikati", r"certifikat"],
    'Swedish': [r"certifikat", r"certifikat"],
    'Turkish': [r"sertifikalar", r"sertifika"],
    'Japanese': [r"証明書", r"資格証明書"],
    'Korean': [r"증명서", r"자격증"],
    'Chinese_TR': [r"證書", r"資格證書"],
    'Chinese_SP': [r"证书", r"资格证书"],
}

In [289]:
def make_check_certificate(country: str):

    def check_certificate(job_description: str):

        # Coursera, Udemy, Datacamp etc. list
        certifications = [
            r"Data Engineering, Big Data, and Machine Learning on GCP",
            r"Google Professional Data Engineer",
            r"Microsoft Azure Data Engineering",
            r"Nanodegree",
            r"DataCamp",
            r"Data Engineering, Big Data, and Machine Learning on GCP",
            r"Python, Bash and SQL Essentials for Data Engineering Specialization",
            r"Data Engineering ETL, Web Scraping, and Automation",
            r"Big Data Engineering with Hadoop and Spark",
            r"Certificate",
            r"Certificates",
            ]

        country_languages = countries_languages[country]
        certificate_eng = certifications[-2:]

        for language in country_languages:

            certs_to_add = certificates_non_eng[language]

            certifications.extend(certs_to_add)


        for certificate in certifications:
            if re.search((r"\b" + certificate + r"\b"), job_description, re.IGNORECASE):

                for certificate_non_eng in certificates_non_eng.values():

                    if any(certificate.lower() in cert.lower() for cert in certificate_non_eng) or \
                    any(certificate.lower() in cert.lower() for cert in certificate_eng):
                        return "Other"

                return certificate
            
        return np.nan
    
    return check_certificate


column_name = 'Certificate'

for country, df in dfs.items():

    check_certificate = make_check_certificate(country)
        
    df[column_name] = df['Description'].apply(check_certificate)

    dfs[country] = df

del check_certificate

show_results(column_name, dfs)

{'Australia': Series([], Name: Certificate, dtype: int64), 'Austria': Other    1
Name: Certificate, dtype: int64, 'Belgium': Other    2
Name: Certificate, dtype: int64, 'Canada': Series([], Name: Certificate, dtype: int64), 'Czech_Republic': Series([], Name: Certificate, dtype: int64), 'Denmark': Series([], Name: Certificate, dtype: int64), 'Finland': Series([], Name: Certificate, dtype: int64), 'France': Other    1
Name: Certificate, dtype: int64, 'Germany': Series([], Name: Certificate, dtype: int64), 'Greece': Series([], Name: Certificate, dtype: int64), 'Hong_Kong': Other    4
Name: Certificate, dtype: int64, 'Hungary': Series([], Name: Certificate, dtype: int64), 'Ireland': Other    1
Name: Certificate, dtype: int64, 'Israel': Series([], Name: Certificate, dtype: int64), 'Italy': Other    1
Name: Certificate, dtype: int64, 'Japan': Series([], Name: Certificate, dtype: int64), 'Luxembourg': Series([], Name: Certificate, dtype: int64), 'Netherlands': Series([], Name: Certificate, dt

In [290]:
del add_tech_to_dfs, certificates_non_eng, make_check_certificate

#### 30. Needed education level

Keep in mind that not all countries' educational levels are equal or the same.

##### 30.1 BA

In [291]:
education_non_eng = {
    'Arabic': [r"بكالوريوس", r"العلوم الأساسية"],
    'Basque': [ r"Lizentziatura",  r"Lizentziadun"],
    'Catalan': [r"Llicenciatura", r"Llicenciat"],
    'Czech': [r"Bakalář", r"Bakalářský"],
    'German': [r"Bakkalaureatsabschluss", r"Bakkalaureat", r"Bakkalaureus"],
    'Danish': [r"Kandidat"],
    'Spanish': [r"Grado", r"Licenciatura"],
    'Finnish': [ r"Kandidaatti", r"Luonnontieteiden kandidaatti"],
    'French': [r"licence", r"licence universitaire"],
    'Frisian': [],
    'Galician': [r"Grao", r"licenciatura", r"licenciado"],
    'Greek': [r"Πτυχίο", r"Πτυχιακός"],
    'Hebrew': [r"תואר ראשון", r"בצלמל"],
    'Hungarian': [r"Alapképzés", r"diplomás"],
    'Italian': [ r"Laurea", r"Triennale"],
    'Kurdish': [ r"Zanist"],
    'Dutch': [r"Bachelordiploma"],
    'Norwegian': [r"Bachelorgrad"],
    'Polish': [r"Licencjat", r"Inżynier"],
    'Portuguese': [r"Bacharelado", r"diploma de bacharel", r"solteiro", r"celibatário"],
    'Romanian': [r"burlac", r"licenţiat"],
    'Slovakian': [r"Bakalár"],
    'Slovenian': [r"Samec"],
    'Swedish': [r"Ungkarl", r"Kandidat"],
    'Turkish': [r"Lisans"],
    'Japanese': [r"学士号", r"学士", r"理学士", r"学士課程"],
    'Korean': [r"학사 학위", r"학사", r"이학사", r"학사 학위과정"],
    'Chinese_TR': [r"學士學位", r"學士", r"理學士", r"學士學位課程"],
    'Chinese_SP': [r"学士学位", r"学士", r"理学士", r"学士学位课程"],
}

In [292]:
education_name = [
    r"BA",
    r"Bachelor",
    r"BSc",
    r"Bachelors"
    ]

column_name = 'BA'

dfs = add_edu_to_dfs(dfs, education_name, education_non_eng, column_name)

{'Australia': False    67
Name: BA, dtype: int64, 'Austria': False    134
True       5
Name: BA, dtype: int64, 'Belgium': False    60
True      4
Name: BA, dtype: int64, 'Canada': False    4
Name: BA, dtype: int64, 'Czech_Republic': False    68
Name: BA, dtype: int64, 'Denmark': False    90
True      1
Name: BA, dtype: int64, 'Finland': False    58
Name: BA, dtype: int64, 'France': False    225
True       2
Name: BA, dtype: int64, 'Germany': False    162
True       2
Name: BA, dtype: int64, 'Greece': False    55
True      1
Name: BA, dtype: int64, 'Hong_Kong': False    95
True     10
Name: BA, dtype: int64, 'Hungary': False    86
Name: BA, dtype: int64, 'Ireland': False    71
True      4
Name: BA, dtype: int64, 'Israel': False    229
True      23
Name: BA, dtype: int64, 'Italy': False    71
True      9
Name: BA, dtype: int64, 'Japan': False    42
Name: BA, dtype: int64, 'Luxembourg': False    38
True      2
Name: BA, dtype: int64, 'Netherlands': False    40
Name: BA, dtype: int64, 'New

##### 30.2 MS

In [293]:
education_non_eng = {
    'Arabic':  [r"الماجستير", r"الماجستيرات"],
    'Basque': [r"Masterren"],
    'Catalan': [r"Mestres"],
    'Czech': [r"Magistr", r"Magisterský", r"magisterský"],
    'German': [r"Meister"],
    'Danish': [r"Kandidatuddannelse"],
    'Spanish': [r"Máster", r"Maestría"],
    'Finnish': [r"Maisteri", r"Luonnontieteiden maisteri"],
    'French': [r"Maîtrise", r"Master universitaire"],
    'Frisian': [],
    'Galician': [r"Mestrado", r"mestrado universitario"],
    'Greek': [ r"Μεταπτυχιακό", r"μεταπτυχιακός", r"Μεταπτυχιακή σπουδή"],
    'Hebrew': [r"תואר שני", r"מגיסטר"],
    'Hungarian': [r"Mesterképzés", r"mesterképző"],
    'Italian': [r"Laurea magistrale", r"Magistrale"],
    'Kurdish': [ r"Masterên"],
    'Dutch': [r"Meesters"],
    'Norwegian': [r"Mestere"],
    'Polish': [r"Magister", r"magisterski"],
    'Portuguese': [r"Mestras", r"Mestres"],
    'Romanian': [r"Masterat"],
    'Slovakian': [r"Majstri"],
    'Slovenian': [r"Magistri", r"Mojstri"],
    'Swedish': [r"Mästare"],
    'Turkish': [r"Ustalar", r"ustaları"],
    'Japanese': [r"学士", r"学士号", r"学士課程", r"バチェラー"],
    'Korean': [r"학사", r"학사학위", r"배철러"],
    'Chinese_TR': [r"學士", r"學士學位", r"學士學位課程", r"學士學位課程"],
    'Chinese_SP': [r"学士", r"学士学位", r"学士学位课程", r"本科"],
}

In [294]:
education_name = [
    r"MS",
    r"MSc",
    r"Master",
    r"Masters",
    r"master\'s"
    ]

column_name = 'MS'

dfs = add_edu_to_dfs(dfs, education_name, education_non_eng, column_name)

{'Australia': False    64
True      3
Name: MS, dtype: int64, 'Austria': False    133
True       6
Name: MS, dtype: int64, 'Belgium': False    55
True      9
Name: MS, dtype: int64, 'Canada': False    4
Name: MS, dtype: int64, 'Czech_Republic': False    66
True      2
Name: MS, dtype: int64, 'Denmark': False    87
True      4
Name: MS, dtype: int64, 'Finland': False    57
True      1
Name: MS, dtype: int64, 'France': False    200
True      27
Name: MS, dtype: int64, 'Germany': False    161
True       3
Name: MS, dtype: int64, 'Greece': False    55
True      1
Name: MS, dtype: int64, 'Hong_Kong': False    99
True      6
Name: MS, dtype: int64, 'Hungary': False    86
Name: MS, dtype: int64, 'Ireland': False    73
True      2
Name: MS, dtype: int64, 'Israel': False    245
True       7
Name: MS, dtype: int64, 'Italy': False    76
True      4
Name: MS, dtype: int64, 'Japan': False    41
True      1
Name: MS, dtype: int64, 'Luxembourg': False    33
True      7
Name: MS, dtype: int64, 'Nether

##### 30.3 Phd

In [295]:
education_non_eng = {
    'Arabic':  [r"دكتوراه في الفلسفة", r"دكتوراه", r"دكتوراة"],
    'Basque': [r"Filosofia Doktore", r"doktoretza", r"Doktoregoa"],
    'Catalan': [r"Mestres", r"Doctorat"],
    'Czech': [r"Doktor", r"Doktorský"],
    'German': [r"Doktorin", r"Doktor"],
    'Danish': [],
    'Spanish': [r"Doctor", r"Doctora"],
    'Finnish': [r"Tohtori"],
    'French': [r"Doctorat"],
    'Frisian': [],
    'Galician': [r"Doutorando", r"Doutoramento"],
    'Greek': [],
    'Hebrew': [r"דוקטורט", r"תואר שלישי"],
    'Hungarian': [r"Dr"],
    'Italian': [r"dottorato di ricerca"],
    'Kurdish': [ r"Dr"],
    'Dutch': [r"Doctoraat"],
    'Norwegian': [],
    'Polish': [r"Doktor", r"doktorski"],
    'Portuguese': [r"doutorado"],
    'Romanian': [r"doctorat"],
    'Slovakian': [r"PhDr"],
    'Slovenian': [r"doktorat znanosti"],
    'Swedish': [r"doktorsexamen"],
    'Turkish': [r"Doktora"],
    'Japanese': [r"哲学博士"],
    'Korean': [r"박사학위"],
    'Chinese_TR': [r"博士", r"哲學博士"],
    'Chinese_SP': [r"博士", r"哲学博士"],
}

In [296]:
education_name = [
    r"Phd",
    r"Ph\.D",
    r"DPhil",
    r"Doctor of Philosophy",
    ]

column_name = 'Phd'

dfs = add_edu_to_dfs(dfs, education_name, education_non_eng, column_name)

{'Australia': False    67
Name: Phd, dtype: int64, 'Austria': False    139
Name: Phd, dtype: int64, 'Belgium': False    63
True      1
Name: Phd, dtype: int64, 'Canada': False    4
Name: Phd, dtype: int64, 'Czech_Republic': False    68
Name: Phd, dtype: int64, 'Denmark': False    91
Name: Phd, dtype: int64, 'Finland': False    57
True      1
Name: Phd, dtype: int64, 'France': False    227
Name: Phd, dtype: int64, 'Germany': False    164
Name: Phd, dtype: int64, 'Greece': False    56
Name: Phd, dtype: int64, 'Hong_Kong': False    104
True       1
Name: Phd, dtype: int64, 'Hungary': False    86
Name: Phd, dtype: int64, 'Ireland': False    75
Name: Phd, dtype: int64, 'Israel': False    252
Name: Phd, dtype: int64, 'Italy': False    79
True      1
Name: Phd, dtype: int64, 'Japan': False    42
Name: Phd, dtype: int64, 'Luxembourg': False    40
Name: Phd, dtype: int64, 'Netherlands': False    40
Name: Phd, dtype: int64, 'New_Zealand': False    52
Name: Phd, dtype: int64, 'Norway': False    3

### Overview

In [297]:
dfs['Austria'].shape

(139, 109)

In [298]:
dfs['Austria'].columns

Index(['Company_name', 'Rating', 'Job_title', 'Seniority', 'Salary_min',
       'Salary_max', 'Salary_avg', 'Salary_currency',
       'Salary_employer_provided', 'Salary_hourly',
       ...
       'InsightSquared', 'Sisense', 'Dundas_BI', 'Domo', 'Looker', 'Excel',
       'Certificate', 'BA', 'MS', 'Phd'],
      dtype='object', length=109)

In [299]:
del column_name, tool_names, education_name, education_non_eng

### 31. Final cleanup

##### 31.1 Rename columns

In [300]:
for country, df in dfs.items():
    dfs[country] = df.rename({
        'Company_name': 'Name',
        'Job_title': 'Title',
        'Salary_min': 'Min',
        'Salary_max': 'Max',
        'Salary_avg': 'Avg',
        'Salary_currency': 'Currency',
        'Is_salary': 'Specified',
        'Salary_employer_provided': 'Employer_provided',
        'Salary_hourly': 'Is_hourly',
        'Alibaba_Cloud': 'Alibaba',
        'Oracle_Cloud': 'Oracle',
        'IBM_cloud': 'IBM',
        'Tencent_cloud': 'Tencent',
        'DigitalOcean_cloud': 'DigitalOcean',
        'Lincode_cloud': 'Lincode'
        }, axis=1)


In [301]:
dfs['United_States']['Alibaba'].value_counts()

False    241
Name: Alibaba, dtype: int64

##### 31.2 Change columns order

In [302]:
columns_multiindex = [
    ('Job_details', 'Title'),
    ('Job_details', 'Description'),
    ('Job_details', 'Seniority'),
    ('Job_details', 'City'),
    ('Job_details', 'State'),
    ('Job_details', 'Country'),
    ('Job_details', 'Region'),
    ('Job_details', 'Job_age'),
    ('Job_details', 'Easy_apply'),
    ('Salary', 'Min'),
    ('Salary', 'Max'),
    ('Salary', 'Avg'),
    ('Salary', 'Currency'),
    ('Salary', 'Employer_provided'),
    ('Salary', 'Is_hourly'),
    ('Salary', 'Specified'),
    ('Company_info', 'Name'),
    ('Company_info', 'Rating'),
    ('Company_info', 'Employees'),
    ('Company_info', 'Type_of_ownership'),
    ('Company_info', 'Sector'),
    ('Company_info', 'Industry'),
    ('Company_info', 'Company_age'),
    ('Company_info', 'Revenue_USD'),
    ('Company_info', 'Friend_recommend'),
    ('Company_info', 'CEO_approval'),
    ('Company_info', 'Career_opportunities'),
    ('Company_info', 'Comp_&_benefits'),
    ('Company_info', 'Senior_management'),
    ('Company_info', 'Work/Life_balance'),
    ('Company_info', 'Culture_&_values'),
    ('Company_info', 'Pros'),
    ('Company_info', 'Cons'),
    ('Company_info', 'Benefits_rating'),
    ('Company_info', 'Benefits_reviews'),
    ('Education', 'BA'),
    ('Education', 'MS'),
    ('Education', 'Phd'),
    ('Education', 'Certificate'),
    ('Version_control', 'Git'),
    ('Cloud_platforms', 'AWS'),
    ('Cloud_platforms', 'Microsoft_Azure'),
    ('Cloud_platforms', 'GPC'),
    ('Cloud_platforms', 'Alibaba'),
    ('Cloud_platforms', 'Oracle'),
    ('Cloud_platforms', 'IBM'),
    ('Cloud_platforms', 'Tencent'),
    ('Cloud_platforms', 'OVHcloud'),
    ('Cloud_platforms', 'DigitalOcean'),
    ('Cloud_platforms', 'Lincode'),
    ('RDBMS', 'PostgreSQL'),
    ('RDBMS', 'Microsoft_SQL_Server'),
    ('RDBMS', 'IBM_Db2'),
    ('RDBMS', 'MySQL'),
    ('RDBMS', 'Oracle_PL_SQL'),
    ('NOSQL', 'MongoDB'),
    ('NOSQL', 'Cassandra'),
    ('NOSQL', 'Amazon_DynamoDB'),
    ('NOSQL', 'Neo4j'),
    ('Search_&_Analytics', 'Apache_Solr'),
    ('Search_&_Analytics', 'Amazon_Redshift'),
    ('Search_&_Analytics', 'Google_BigQuery'),
    ('Search_&_Analytics', 'Snowflake'),
    ('Search_&_Analytics', 'Oracle_Exadata'),
    ('Search_&_Analytics', 'SAP_HANA'),
    ('Search_&_Analytics', 'Teradata'),
    ('Data_integration_and_processing', 'Informatica_PowerCenter'),
    ('Data_integration_and_processing', 'Databricks'),
    ('Data_integration_and_processing', 'Presto'),
    ('Stream_processing_tools', 'Apache_Kafka'),
    ('Stream_processing_tools', 'Apache_Flink'),
    ('Stream_processing_tools', 'Dataflow'),
    ('Workflow_orchestration_tools', 'Apache_Airflow'),
    ('Workflow_orchestration_tools', 'Luigi'),
    ('Workflow_orchestration_tools', 'SSIS'),
    ('Big_Data_processing', 'Apache_Hadoop'),
    ('Big_Data_processing', 'Apache_Hive'),
    ('Big_Data_processing', 'Apache_Spark'),
    ('OS', 'Linux'),
    ('OS', 'Unix'),
    ('OS', 'Windows'),
    ('OS', 'macOS'),
    ('Programming_languages', 'Python'),
    ('Programming_languages', 'R'),
    ('Programming_languages', 'Scala'),
    ('Programming_languages', 'Julia'),
    ('Programming_languages', 'SQL'),
    ('Programming_languages', 'Java'),
    ('Programming_languages', 'C++'),
    ('Programming_languages', 'Go'),
    ('Programming_languages', 'Rust'),
    ('Programming_languages', 'Bash'),
    ('Programming_languages', 'PowerShell'),
    ('Programming_languages', 'CLI'),
    ('Business_Intelligence_Tools', 'Tableau'),
    ('Business_Intelligence_Tools', 'Power_BI'),
    ('Business_Intelligence_Tools', 'Google_Analytics'),
    ('Business_Intelligence_Tools', 'QlikView'),
    ('Business_Intelligence_Tools', 'Oracle_BI_server'),
    ('Business_Intelligence_Tools', 'SAS_Analytics'),
    ('Business_Intelligence_Tools', 'Lumira'),
    ('Business_Intelligence_Tools', 'Cognos_Impromptu'),
    ('Business_Intelligence_Tools', 'MicroStrategy'),
    ('Business_Intelligence_Tools', 'InsightSquared'), 
    ('Business_Intelligence_Tools', 'Sisense'), 
    ('Business_Intelligence_Tools', 'Dundas_BI'),
    ('Business_Intelligence_Tools', 'Domo'), 
    ('Business_Intelligence_Tools', 'Looker'), 
    ('Business_Intelligence_Tools', 'Excel')
]

In [303]:
def move_column__to_index(df: pd.DataFrame, column_name: str, index: int):
    df.insert(index, column_name, df.pop(column_name))
    return df


def move_columns_to_index(df: pd.DataFrame, column_names: list[str], index: int):
    for col in column_names:
        df.insert(index, col, df.pop(col))
        index += 1

    return df

new_columns_order = [t[1] for t in columns_multiindex]
for country, df in dfs.items():
    dfs[country] = move_columns_to_index(df, new_columns_order, 0)


dfs['Austria'].dtypes

  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
  df.insert(index, col, df.pop(col))
 

Title           object
Description     object
Seniority       object
City            object
State          float64
                ...   
Sisense           bool
Dundas_BI         bool
Domo              bool
Looker            bool
Excel             bool
Length: 109, dtype: object

In [304]:
dfs['Austria'].columns[-1]

'Excel'

##### 31.3 Add multiindex

In [305]:
for country, df in dfs.items():
    dfs[country].columns = pd.MultiIndex.from_tuples(columns_multiindex)


In [306]:
dfs['Austria']['Company_info']['Name'].head()

9                Talentbase
10           Ratbacher GmbH
11    REWE International IT
12      Erste Group Bank AG
15               Talentbase
Name: Name, dtype: object

In [307]:
dfs['Austria']['Business_Intelligence_Tools']['Excel'].head()

9     False
10    False
11    False
12    False
15    False
Name: Excel, dtype: bool

In [308]:
dfs['United_States']['Programming_languages'].head()

Unnamed: 0,Python,R,Scala,Julia,SQL,Java,C++,Go,Rust,Bash,PowerShell,CLI
0,False,False,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False


#### 32. Concat countries to one dataframe 

In [309]:
df_all_countries = pd.concat(dfs.values())

df_all_countries.shape

(3094, 109)

#### 33. Reset the index

In [310]:
df_all_countries.reset_index(inplace=True, drop=True)
df_all_countries.tail()

Unnamed: 0_level_0,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Salary,...,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools
Unnamed: 0_level_1,Title,Description,Seniority,City,State,Country,Region,Job_age,Easy_apply,Min,...,SAS_Analytics,Lumira,Cognos_Impromptu,MicroStrategy,InsightSquared,Sisense,Dundas_BI,Domo,Looker,Excel
3089,Data Warehouse Engineer,"Decision Research Corporation (DRC), a softwar...",,Honolulu,HI,United States,North America,2,True,83000.0,...,False,False,False,False,False,False,False,False,False,False
3090,Data Engineer,Overview:\r\nAbout Wipro:\r\nWipro Limited (NY...,,Warren,MI,United States,North America,1,False,66000.0,...,False,False,False,False,False,False,False,False,False,False
3091,: Data Engineer,Title: Data Engineer (NO C2C/C2H/OPT)\r\nLocat...,,Saint Louis,MO,United States,North America,21,True,124800.0,...,False,False,False,False,False,False,False,False,False,False
3092,Data Engineer,"Job Title: Data Engineer\r\nLocation: Atlanta,...",,Atlanta,GA,United States,North America,1,True,120000.0,...,False,False,False,False,False,False,False,False,False,False
3093,Data Engineer,Position Details:\r\nTitle: Data Engineer\r\nI...,,Durham,NC,United States,North America,1,True,77000.0,...,False,False,False,False,False,False,False,False,False,False


#### 34. Save CSV

##### 34.1 Save

In [311]:
import os
from pathlib import Path
from scraper.config.get import get_config

config = get_config()

local_path = os.path.join(
    config['output_path']['main'],
    config['output_path']['clean'],
    "Data_Engineer"
    )

file_name = "Data_Engineer_15-04-2023.csv"
file_path = Path(f"{local_path}/{file_name}")

folder = os.path.dirname(file_path)
if not os.path.exists(folder):
    os.mkdir(folder)


df_all_countries.to_csv(file_path)

##### 34.2 Check save

In [312]:
df_check = pd.read_csv(file_path, index_col=0, header=[0, 1])
df_check.head()

Unnamed: 0_level_0,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Salary,...,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools
Unnamed: 0_level_1,Title,Description,Seniority,City,State,Country,Region,Job_age,Easy_apply,Min,...,SAS_Analytics,Lumira,Cognos_Impromptu,MicroStrategy,InsightSquared,Sisense,Dundas_BI,Domo,Looker,Excel
0,Data Engineer,Help us make a big green dent in the universe....,,Melbourne,,Australia,Oceania,7,True,,...,False,False,False,False,False,False,False,False,False,False
1,Data Engineer,"We’ve only just begun, but what a beginning. I...",,North Sydney,,Australia,Oceania,2,False,75932.0,...,False,False,False,False,False,False,False,False,False,False
2,"Expressions of Interest - Data Engineer, Data ...","At EY, you’ll have the chance to build a caree...",,Sydney,,Australia,Oceania,14,False,61017.0,...,False,False,False,False,False,False,False,False,False,False
3,Data Engineer,"Why 7-Eleven?\r\nWe're an agile, human centred...",,New South Wales,,Australia,Oceania,2,False,,...,False,False,False,False,False,False,False,False,False,False
4,Data Engineer,Overview\r\nThe Data Engineer delivers enterpr...,,Melbourne,,Australia,Oceania,17,False,67796.0,...,False,False,False,False,False,False,False,False,False,False


In [313]:
df_all_countries.head()

Unnamed: 0_level_0,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Job_details,Salary,...,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools,Business_Intelligence_Tools
Unnamed: 0_level_1,Title,Description,Seniority,City,State,Country,Region,Job_age,Easy_apply,Min,...,SAS_Analytics,Lumira,Cognos_Impromptu,MicroStrategy,InsightSquared,Sisense,Dundas_BI,Domo,Looker,Excel
0,Data Engineer,Help us make a big green dent in the universe....,,Melbourne,,Australia,Oceania,7,True,,...,False,False,False,False,False,False,False,False,False,False
1,Data Engineer,"We’ve only just begun, but what a beginning. I...",,North Sydney,,Australia,Oceania,2,False,75932.0,...,False,False,False,False,False,False,False,False,False,False
2,"Expressions of Interest - Data Engineer, Data ...","At EY, you’ll have the chance to build a caree...",,Sydney,,Australia,Oceania,14,False,61017.0,...,False,False,False,False,False,False,False,False,False,False
3,Data Engineer,"Why 7-Eleven?\r\nWe're an agile, human centred...",,New South Wales,,Australia,Oceania,2,False,,...,False,False,False,False,False,False,False,False,False,False
4,Data Engineer,Overview\r\nThe Data Engineer delivers enterpr...,,Melbourne,,Australia,Oceania,17,False,67796.0,...,False,False,False,False,False,False,False,False,False,False


In [314]:
df_check.shape == df_all_countries.shape

True