## Clean data

### 1. Importing all packages

In [1]:
# External
import os
import re
import numpy as np
import pandas as pd
from typing import Callable

### 2. Importing  a CSV file

In [2]:
def get_dfs_from_CSVs_in_folder(directory: str) -> dict[str, pd.DataFrame]:

    dfs = {}

    # https://regex101.com/r/QYuVDf/1
    pattern = r"Data_Engineer_([a-zA-Z_]+)_\d{2}-\d{2}-\d{4}_\d{2}-\d{2}.csv"

    for __, _, files in os.walk(directory):

        for file in files:
            if file.endswith('.csv'):
                match = re.search(pattern, file)
                if match:
                    country = match.group(1)
                    file_path = os.path.join(directory, file)
                    dfs[country] = pd.read_csv(file_path)

    return dfs

In [3]:
CSVs_folder = "data/RAW/Data Engineer"
dfs = get_dfs_from_CSVs_in_folder(CSVs_folder)
dfs

{'Austria':                          Company_name  Rating            Location  \
 0               Riverty Services GmbH     3.9              Vienna   
 1            Infineon Technologies AG     4.2             Villach   
 2            Infineon Technologies AG     4.2             Villach   
 3            Infineon Technologies AG     4.2             Villach   
 4            Infineon Technologies AG     4.2             Villach   
 ..                                ...     ...                 ...   
 865             Infineon Technologies     4.2             Villach   
 866                 Wien Energie GmbH     4.3  Gerasdorf bei Wien   
 867  Trenkwalder Personaldienste GmbH     3.9             Austria   
 868              Silicon Austria Labs     3.6             Villach   
 869                   Noir Consulting     5.0              Vienna   
 
                                              Job_title  \
 0               Process Manager (m/f/d) 80% Homeoffice   
 1    Senior Staff Engineer D

In [4]:
dfs['Austria']

Unnamed: 0,Company_name,Rating,Location,Job_title,Description,Salary,Job_age,Easy_apply,Employees,Type_of_ownership,...,CEO_approval,Career_opportunities,Comp_&_benefits,Culture_&_values,Senior_management,Work/Life_balance,Pros,Cons,Benefits_rating,Benefits_reviews
0,Riverty Services GmbH,3.9,Vienna,Process Manager (m/f/d) 80% Homeoffice,"Everything we do, starts with you.\r\nTogether...",,30d+,False,5001 to 10000,Company - Private,...,1.00,3.7,3.2,3.9,3.5,4.0,"['""Good colleagues and overall atmosphere"" (in...","['""New strategy process is taking ages and peo...",4.0,
1,Infineon Technologies AG,4.2,Villach,Senior Staff Engineer Digital Verification (f/...,You are looking for a new challenge to bring i...,,25d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
2,Infineon Technologies AG,4.2,Villach,Senior Staff Engineer Product Development for ...,Do you want to get to know the development of ...,,18d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
3,Infineon Technologies AG,4.2,Villach,Component Verification and Product Characteriz...,You are looking for a new challenge to bring i...,,25d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
4,Infineon Technologies AG,4.2,Villach,Product Application Engineer (f/m/div)*,"You enjoy working in an international team, in...",,26d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,Infineon Technologies,4.2,Villach,International Graduate Program: Senior Enginee...,Are you ready to start your career in a fast-m...,,30d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
866,Wien Energie GmbH,4.3,Gerasdorf bei Wien,Business Intelligence Specialist (w/m/d),Business Intelligence Specialist (w/m/d)\r\n==...,,25d,False,,,...,1.00,3.9,3.8,3.7,3.5,4.4,['No Pros have been reported by the Glassdoor ...,"['""Zu viele low performer, welche in die Pensi...",,
867,Trenkwalder Personaldienste GmbH,3.9,Austria,C++ Entwickler (m/w/d) mit Schwerpunkt Machine...,Unser Kunde (KV EMG) ist ein international tät...,,11d,False,10000+,Company - Private,...,1.00,3.5,3.1,3.8,3.5,3.7,"['""ilfe balance and career growth"" (in 3 revie...","['""Horrible communication between departments,...",,
868,Silicon Austria Labs,3.6,Villach,Cleanroom Process Engineer - Wet Processing (f...,Your future responsibilities\r\nThe process en...,,30d+,False,1 to 50,Self-employed,...,,3.2,3.2,3.5,3.4,3.9,"['""Good work environment."" (in 4 reviews)', '""...","['""Poor management."" (in 3 reviews)', '""there ...",,


In [5]:
del CSVs_folder

### 3. Remove rows only with NaNs

In [6]:

def process_dfs(dfs: dict[str, pd.DataFrame], method: Callable):
    differences = {}

    for country_name, country_df in dfs.items():

        country_df_before = country_df
        country_df: pd.DataFrame = method(country_df)

        is_difference = country_df_before.shape[0] != country_df.shape[0]

        if is_difference:
            differences[country_name] = (
                country_df_before.shape[0], 
                country_df.shape[0]
                )


    return differences

In [7]:
process_dfs(dfs=dfs, method = lambda x: x.dropna(how='all'))

{}

### 4. Remove duplicates

In [8]:
process_dfs(dfs=dfs, method = lambda x: x.drop_duplicates())

{'Austria': (870, 93),
 'Belgium': (810, 165),
 'Czech_Republic': (870, 121),
 'Denmark': (600, 119),
 'Finland': (285, 230),
 'France': (900, 137),
 'Germany': (900, 286),
 'Greece': (720, 119),
 'Hungary': (900, 140),
 'Ireland': (900, 364),
 'Israel': (900, 120),
 'Italy': (600, 143),
 'Luxembourg': (390, 126),
 'Netherlands': (900, 193),
 'Norway': (391, 317),
 'Poland': (420, 131),
 'Portugal': (900, 209),
 'Romania': (900, 131),
 'Spain': (690, 198),
 'Sweden': (900, 135),
 'Switzerland': (900, 159),
 'Turkey': (900, 136),
 'United_Kingdom': (900, 130),
 'United_States': (900, 220)}

There is huge amount of duplicates.

### 5. Remove empty columns

In [9]:
process_dfs(dfs=dfs, method = lambda x: x.dropna(axis=1, how='all'))

{}

#### 6. Split `Location` column into `Region`, `Country`, `State` and `City`.

##### 6.1 Add columns

In [10]:
# todo difference languages

def apply_locations(df, country, region):

    # All those fancy, pansy names for the remote
    remote = [
        "Home office",
        "Telecommute",
        "Virtual office",
        "Off-site",
        "Work from anywhere",
        "Distributed team",
        "Location-independent",
        "Mobile workforce",
        "Cloud office",
        "Online workspace",
        "Digital nomad",
        "Flexible location",
        "Anywhere office",
        "Distance work",
        "Virtual workplace",
        "Mobile office",
        "Roaming job",
        "Borderless office",
        "Satellite office",
        "Remote-enabled",
        "Work from afar"
    ]

    df = dfs[country]

    df['Location'] = df['Location'].apply(lambda location: np.nan if location is not np.nan and location.strip() == "" else location)
    df['Location'] = df['Location'].apply(lambda location: "Remote" if location is not np.nan and location.lower() in remote else location)

    df['City'] = df['Location'].apply(lambda location: location.split(',')[0] if location is not np.nan and "," in location else location)
    df['State'] = df['Location'].apply(lambda location: location.split(',')[1] if location is not np.nan and "," in location else np.nan)

    df['Country'] = country
    df['Region'] = region

    return df

for country_name, country_df in dfs.items():

    region = "Europe"
    if country_name in ["United_States", "Canada"]:
        region = "North America"

    dfs[country_name] = apply_locations(country_df, country_name, region)

In [11]:
dfs['Austria'].iloc[: , -4:30].head()

Unnamed: 0,City,State,Country,Region
0,Vienna,,Austria,Europe
1,Villach,,Austria,Europe
2,Villach,,Austria,Europe
3,Villach,,Austria,Europe
4,Villach,,Austria,Europe


In [12]:
dfs['United_States'].iloc[: , -4:30].head()

Unnamed: 0,City,State,Country,Region
0,Santa Clara,CA,United_States,North America
1,Dearborn,MI,United_States,North America
2,Santa Clara,CA,United_States,North America
3,Newton,MS,United_States,North America
4,Remote,,United_States,North America


##### 6.2 Remove `Location` column

In [13]:

for df in dfs.values():
    del df['Location']

In [14]:
dfs['Austria'].columns

Index(['Company_name', 'Rating', 'Job_title', 'Description', 'Salary',
       'Job_age', 'Easy_apply', 'Employees', 'Type_of_ownership', 'Sector',
       'Founded', 'Industry', 'Revenue_USD', 'Friend_recommend',
       'CEO_approval', 'Career_opportunities', 'Comp_&_benefits',
       'Culture_&_values', 'Senior_management', 'Work/Life_balance', 'Pros',
       'Cons', 'Benefits_rating', 'Benefits_reviews', 'City', 'State',
       'Country', 'Region'],
      dtype='object')

In [15]:
dfs['United_States'].columns == dfs['Austria'].columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

### 7. Clean jobs to only revenant ones

In [16]:
def is_data_engineering_job(job_title: str, specs_variants: list[str] | list[list[str]] = [], terms_variants: list[str] | list[list[str]]= []):

    def flatten(_list):
        """
        Flatten a list of any dimension to a one dimensional list.
        """
        result = []
        for item in _list:
            if isinstance(item, list):
                result.extend(flatten(item))
            else:
                result.append(item)
        return result
    
    specs_variants = flatten(specs_variants)
    terms_variants = flatten(terms_variants)

    specializations = ["Engineer", "Engineering", "Consultant", "architect", "SPECIALIST", "Manager", "Developer", "Architecture", "Administrator", "Head of", "Lead"] + specs_variants

    # We try to not take into account back-end, fullstack, they are somehow different domain, but sometimes in some companies they are data engineers
    data_terms = ["Data", "ETL", "Cloud", "Analytical", "Analytics", "BI", "Buisness Intelligence", "Buisness Analytics", "Database", "Pipeline", "Metadata", "Monitoring", "Datacenter"] + terms_variants

    any_in_specs = isinstance(job_title, str) and any(isinstance(spec, str) and spec.lower() in job_title.lower() for spec in specializations)

    any_in_terms = isinstance(job_title, str) and any(isinstance(term, str) and term.lower() in job_title.lower() for term in data_terms)

    invalid = ["Mobile engineer", "Biomedical Engineer", "Engineering Geologist", "Geotechnical Engineer", "Electrical Engineer", "Project Manager", "Quality Engineer", "Mechanical Engineer", "Mechanical-Design Engineer"]
    
    is_valid = isinstance(job_title, str) and not any(isinstance(term, str) and term.lower() in job_title.lower() for term in invalid)

    return any_in_specs and any_in_terms and is_valid

In [17]:
def show_unique_and_its_len(df: pd.Series):
    print(f"{len(df.unique())} :\n{df.unique()}")


##### 7.1 Austria

In [18]:
selector = 'Austria'

# German Belgian, German Austrian, German Swiss, German Luxembourgian, German Alsatian
specializations_DE = ["Ingenieur", "Berater", "Architekt", "SPEZIALIST", "Manager", "Entwickler", "Architektur", "Architecture", "Administrator"]
data_terms_DE = ["Daten", "ETL", "Cloud", "Analytisch", "Analytics", "BI", "Business Intelligence", "Business Analytics", "Datenbank", "Pipeline", "Metadaten", "Überwachung", "Rechenzentrum", "Datenzentrum"]

In [19]:
show_unique_and_its_len(dfs[selector]['Job_title'])

91 :
['Process Manager (m/f/d) 80% Homeoffice'
 'Senior Staff Engineer Digital Verification (f/m/div)*'
 'Senior Staff Engineer Product Development for GaN-based ICs (f/m/div)*'
 'Component Verification and Product Characterization Engineer (f/m/div)*'
 'Product Application Engineer (f/m/div)*'
 'Freelance Hardware/ Data Centre Field Engineer'
 'IT Solution Engineer (m/w/d)' 'Data Analyst & -Engineer (m/w/d)'
 'Android Mobile Developer' 'Sr. Software Development Engineer – C++'
 'Data Engineer, Analytics (m/f/x)' 'Data Engineer (m/w/d)'
 'Data Engineer im Bereich Data Integration (m/w/d)'
 'Data Platform Engineer (m/f/x)'
 'Head of Formulation Development (m/f/d)' 'Full Stack Engineer (m/f/x)'
 'Data Engineer' 'Data Engineer (m/f/d) - maternity cover 1yr.'
 'Data Engineer, CRM (m/f/x)' 'Data Engineer*'
 'Maschinenbautechniker (m/w/d) / Data Engineer (m/w/d)'
 'Backend Engineer (m/f/x)' 'DATABASE ADMINISTRATOR / DATA ENGINEER'
 'System Engineer (m/w/d)' 'DevOps Engineer'
 'Data Warehous

In [20]:
df = dfs[selector]
filtered_df = df[df.apply(lambda row: is_data_engineering_job(row['Job_title'], specializations_DE, data_terms_DE), axis=1)]


In [21]:
show_unique_and_its_len(filtered_df['Job_title'])

29 :
['Freelance Hardware/ Data Centre Field Engineer'
 'Data Analyst & -Engineer (m/w/d)' 'Android Mobile Developer'
 'Data Engineer, Analytics (m/f/x)' 'Data Engineer (m/w/d)'
 'Data Engineer im Bereich Data Integration (m/w/d)'
 'Data Platform Engineer (m/f/x)' 'Data Engineer'
 'Data Engineer (m/f/d) - maternity cover 1yr.'
 'Data Engineer, CRM (m/f/x)' 'Data Engineer*'
 'Maschinenbautechniker (m/w/d) / Data Engineer (m/w/d)'
 'DATABASE ADMINISTRATOR / DATA ENGINEER'
 'Data Warehouse DevOps Engineer (f/m/x)'
 'Data Scientist / Machine Learning Engineer (m/w/d)'
 'Data Virtualization (Denodo) Engineer (f/m/x)'
 '(Senior) Data Engineer (m/w/d) (Remote innerhalb von...'
 'Senior Data Engineer (m/f/x)' 'Senior Data Engineer | Europe | Remote'
 'Junior - Data Engineer (m/f/x)'
 'Development Engineer and Data Analyst (w/m/d)'
 'Business Intelligence Specialist (w/m/d)'
 'Data Engineer – Python and SQL Specialist (m/f/d)'
 'System Engineer – Monitoring & Analytics (m/f/d)'
 'Senior Data En

In [22]:
dfs[selector] = df

##### 7.2 Belgium

In [23]:
selector = 'Belgium'

# Dutch NL, Dutch BE
specializations_NL = ["SPECIALIST", "Engineering", "Manager", "architect", "Beheerder", "Ingenieur", "Adviseur", "Ontwikkelaar", "Architect", "Techniek", "Architectuur"]
data_terms_NL = ["Business Analytics", "Monitoring", "Leiding", "Metadata", "Cloud", "Business Intelligence", "Data", "Analytisch", "BI", "Database", "Datacenter", "Pipeline", "Analytics", "Bedrijfsanalyse", "Bedrijfsinformatie", "ETL"]

# French, Canadian French, Swiss French, Belgian French
specializations_FR = ["Architecte", "Développeur", "architecte", "SPECIALISTE", "SPÉCIALISTE", "Consultant", "Conseiller", "Ingénieur", "Administrateur", "Architecture", "Manager", "Ingénierie"]
data_terms_FR = ["Analyse", "Cloud", "Business Intelligence", "Analytique", "BI", "Analytique d'affaires", "Analyse de données", "Surveillance", "Métadonnées", "Données", "Pipeline", "Base de données", "Business Analytics", "Analytics", "Centre de données", "ETL", "Data", "Analyse d'affaires", "Intelligence d'affaires"]

In [24]:
show_unique_and_its_len(dfs[selector]['Job_title'])

136 :
['Data Engineer (met affiniteit voor Data Science)' 'Data Engineer'
 'Data engineer / ETL consultant' 'Medior Data Protection Engineer'
 'data manager' 'AWS Data Engineer' 'Azure Data Engineer'
 'Project controls engineer' 'DevOps Engineer (Data)'
 'Mechanical Design Engineer (Hudson Sharp)'
 'Maintenance Reliability Engineer' 'Junior Integration Engineer'
 'Datacenter Engineer' 'District Engineer' 'Azure Data Platform Engineer'
 'Computer vision and image recognition data scientist' 'data analist'
 'Regional Product Engineer' 'Security Engineer | Last step before CISO'
 'Security Engineer | GreenTech Company'
 'Field and solutions engineer IOT'
 'Data Engineer | Digital Manufacturing Platform' 'Senior data engineer'
 'Application Controls Engineer' 'Expert Talent Advisor'
 'Reliability Engineer' 'Developer' 'Signalling Design Engineer'
 'Data Migration Consultant' 'Production Engineer' 'DataOps Engineer'
 'DevOps engineer' 'Quality & Compliance Engineer'
 'Engineer Elektriciteit

In [25]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_FR, specializations_NL, specializations_DE],
        [data_terms_FR, data_terms_NL, data_terms_DE]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

41 :
['Data Engineer (met affiniteit voor Data Science)' 'Data Engineer'
 'Data engineer / ETL consultant' 'Medior Data Protection Engineer'
 'data manager' 'AWS Data Engineer' 'Azure Data Engineer'
 'DevOps Engineer (Data)' 'Maintenance Reliability Engineer'
 'Datacenter Engineer' 'Azure Data Platform Engineer'
 'Data Engineer | Digital Manufacturing Platform' 'Senior data engineer'
 'Reliability Engineer' 'Data Migration Consultant' 'DataOps Engineer'
 'B2B Data Engineer' 'Cloud system engineer'
 'R&D Professional – Power Systems / Data Engineer'
 'Project Engineer Sustainability' 'Data Platform Integration Engineer'
 'PRODUCTION DATA ENGINEER' 'DATA ENGINEER' 'System Engineer Datacenter'
 'Cloud solution architect'
 'DATA ENGINEER VOOR SOFWAREBEDRIJF BINNEN LOGISTIEKE SECTOR'
 'Data Integrations Engineer' 'DATA & APPLICATION SPECIALIST'
 'Industrial Data Engineer' 'Analytical Maintenance Engineer'
 'DATA ENGINEER & MODELER' 'E2E DATABASE ENGINEER' 'Cloud Data Engineer'
 'Data Engine

In [26]:
dfs[selector] = df

##### 7.3 Canada

In [27]:
selector = 'Canada'

In [28]:
show_unique_and_its_len(dfs[selector]['Job_title'])

11 :
['BI Engineer' 'Analytics Engineer' 'Senior Analytics Engineer'
 'Analytics Implementation Engineer'
 'Financial Engineer/ACM Business Analyst'
 'Business Analyst & Process Engineer'
 'Analyst/Senior Associate, AML - Financial Engineering & Modeling (FEM)'
 'Capital & Maintenance Program Analyst'
 'Business Analyst Engineering Budget Planning'
 'Analytics Engineering Manager'
 'Senior Manager, Project Delivery Business Analyst, Retail & Small Business (Contract)']


In [29]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_FR],
        [data_terms_FR]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

5 :
['BI Engineer' 'Analytics Engineer' 'Senior Analytics Engineer'
 'Analytics Implementation Engineer' 'Analytics Engineering Manager']


In [30]:
dfs[selector] = df

##### 7.4 Czech_Republic

In [31]:
selector = 'Czech_Republic'

specializations_CZ = ["Inženýr", "Konstruktér", "Architekt", "SPECIALISTA", "Manažer", "Vývojář", "Architektura", "Správce"]
data_terms_CZ = ["Data", "ETL", "Cloud", "Analytický", "Analytika", "BI", "Business Intelligence", "Business Analytics", "Databáze", "Pipeline", "Metadata", "Monitorování", "Datacentrum"]

specializations_SK = ["Inžinier", "Konzultant", "architekt", "SPECIALISTA", "Manažér", "Vývojár", "Architektúra", "Správca"]
data_terms_SK = ["Data", "ETL", "Cloud", "Analytický", "Analytika", "BI", "Business Intelligence", "Business Analytics", "Databáza", "Pipeline", "Metadata", "Monitorovanie", "Datacentrum"]

specializations_HU = ["Mérnök", "Mérnöki", "Tanácsadó", "építész", "SZAKÉRTŐ", "Menedzser", "Fejlesztő", "Architektúra", "Rendszergazda"]
data_terms_HU = ["Adat", "ETL", "Felhő", "Elemző", "Analitika", "BI", "Üzleti Intelligencia", "Üzleti Analitika", "Adatbázis", "Csővezeték", "Metadaták", "Monitoring", "Adatközpont"]


In [32]:
show_unique_and_its_len(dfs[selector]['Job_title'])

97 :
['IT Desktop Support Engineer Technician'
 'Cloud Services Engineer (Based in Germany)' 'Technical Systems Manager'
 'Cloud Engineer (f/m/d)' 'Data Engineer'
 'Senior Data Engineer - Ingestion'
 'Field Service Engineer Austria & Hungary & Slovakia & Czechia'
 'Part-time - Junior Data Analytics Engineer'
 'Junior Data Engineer - AI & Data tým'
 'Data Processing and Automation-Development Engineer (f/m/d)'
 'Data Engineer - Part time for a student' 'Technolog/Industrial Engineer'
 'Design Engineer' 'DATA ENGINEER' 'Data QA Engineer'
 'Catia V5 Design Engineer (m/ž) - projekt. řízení automotive'
 'Senior Quality Engineer' 'Junior SAS Engineer' 'ML Engineer - peoly'
 'Big data engineer' 'SWE - Full Stack Engineer - CoreOS'
 'Staff Data Engineer' 'Junior Software Engineer'
 'Strojní Engineer (procesy a technologie)'
 'Test and application engineer' 'Sr Data Engineer' 'Application Engineer'
 'L1 Support Engineer with English 24x7'
 'Junior Machine Learning Engineer - Part time for a stu

In [33]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_FR, specializations_NL, specializations_DE, specializations_HU],
        [data_terms_FR, data_terms_NL, data_terms_DE, data_terms_HU]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

32 :
['Cloud Services Engineer (Based in Germany)' 'Cloud Engineer (f/m/d)'
 'Data Engineer' 'Senior Data Engineer - Ingestion'
 'Part-time - Junior Data Analytics Engineer'
 'Junior Data Engineer - AI & Data tým'
 'Data Processing and Automation-Development Engineer (f/m/d)'
 'Data Engineer - Part time for a student' 'DATA ENGINEER'
 'Data QA Engineer' 'Big data engineer' 'Staff Data Engineer'
 'Sr Data Engineer' 'Data Analytics Engineer'
 'Global Azure Data Architect (Engineer)' 'DATA ENGINEER (BIOTECH)'
 'Data Engineer & Process Mining Architect' 'Big Data Engineer Junior'
 'IT Specialist - Data Engineer'
 'Global Support Engineer – Mobile Data Network Czechia'
 'Big Data DevOps Engineer Junior' 'Senior Data Engineer'
 'Data Engineer - Engineering & Manufacturing'
 'evergreen Public 360° Senior Cloud Data & AI Engineer'
 'Data Analyst/ Machine Learning Engineer' 'Data Engineer/Scientist'
 'Data Engineer - ML Ops' 'Software Engineer - Data Platform'
 'Data Engineer - ETL Developer (S

In [34]:
dfs[selector] = df

##### 7.5 Denmark

In [35]:
selector = 'Denmark'

specializations_DK = ["Ingeniør", "Konsulent", "Arkitekt", "SPECIALIST", "Manager", "Udvikler", "Arkitektur", "Administrator"]

data_terms_DK = ["Data", "ETL", "Cloud", "Analytisk", "Analyse", "BI", "Forretningsanalyse", "Database", "Pipeline", "Metadata", "Overvågning", "Datacenter"]

In [36]:
show_unique_and_its_len(dfs[selector]['Job_title'])

89 :
['Data Scientist / Machine Learning Engineer, FinTech Remote'
 'Lead Instrument Engineer' 'Lead Structural Engineer' 'Data engineer'
 'Lead Electrical Engineer' 'Data Engineer' 'Associate Data Engineer'
 'Data Platform Engineer' 'Operations Engineer til Big Data'
 'Student Data Engineer' 'Data Engineer for Telia'
 'Forward-deployed Data Engineer' 'Experienced Data Engineer'
 'Junior Backend Engineer' 'Data Engineer - QuantumBlack'
 'Data Center Engineer' 'Analytics Engineer'
 'Senior Data & Automation Engineer HR Technology'
 'Business Data Engineer'
 'Integrated Logistics Support Engineer, Lystrup or Søborg'
 'Software Engineer' 'Machine Learning Engineer'
 'DATA ENGINEER, PROCESS DATA AND ANALYTICS, NOVOZYMES'
 'Senior Data Engineer in SuperAI' 'Lead Machine Learning Engineer'
 'Data/Software Engineer' 'Lead Data Engineer' 'QAQC Turnover Engineer'
 'Sr Machine Learning Research Engineer @ Corti'
 'Data Engineer til bekæmpelse af skatteunddragelse'
 'Machine Learning Engineer Int

In [37]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DK],
        [data_terms_DK]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

42 :
['Data Scientist / Machine Learning Engineer, FinTech Remote'
 'Data engineer' 'Data Engineer' 'Associate Data Engineer'
 'Data Platform Engineer' 'Operations Engineer til Big Data'
 'Student Data Engineer' 'Data Engineer for Telia'
 'Forward-deployed Data Engineer' 'Experienced Data Engineer'
 'Data Engineer - QuantumBlack' 'Data Center Engineer'
 'Analytics Engineer' 'Senior Data & Automation Engineer HR Technology'
 'Business Data Engineer'
 'DATA ENGINEER, PROCESS DATA AND ANALYTICS, NOVOZYMES'
 'Senior Data Engineer in SuperAI' 'Data/Software Engineer'
 'Lead Data Engineer' 'Data Engineer til bekæmpelse af skatteunddragelse'
 'SAP Data Lifecycle Engineer'
 'Cloud Engineer for our Computer Vision and MLOps area'
 'Data Solutions Engineer'
 'Data Specialist, Snowflake Engineer - Copenhagen:'
 'Data Engineers and Data Analysts for Copenhagen' 'Senior Data Engineer'
 '(Senior) Data Engineer, Data Engineering & Bioinformatics'
 'Data Engineer with an interest in creating value fro

In [38]:
dfs[selector] = df

##### 7.6 Finland

In [39]:
selector = 'Finland'

specializations_FI = ["Insinööri", "Konsultti", "Arkkitehti", "ERITYISOSAAMINEN", "Johtaja", "Kehittäjä", "Arkkitehtuuri", "Ylläpitäjä"]

data_terms_FI = ["Data", "ETL", "Pilvi", "Analytiikka", "BI", "Liiketoiminta-analytiikka", "Tietokanta", "Putkisto", "Metatiedot", "Seuranta", "Tietokeskus"]

In [40]:
show_unique_and_its_len(dfs[selector]['Job_title'])

200 :
['Senior Software Engineer, React' 'SOFTWARE ENGINEER' 'Data Engineer'
 'Grow as Data Engineer with us - Solita´s personalised onboarding program'
 'Data Engineer (Azure)' 'Backend engineer (Data science interest)'
 'DATA ENGINEER' 'Data Engineers' 'Forward-deployed Data Engineer'
 'Data Engineer (Mid/Senior experience) - Fluent Finnish required'
 'Sr Data Engineer' 'Data Engineer Helsinkiin' 'Data Integration Engineer'
 'Consumer Data Specialist' 'Data Engineer - End-user Development'
 'Data Center Critical facilities Engineer II' 'Desktop Engineer'
 'Senior Data Engineer, Netvisor' 'Network Engineer'
 'Software Engineer - Data Platform'
 'Golang System Software Engineer - Containers / Virtualisation'
 'Software Engineer - App Stores Backend (Remote)'
 'Software Engineer - Launchpad' 'Field Applications Engineer'
 'Senior Software Engineer (PHP) - Connectors Engineering' 'R&D Engineer'
 'Junior Software Engineer' 'Senior Data Engineer'
 'Software Engineer (node.js)' 'Remote Seni

In [41]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_FI],
        [data_terms_FI]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

46 :
['Data Engineer'
 'Grow as Data Engineer with us - Solita´s personalised onboarding program'
 'Data Engineer (Azure)' 'Backend engineer (Data science interest)'
 'DATA ENGINEER' 'Data Engineers' 'Forward-deployed Data Engineer'
 'Data Engineer (Mid/Senior experience) - Fluent Finnish required'
 'Sr Data Engineer' 'Data Engineer Helsinkiin' 'Data Integration Engineer'
 'Consumer Data Specialist' 'Data Engineer - End-user Development'
 'Data Center Critical facilities Engineer II'
 'Senior Data Engineer, Netvisor' 'Software Engineer - Data Platform'
 'Senior Data Engineer' 'Lead Data Engineer for K-Group'
 'Lead Data Scientist/Engineer' 'Cloud Data Engineer'
 'Azure Data Engineer'
 'Software Engineer (experience in computer security) Abu Dhabi'
 'Data Integration Engineer Confluent & Kafka - Tietoevry Transform'
 'Azure Data Platform Specialist / Azure Data Engineer'
 'Data Engineer, Tietohallinto, Helsinki' 'Cloud Platform Engineer'
 'Site Reliability Engineer'
 'Senior Observabili

In [42]:
dfs[selector] = df

##### 7.7 France

In [43]:
selector = 'France'

specializations_CA = ["Enginyer", "Enginyeria", "Consultor", "arquitecte", "ESPECIALISTA", "Gerent", "Desenvolupador", "Arquitectura", "Administrador"]

data_terms_CA = ["Dades", "ETL", "Núvol", "Analític", "Anàlisi de dades", "BI", "Intel·ligència de negocis", "Anàlisi de negocis", "Base de dades", "Pipeline", "Metadades", "Monitorització", "Centre de dades"]

specializations_IT = ["Ingegnere", "Consulente", "architetto", "SPECIALISTA", "Manager", "Sviluppatore", "Architettura", "Amministratore"]

data_terms_IT = ["Dati", "ETL", "Cloud", "Analitico", "Analytics", "BI", "Business Intelligence", "Business Analytics", "Database", "Pipeline", "Metadati", "Monitoraggio", "Centro dati"]

specializations_BASQ = ["Ingeniaria", "Ingeniaritza", "Konsultore", "arkitektoa", "ESPECIALISTA", "Kudeatzailea", "Garatzailea", "Arkitektura", "Administratzailea"]

data_terms_BASQ = ["Datuak", "ETL", "Cloud", "Analitikoa", "Analitika", "BI", "Negozioaren Inteligentzia", "Negozioaren Analitika", "Datubasea", "Pipeline-a", "Metadatuak", "Monitoreo", "Datuen zentroa"]

In [44]:
show_unique_and_its_len(dfs[selector]['Job_title'])

116 :
[nan 'CDD - DATA ENGINEER F/H' 'Data Analytics Leader'
 'Manager QA Testing Automation H/F'
 'Software engineer fullstack - Lancement nouveau produit H/F'
 'DATA ENGINEER (H/F)' 'Data Engineer (H/F)' 'Data Engineer H/F'
 'Cybersecurity Auditor'
 'International Trainee Program: Process Engineer - Semiconductor Industry'
 'Application Engineer' 'System Engineer for advanced projects'
 'Fullstack Java/JS Developer w/m'
 'Junior Process Engineer in Semiconductor Industry - Grenoble (F) / Dresden (D) / Austin (USA)'
 'Tech lead | CDI | H/F' 'Ingénieur Support niveau 2' 'Data scientist'
 'Data engineer - data factory (H/F)'
 'PROJECT MECHANICAL ENGINEER M/F - INGENIEUR CHEF DE PROJET H/F'
 'Data Scientist / Data Engineer confirmé - F/H'
 'Packaging Implementation Project Leader (H/F)'
 'Développeur Full Stack Senior | CDI | H/F' 'Field Service Engineer H/F'
 'Cloud Data Engineer (H/F) - POEI'
 "Project QA/QC Engineer / Ingénieur(e) qualité d'études et travaux"
 'Chef de projet Digital 

In [45]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_FR, specializations_DE, specializations_CA,specializations_IT, specializations_BASQ],
        [data_terms_FR,data_terms_DE, data_terms_CA, data_terms_IT, data_terms_BASQ]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

72 :
['CDD - DATA ENGINEER F/H' 'Data Analytics Leader' 'DATA ENGINEER (H/F)'
 'Data Engineer (H/F)' 'Data Engineer H/F'
 'Data engineer - data factory (H/F)'
 'Data Scientist / Data Engineer confirmé - F/H'
 'Cloud Data Engineer (H/F) - POEI' 'Data Engineer (F/H)'
 'Data architect - data factory (H/F)' 'Head of Data Analytics (H/F)'
 'Lead Data Analyst (H / F)' 'CDI - Data Engineer (Média) (F/H)'
 'CDI - Développeur PYTHON / Data engineer (Media) H/F'
 'CDI - Cloud Data Engineer (Média) F/H' 'Data engineer - H/F'
 'Data Engineer - H/F' 'CDI - Data Engineer (Industry) (F/H)'
 'Data Engineer Snowflake F/H' 'DATA ENGINEER (F/H)'
 'Responsable Data engineer' 'Data Engineer confirmé (H/F)'
 'Alternant - 1 an - Developer Data Engineer - Sensitivities F/H'
 'Data engineer manufacturing (H/F)'
 'Alternance - 24 mois - Data Engineer F/H - Paris'
 'DATA ENGINEER JUNIOR (F/H)'
 'Alternance Software Engineer C/C++ Data structure - Lyon (F/H)'
 'Développeur Big Data (F/H)' 'DATA SCIENTIST ML ENGIN

In [46]:
dfs[selector] = df

##### 7.8 Germany

In [47]:
selector = 'Germany'

In [48]:
show_unique_and_its_len(dfs[selector]['Job_title'])

248 :
['Data Engineer (w/m/d)' 'Senior Data Engineer (m/f/d)'
 'RNA Data Engineer (w/m/d)'
 'Full Stack Developer .NET Core / Angular (w/m/d)'
 'Senior Data Analyst (w/m/d) - onsite or remote / home office'
 'Senior Cloud - Data-Engineer (m/w/d)'
 'Data Engineer (m/w) in der Schweiz'
 'Lead Analytics Engineer / BI Engineer (m/f/d)'
 'Senior DevOps Engineer (m/w/d) für Dortmund, Hannover oder remote'
 'Site Reliability Engineer (f/m/d)'
 'IT Data Center Infrastructure Engineer (m/f/d)'
 'Data Engineer (w/m/d) Automotive Testing Unit'
 '(Senior) Data Engineer (w/m/d) – Marketing & Communications'
 'DevOps Engineer (m/w/d)'
 'Data Warehouse Architekt / Entwickler / Data Engineer (m/w/d)'
 'Senior Full Stack Developer .NET Core / Angular (w/m/d)'
 'Technical Data Engineer Material (d/m/w) für Airbus'
 'Research Software Engineer/Developer (f/m/d) for an HPC Particle in Cell Code'
 'SENIOR DATA ENGINEER (M/W/D)' 'Senior ML/AI Engineer (m/w/d)'
 'IT Data Center Support Engineer (m/w/d)'
 'Av

In [49]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DE, specializations_DK,specializations_NL],
        [data_terms_DE, data_terms_DK,data_terms_NL]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

146 :
['Data Engineer (w/m/d)' 'Senior Data Engineer (m/f/d)'
 'RNA Data Engineer (w/m/d)' 'Senior Cloud - Data-Engineer (m/w/d)'
 'Data Engineer (m/w) in der Schweiz'
 'Lead Analytics Engineer / BI Engineer (m/f/d)'
 'Site Reliability Engineer (f/m/d)'
 'IT Data Center Infrastructure Engineer (m/f/d)'
 'Data Engineer (w/m/d) Automotive Testing Unit'
 '(Senior) Data Engineer (w/m/d) – Marketing & Communications'
 'Data Warehouse Architekt / Entwickler / Data Engineer (m/w/d)'
 'Technical Data Engineer Material (d/m/w) für Airbus'
 'SENIOR DATA ENGINEER (M/W/D)' 'IT Data Center Support Engineer (m/w/d)'
 'Data Engineer (m/w/d) Big Data | Python | Azure in Ulm'
 'Manager Data Engineering' 'Senior Associate Data Engineering'
 'Data Scientist / Feature Engineer (m/w/d)'
 '(Senior) Data Engineer (m/f/d) 80% Homeoffice'
 'Technical Data Engineer (d/m/w) für Airbus'
 'Software Entwicklerin / Engineering (m/w/d) Python | Big Data in Ulm'
 'Data Warehouse Engineer (m/w/d)' 'BI Engineer Cloud (m

In [50]:
dfs[selector] = df

##### 7.9 Greece

In [51]:
selector = 'Greece'

specializations_GR = ["Μηχανικός", "Σύμβουλος", "Αρχιτέκτονας", "ΕΙΔΙΚΟΣ", "Διευθυντής", "Προγραμματιστής", "Αρχιτεκτονική", "Διαχειριστής"]

data_terms_GR = ["Δεδομένα", "ETL", "Νέφος", "Αναλυτική", "Ανάλυση", "BI", "Επιχειρηματική Νοημοσύνη", "Επιχειρηματική Αναλυτική", "Βάση δεδομένων", "Αγωγός", "Μεταδεδομένα", "Παρακολούθηση", "Κέντρο δεδομένων"]

In [52]:
show_unique_and_its_len(dfs[selector]['Job_title'])

98 :
['Data Engineer' 'Jr Data Engineer' 'Data Engineer - Cyprus based'
 'Junior Telecom Data Analysis Engineer' 'Junior Network Engineer'
 'Associate Data Engineer' 'Junior Analytics Software Engineer (remote)'
 'Data Engineer Greece' 'Data Engineer / BI Engineer @Thessaloniki'
 'Senior Data Engineer' 'Data Platform Engineer DWH'
 'Data Engineer / BI Consultant in Thessaloniki'
 'Technology Consulting - Big Data Engineer' 'Senior Data Engineer, Java'
 'Data Engineer / BI Consultant in Athens' 'ML Ops and Data Engineer'
 'Technical Data Engineer (Customer Excellence)' 'Big Data Engineer'
 'Junior Data Engineer / Reporting Specialist' 'Senior Big Data Engineer'
 'Senior Backend Engineer - Go'
 'Lead Machine Learning Engineer (Remote Option)' 'NOC Engineer'
 'Working Student/SW Engineer' 'Software Engineer - Data Platform'
 'Data Analyst' 'Data Software Engineer' 'Sr. Big Data Engineer'
 'AWS Cloud Data Engineer' 'Reporting Engineer Team Lead (Big Data)'
 'Software Engineer - Data (Athen

In [53]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DE, specializations_DK,specializations_NL],
        [data_terms_DE, data_terms_DK, data_terms_NL]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

49 :
['Data Engineer' 'Jr Data Engineer' 'Data Engineer - Cyprus based'
 'Junior Telecom Data Analysis Engineer' 'Associate Data Engineer'
 'Junior Analytics Software Engineer (remote)' 'Data Engineer Greece'
 'Data Engineer / BI Engineer @Thessaloniki' 'Senior Data Engineer'
 'Data Platform Engineer DWH'
 'Data Engineer / BI Consultant in Thessaloniki'
 'Technology Consulting - Big Data Engineer' 'Senior Data Engineer, Java'
 'Data Engineer / BI Consultant in Athens' 'ML Ops and Data Engineer'
 'Technical Data Engineer (Customer Excellence)' 'Big Data Engineer'
 'Junior Data Engineer / Reporting Specialist' 'Senior Big Data Engineer'
 'Software Engineer - Data Platform' 'Data Software Engineer'
 'Sr. Big Data Engineer' 'AWS Cloud Data Engineer'
 'Reporting Engineer Team Lead (Big Data)'
 'Software Engineer - Data (Athens Office or Remote in Greece)'
 'Data Engineer (Hybrid Workplace)' 'Software Engineer - Data Services'
 'Data Engineer (Junior/Mid-level) – Athens' 'AWS Data Engineer'


In [54]:
dfs[selector] = df

##### 7.10 Hungary

In [55]:
selector = "Hungary"

specializations_ROM = ["Inginer", "Consultant", "Arhitect", "Specialist", "Manager", "Dezvoltator", "Arhitectura", "Administrator"]

data_terms_ROM = ["Date", "ETL", "Noroi", "Analitic", "Analize", "BI", "Business Intelligence", "Analiză de afaceri", "Bază de date", "Conductă", "Metadate", "Monitorizare", "Centru de date"]

In [56]:
show_unique_and_its_len(dfs[selector]['Job_title'])

109 :
['Soldering Technology (f/m/div)*'
 'Factory Integration Engineer - Equipment Automation (f/m/div)*'
 'Electrical Engineer / Physicist Electrical Development Power Semiconductors (f/m/div)'
 'Electric Development Engineer (L&M ED) (f/m/div)*' 'Engineer'
 'Site Reliability Engineer' 'Data Engineer' 'Data Analytics Engineer'
 'AI Data Engineer Intern'
 'SAP BW Engineer(s) with data engineering flair'
 'Data Engineer (software/application)'
 'Software Engineer (Backend) - Credit' 'Data DevOps Engineer'
 'Cloud Data Engineer' 'Senior FrontEnd Engineer'
 'Junior Data Engineer (REF1600P)' 'Data Engineer - OPJ állás Budapest'
 'Machine Learning Engineer' 'Staff Data Engineer'
 'BETSSON ACADEMY - FRONTEND ENGINEER (SPORTSBOOK)'
 'Data Engineer for IoT (REF1449L)' 'Spark Data Engineer'
 'Software engineer' 'Business Automation - Technology Engineer'
 'Trainee Electrical Development Engineer Power Semiconductor Modules - International Graduate Program (f/m/div)*'
 'Senior Engineer (Manufac

In [57]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_HU, specializations_SK,specializations_DE, specializations_ROM],
        [data_terms_DE, data_terms_SK, specializations_DE, data_terms_ROM]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

35 :
['Site Reliability Engineer' 'Data Engineer' 'Data Analytics Engineer'
 'AI Data Engineer Intern'
 'SAP BW Engineer(s) with data engineering flair'
 'Data Engineer (software/application)' 'Data DevOps Engineer'
 'Cloud Data Engineer' 'Junior Data Engineer (REF1600P)'
 'Data Engineer - OPJ állás Budapest' 'Staff Data Engineer'
 'Data Engineer for IoT (REF1449L)' 'Spark Data Engineer'
 'Big Data Engineer' 'Sustainability Quality Management System engineer'
 'Data Platform Engineer' 'Cisco Data 2nd line engineer'
 'Software Engineer (mid or senior level) for Data Platform'
 'Sr Data Engineer' 'Senior Data Engineer' 'Power BI Platform Engineer'
 'Data Security Engineer' 'data scientist & ai engineer'
 'Sales Application Engineer (Energy Efficiency & Sustainability)'
 'Data Integration Engineer' 'Cloud Devops Engineer (m/f/x)'
 'Senior Data Pipeline Engineer (Data Platform)'
 '(Remote, Hungary) Senior Analytics Engineer' 'Senior Big Data Engineer'
 'Reliability engineer - SiC Semicondu

In [58]:
dfs[selector] = df

##### 7.11 Ireland

In [59]:
selector = 'Ireland'

In [60]:
show_unique_and_its_len(dfs[selector]['Job_title'])

323 :
['Process Excellence Engineer'
 'Process and applications engineer (f/m/d)-Automation Background'
 'Developer Support Engineer - (Remote)'
 'Senior Quality Engineer - Galway' 'Platform Security Engineer'
 'Designer FTTH - NBI' 'Software Engineer in Test SDET'
 'Mechanical Building Services Design Engineer'
 'Senior Automation Engineer' 'Senior Quality Assurance Engineer'
 'Cloud Engineer - Zero Trust' 'Cloud Developer'
 'Graduate Geologist, Engineering Geologist, Geotechnical Engineer'
 'VFX Systems Engineer - On site' 'Senior Architectural Technologist'
 'BER Assessor / Energy Consultant'
 'GreenLake Cloud Platform Zero Trust Security - Master Level Engineer'
 'Senior Cloud Developer' 'ARCGIS Specialist - Kilkenny'
 'Principal Engineer'
 'Senior Geotechnical Engineer / Engineering Geologist' 'Splunk Engineer'
 'Senior Electrical Engineer - Denmark' 'Product Engineer'
 'Full Time BMS Engineer' 'Quality Engineer'
 'HVAC Test & Balancing Commissioning Engineer' 'In-Die Engineer'
 '

In [61]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DE, specializations_DK,specializations_NL],
        [data_terms_DE, data_terms_DK, specializations_NL]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

82 :
['Cloud Engineer - Zero Trust' 'Cloud Developer'
 'Senior Architectural Technologist'
 'GreenLake Cloud Platform Zero Trust Security - Master Level Engineer'
 'Senior Cloud Developer' 'ARCGIS Specialist - Kilkenny'
 'Solution Architect' 'Google Cloud Platform - GCP Technical Architect'
 'Engineering Manager' 'Solutions Engineering Manager'
 'Mechanical BIM Engineer' 'Data Cabling Engineer Data Cabling Engineer'
 'R&D Manager'
 'R&D Manager - Product Engineering / Sustaining Engineering'
 'Software Engineering Team Lead' 'Big Data Engineer' 'Data Engineer'
 '2nd Line Data Engineer'
 'Data Center Engineering Operation Technician (Shift)'
 'Infrastructure Database Engineer - SQL Server Specialism'
 'Quantitative Data Engineer' 'Data Analyst Engineer'
 'Senior Data Engineer (remote)' 'Data Engineer (Azure)'
 'Data Core technical authority engineer'
 'Software Engineer, Data Pipelines - Opportunity for Working Remotely Cork,'
 'Technical Design Lead - Data Centers' 'Data Engineering Ma

In [62]:
dfs[selector] = df

##### 7.12 Israel

In [63]:
selector = "Israel"

specializations = ["מהנדס", "הנדסה", "יועץ", "אדריכל", "מומחה", "מנהל", "מפתח", "ארכיטקטורה", "מנהל מערכות"]

data_terms = ["נתונים", "ETL", "ענן", "ניתוח", "ניתוח נתונים", "BI", "בינה מעסיקתית", "אנליטיקה עסקית", "מסד נתונים", "צינורות נתונים", "מטא נתונים", "מעקב", "מרכז נתונים"]

In [64]:
show_unique_and_its_len(dfs[selector]['Job_title'])

83 :
['Data Engineer' 'Fullstack Engineer' 'Big Data Engineer'
 'ML Engineer - Iguazio (Acquired by McKinsey)' 'Full-Stack Engineer'
 'BI Data Engineer' 'Junior Windows Low Level Engineer - Engine team'
 'Machine Learning Engineer Student' 'Data Tools Engineer'
 'Software Dev Engineer Intern' 'Senior Data Engineer'
 'Data Solutions Engineer' 'Senior/Junior Data Engineer'
 'Data Engineer - Data Assets Group' 'Senior Frontend Engineer'
 'Backend Engineer' 'DATA ENGINEER' '\uf4ccData Engineer\uf4cc(Hybrid'
 'Machine Learning Engineer' 'Embedded Software Engineer- AWS Nitro'
 'Cloud Data Engineer' 'Data Infra Engineer' 'Data Platform Engineer'
 'Finout-Data Engineer' 'Data Engineer Lead' 'Experienced Data Engineer'
 'Storage Analytics - Data Engineer' 'Data Engineer, Mobile Identity'
 'Junior DevOps Engineer' 'Senior Software Engineer - Iguazio'
 'Deep Learning Engineer' 'Back End Engineer'
 'מעל שנתיים ניסיון | Data engineer'
 'Data Engineer לארגון פיננסי מוביל בת"א!' 'Data Engineer לחברה

In [65]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DE, specializations_DK,specializations_NL],
        [data_terms_DE, data_terms_DK, specializations_NL]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

50 :
['Data Engineer' 'Big Data Engineer' 'BI Data Engineer'
 'Data Tools Engineer' 'Senior Data Engineer' 'Data Solutions Engineer'
 'Senior/Junior Data Engineer' 'Data Engineer - Data Assets Group'
 'DATA ENGINEER' '\uf4ccData Engineer\uf4cc(Hybrid' 'Cloud Data Engineer'
 'Data Infra Engineer' 'Data Platform Engineer' 'Finout-Data Engineer'
 'Data Engineer Lead' 'Experienced Data Engineer'
 'Storage Analytics - Data Engineer' 'Data Engineer, Mobile Identity'
 'מעל שנתיים ניסיון | Data engineer'
 'Data Engineer לארגון פיננסי מוביל בת"א!' 'Data Engineer לחברה פיננסית'
 'Data Engineer לארגון מסווג' 'Software Engineer - Data Science Team'
 'Data Engineer בחברת Online Gaming מצליחה!'
 'Senior Software Engineer - Data Path'
 'CPU Silicon Engineer, Design Verification, Google Cloud'
 'Senior Data Infrastructure Engineer (Architecture Group)'
 'Data Engineer בחברת הייטק ותיקה ורווחית'
 'Data Engineer לחברת הייטק גלובלית בתחום הE-Commerce'
 'QA Automation Engineer - Data Group' 'Senior Backen

In [66]:
dfs[selector] = df

##### 7.13 Italy

In [67]:
selector = 'Italy'

specializations_SL = ["Inženir", "Inženiring", "Svetovalec", "arhitekt", "SPECIALIST", "Vodja", "Razvijalec", "Arhitektura", "Administrator"]

data_terms_SL = ["Podatki", "ETL", "Oblak", "Analitični", "Analitika", "BI", "Poslovna Inteligenca", "Poslovna Analitika", "Podatkovna Baza", "Cevovod", "Metapodatki", "Spremljanje", "Podatkovni Center"]

In [68]:
show_unique_and_its_len(dfs[selector]['Job_title'])

131 :
['IT Data Engineer' 'ingegnere elettrico o perito di esperienza'
 'Process Engineer (PED-05) Start-up Technician - Tecnico Avviamento Impianti'
 'Direttore di cantiere' 'INGEGNERE ENERGETICO - PROGETTISTA TERMOTECNICO'
 'supervisore di cantiere (elettromeccanico)'
 'SENIOR ENGINEER - SVILUPPATORE WEB / E-COMMERCE'
 'Embedded Software Engineer' 'SYSTEM ENGINEER' 'Data Engineer'
 'Control System Engineer' 'Senior System Engineer Evangelist'
 'Ingegnere progettista' 'Backend Engineer Ecommerce - Python (Odoo)'
 'INGEGNERE CIVILE / EDILE' 'Software Developer Senior'
 'Data Engineer / ETL Developer' 'Ingegnere Elettronico Programmatore PLC'
 'Ingegnere elettrico / meccanico / edile'
 'Senior Application Engineer M/F' 'Ingegnere meccanico/strutturista'
 'INGEGNERE PER PIANIFICAZIONE TRASPORTO PERSONE' 'Progettista elettrico'
 'Data Engineer (hybrid-remote)' 'Progettista fotovoltaico'
 'DATA ARCHITECT/FULL STACK DEVELOPER' 'Project Engineer'
 "CONSULENTE AZIENDALE con esperienza in SANI

In [69]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_IT, specializations_DE,specializations_FR, specializations_CA, specializations_GR, specializations_SL],
        [data_terms_IT, data_terms_DE, specializations_FR, data_terms_CA,data_terms_GR, data_terms_SL]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

15 :
['IT Data Engineer' 'Data Engineer' 'Data Engineer / ETL Developer'
 'Data Engineer (hybrid-remote)' 'DATA ARCHITECT/FULL STACK DEVELOPER'
 'ingegnere o architetto per settore rinnovabili ed efficienza energetica'
 'Ingegnere/Architetto - energie rinnovabili'
 'Junior Data & Analytics Engineer - parzialmente remoto' 'DATA ENGINEER'
 'Junior Data Engineer' 'Consultant ambito PMO'
 'architetto bim specialist' 'Manager/Direttore Tecnico di Cantiere'
 'Stage Addetto/A Data Engineer' 'DataOps Engineer']


In [70]:
dfs[selector] = df

##### 7.14 Luxembourg

In [71]:
selector = 'Luxembourg'

In [72]:
show_unique_and_its_len(dfs[selector]['Job_title'])

118 :
['DATA ENGINEER' '2023 Data Engineer Internship' 'Data Engineer'
 'IT Support Engineer' 'Software Engineer Data' 'MS Engineer VMware'
 'Process Engineer (m/f)' 'DATA ENGINEER H/F'
 'Senior Data Engineer (f/m/d)'
 'Data Scientist ou Data Analytics Engineer (m/f) (réf. E00024412) (réf. F00024413)'
 'Data Engineer (H/F)' 'Senior Software-Defined Data Center Engineer'
 'Data Architect/Senior Data Engineer (m/f/n)'
 '2023 Software Development Engineer Internship'
 'Unified Communications Engineer' 'Materials Engineer'
 'Unified Communications Engineer (Microsoft)'
 'Business Intelligence Engineer, EU Production Planning Central Team'
 'Mechanical Engineer with an interest in Design' 'Data Engineer (m/f/gn)'
 'DATA ENGINEER (F/M)' 'Internship - Development Process Engineer (m/f/d)'
 'Analytics Engineer (m/f/d)' 'Senior Data Engineer (F/M)'
 'Quality Engineer' '2023 Business Intelligence Engineer Internship'
 'Internship as control systems engineer'
 'AV Support Engineer, IT Services' '

In [73]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DE, specializations_FR],
        [data_terms_DE, data_terms_FR]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

28 :
['DATA ENGINEER' '2023 Data Engineer Internship' 'Data Engineer'
 'Software Engineer Data' 'DATA ENGINEER H/F'
 'Senior Data Engineer (f/m/d)'
 'Data Scientist ou Data Analytics Engineer (m/f) (réf. E00024412) (réf. F00024413)'
 'Data Engineer (H/F)' 'Senior Software-Defined Data Center Engineer'
 'Data Architect/Senior Data Engineer (m/f/n)'
 'Business Intelligence Engineer, EU Production Planning Central Team'
 'Data Engineer (m/f/gn)' 'DATA ENGINEER (F/M)'
 'Analytics Engineer (m/f/d)' 'Senior Data Engineer (F/M)'
 '2023 Business Intelligence Engineer Internship'
 'Data Development Engineer' 'Data Centre Engineer' 'AZURE DATA ENGINEER'
 'Data and Analytics Engineer – New role – build a data factory'
 'Head of Data Engineering / Lead Database Engineer'
 'SITE RELIABILITY ENGINEER' 'Business Intelligence Engineer M/F'
 'ETL/Data Engineer (m/w/d)' 'Senior Software Engineer – Data Platform'
 'GCP Data Solution Architect' 'Cloud Solutions Architect (f/m/d)'
 'Software Engineers Data

In [74]:
dfs[selector] = df

##### 7.12 Netherlands

In [75]:
selector = 'Netherlands'

specializations_FRI = ["Ynżenier", "Ynženiering", "Konsultant", "arkitekt", "SPESJALIST", "Manager", "Ûntwikkelers", "Arktitektuer", "Administrator"]

data_terms_FRI = ["Data", "ETL", "Cloud", "Analytysk", "Analitika", "BI", "Bisykens Intelligence", "Bisykens Analytics", "Database", "Pipeline", "Metadata", "Monitoring", "Datacenter"]


In [76]:
show_unique_and_its_len(dfs[selector]['Job_title'])

158 :
['Engineer Industriële Automatisering' 'Senior AWS Cloud Engineer'
 'Backend Software Engineer C# .NET' 'ILS Engineer' 'Growth Engineer'
 'Data Engineer' 'Engineer elektrotechniek' 'C/C++ Software Engineer'
 'Vacature Engineer Industriële Automatisering'
 'Software Engineer Industriële Automatisering'
 'Application Engineer - portal beheerder' 'Mechanical Engineer'
 'Software Engineer' 'Data Engineering stagiair(e)'
 'Data Engineer Nivel Zorgregistraties Eerste Lijn'
 'Medior/Senior Embedded Systems Engineer' 'INFORMATIE / DATA ANALIST'
 'Werkvoorbereider machinebouw' 'Product Engineer'
 'Engineer infrastructuur Binnenkabels' 'DevOps Engineer'
 'Analist cost engineer' 'Analytics Engineer'
 'Global Performance Paid Social Specialist' 'Desktop Support Technician'
 'Supply Chain Engineer' 'QHSE Manager'
 'Operationeel/Technisch netwerk monitoring specialist'
 'Parttime PowerBI Developer (16-32 uur)' 'IT Support Engineer'
 'Application Engineer' 'Control Systems Engineer'
 'IT traine

In [77]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_NL, specializations_FRI],
        [data_terms_NL, data_terms_FRI]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

46 :
['Senior AWS Cloud Engineer' 'Data Engineer'
 'Data Engineering stagiair(e)'
 'Data Engineer Nivel Zorgregistraties Eerste Lijn'
 'Engineer infrastructuur Binnenkabels' 'Analytics Engineer'
 'Operationeel/Technisch netwerk monitoring specialist'
 'Parttime PowerBI Developer (16-32 uur)' 'Senior Database Specialist'
 'lead engineer cloud & big data' 'Data Engineer traineeship'
 'Cloud Engineer' 'Data Modeling Engineer Freelance'
 'Azure Cloud Engineer bij Movir' 'Data engineer'
 'OPS Engineer Global Data Centre' 'Junior Big Data Engineer'
 'Consultant Data Engineering' 'Senior Data Engineer Klantinteractie'
 'Junior data engineer' 'Data Engineer Business Intelligence'
 'Junior Data Engineer - YEPP' 'Data Engineer (junior)'
 'Data Engineer (Amsterdam)' '(Junior) Data Engineer'
 'Senior Data Engineer - PySpark, AWS' 'Engineer Data Publishing'
 'Data Engineer(Netherlands, Valid work Permit)'
 'Data Engineer - DataSense' 'Data Engineer DataStage'
 'Junior Data Engineer' 'Cloud Data Eng

In [78]:
dfs[selector] = df

##### 7.13 Norway

In [79]:
selector = 'Norway'

specializations_NO = ["Ingeniør", "Konsulent", "arkitekt", "SPECIALIST", "Manager", "Utvikler", "Arkitektur", "Administrator"]
data_terms_NO = ["Data", "ETL", "Sky", "Analytisk", "Analytics", "BI", "Forretningsinnsikt", "Forretningsanalyse", "Database", "Pipeline", "Metadata", "Overvåking", "Databehandlingssenter"]

In [80]:
show_unique_and_its_len(dfs[selector]['Job_title'])

278 :
[nan 'Contracts Officer (Based in Germany)'
 'Cloud Services Engineer (Based in Germany)' 'Data Engineer'
 'Data Engineer - Google Cloud' 'Data Engineer (m/f/d)'
 'Sommerjobb for data scientists / data engineers — Oslo'
 'EICT package engineer' 'Løsningsarkitekt Analytics / Data Engineer'
 'Data Engineer – Java Developer' 'Customer Support Engineer (All Levels)'
 'Mechanical Package Engineer'
 'Data Engineer | NorgesGruppen | Skøyen, Oslo' 'DATA ENGINEER'
 'Senior Full Stack Engineer GIS (remote)' 'Drilling Engineer'
 'Software Engineer (Oslo) - Experian Marketing Services'
 'We are looking for a Data Engineer' 'Field Service Engineer (m/f/d)'
 'Data Engineer med erfaring fra Azure?' 'Project Engineer'
 'Azure Data Engineer with DataBricks - Oslo'
 'Forward-deployed Data Engineer' 'Senior Completion Engineer'
 'Software Engineer - Frontend' 'Data Engineer - Oslo'
 'Elsker du data og analyser? Vi også.' 'Operational Technology Engineer'
 'Principal Software Engineer'
 'Data Engine

In [81]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_NL, specializations_FRI],
        [data_terms_NL, data_terms_FRI]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

59 :
['Cloud Services Engineer (Based in Germany)' 'Data Engineer'
 'Data Engineer - Google Cloud' 'Data Engineer (m/f/d)'
 'Sommerjobb for data scientists / data engineers — Oslo'
 'Løsningsarkitekt Analytics / Data Engineer'
 'Data Engineer – Java Developer'
 'Data Engineer | NorgesGruppen | Skøyen, Oslo' 'DATA ENGINEER'
 'We are looking for a Data Engineer'
 'Data Engineer med erfaring fra Azure?'
 'Azure Data Engineer with DataBricks - Oslo'
 'Forward-deployed Data Engineer' 'Data Engineer - Oslo'
 'Data Engineers, Scientists, Analysts - Oslo' 'Data/ML Engineer'
 'Senior Big Data Engineer'
 'Riverty is looking for a new Data Engineering Lead'
 'Data Engineer (Azure) - Oslo'
 'Integration Engineer - Cloud Solutions to Ericsson in Fornebu Oslo.'
 'Data Engineer - ETL - Finance Industry' '(Senior) Data Engineer - Azure'
 'Senior Cloud Security Engineer | Völur | Oslo'
 'System Engineer Storage and Data Protection'
 'Head of Data & Analytics - Oslo' 'Software Engineer - Data Platform'


In [82]:
dfs[selector] = df

##### 7.14 Poland

In [83]:
selector = 'Poland'

specializations_PL = ["Inżynier", "Konsultant", "Architekt", "Specjalista", "Manager", "Programista", "Administrator"]

data_terms_PL = ["Dane", "ETL", "Chmura", "Analityczny", "Analityka", "BI", "Business Intelligence", "Analityka Biznesowa", "Bazy Danych", "Pipeline", "Metadane", "Monitorowanie", "Centrum Danych"]

In [84]:
show_unique_and_its_len(dfs[selector]['Job_title'])

102 :
['Technical Reporting & Compliance Lead'
 'Controls Engineer Senior Associate' 'Electro Mechanical Design Engineer'
 'Cloud Services Engineer (Based in Germany)'
 'Contracts Officer (Based in Germany)'
 'Senior Technical Support Engineer (PL/SQL and Unix)'
 'Software QA Test Engineer'
 'Software Engineer Intern - Frontend/Fullstack'
 'UI Engineer (L4/L5) - Content Engineering'
 'Crypto Data Engineer Intern (Remote)'
 'Junior Data Engineer - Data & Analytics' 'Software Engineer Intern'
 'Intern Data Engineer' 'Scientific Research Intern'
 'Automation Project Engineer'
 'Remote Customer Service Representative - English Speaking'
 'Junior data engineer (Talent Program)' 'AI Engineer/Data Engineer'
 'Data/Machine Learning Engineer' 'Junior Cloud Data Platform Engineer'
 'Data Engineer - QuantumBlack'
 'Data Scientist - Machine Learning Engineer'
 'Junior/Mid Data Engineer (Biotech)' 'Data Engineer (DWH) (m/f/d)'
 'Data Engineer in Technology Consulting' 'Summer Internship Program'
 '

In [85]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_NL, specializations_FRI],
        [data_terms_NL, data_terms_FRI]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

54 :
['Cloud Services Engineer (Based in Germany)'
 'Crypto Data Engineer Intern (Remote)'
 'Junior Data Engineer - Data & Analytics' 'Intern Data Engineer'
 'Junior data engineer (Talent Program)' 'AI Engineer/Data Engineer'
 'Data/Machine Learning Engineer' 'Junior Cloud Data Platform Engineer'
 'Data Engineer - QuantumBlack'
 'Data Scientist - Machine Learning Engineer'
 'Junior/Mid Data Engineer (Biotech)' 'Data Engineer (DWH) (m/f/d)'
 'Data Engineer in Technology Consulting' 'BI Data Engineer'
 'Data Engineer'
 'Intern - Quantitative Analyst or Data Engineer at RiskHub Warsaw (Risk Hub Summer Internship Programme)'
 'Staff Data Visualization Engineer' 'Data Engineer, Analytics'
 'Reservoir Engineer - Junior Data Analyst' 'Data Engineer Intern'
 'Data Scientist / Machine Learning Engineer' 'DATA ENGINEER'
 'Junior Data Engineer - Data & Analytics Engineering Development Program'
 'Data Analyst/Engineer Internship'
 'Machine Learning Software Engineer / Data Scientist'
 'Junior DWH

In [86]:
dfs[selector] = df

##### 7.15 Portugal

In [87]:
selector = 'Portugal'

specializations_PTG= ["Engenheiro", "Consultor", "Arquiteto", "Especialista", "Gerente", "Desenvolvedor", "Arquitetura", "Administrador"]

data_terms_PTG = ["Dados", "ETL", "Nuvem", "Analítico", "Análise", "BI", "Inteligência de Negócios", "Análise de Negócios", "Banco de Dados", "Pipeline", "Metadados", "Monitoramento", "Centro de Dados"]

In [88]:
show_unique_and_its_len(dfs[selector]['Job_title'])

127 :
['Full Stack Developer' 'SQL Engineer' 'Ingestion Data Engineer (f/m/d)'
 'Software Integration Engineers' 'Ingestion Senior Data Engineer (f/m/d)'
 'Data Engineer' 'Data Engineer Trainee (m/f/d)' 'Senior Data Engineer'
 'Data Pipeline Engineer' 'Data Engineer - Junior' nan 'DATA ENGINEER'
 'Release & Code Analyst' 'Data Engineer - Banking Sector'
 'Associate Data Engineer (M/F/D)' 'Crypto Data Engineer (Remote)'
 'Lead Data Engineer' 'SQL Data Engineer' 'Project Engineer [Remote]'
 'Data Scientist (100% remote working)'
 'Google Cloud Platform Data Engineer'
 'Data Engineer – Innovation Program' 'Senior Data Engineer (Lisbon)'
 'Data Engineer - (Viator)' 'Data Engineer (all genders)'
 'On-Prem Data Engineer' 'Data Engineer | AWS' 'Data Engineer- Lisboa'
 'Senior Software Engineer (SAP PLM)'
 'Data Scientist e Machine Learning Engineer' 'Azure Data Engineer'
 'Data Engineer: Synapse SQL (m/f)' 'Data Engineer II'
 'Staff Data Engineer (m/f/d)' 'Data Engineer (M/F) – Lisboa'
 'Data

In [89]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_NL, specializations_FRI],
        [data_terms_NL, data_terms_FRI]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

75 :
['Ingestion Data Engineer (f/m/d)'
 'Ingestion Senior Data Engineer (f/m/d)' 'Data Engineer'
 'Data Engineer Trainee (m/f/d)' 'Senior Data Engineer'
 'Data Pipeline Engineer' 'Data Engineer - Junior' 'DATA ENGINEER'
 'Data Engineer - Banking Sector' 'Associate Data Engineer (M/F/D)'
 'Crypto Data Engineer (Remote)' 'Lead Data Engineer' 'SQL Data Engineer'
 'Google Cloud Platform Data Engineer'
 'Data Engineer – Innovation Program' 'Senior Data Engineer (Lisbon)'
 'Data Engineer - (Viator)' 'Data Engineer (all genders)'
 'On-Prem Data Engineer' 'Data Engineer | AWS' 'Data Engineer- Lisboa'
 'Data Scientist e Machine Learning Engineer' 'Azure Data Engineer'
 'Data Engineer: Synapse SQL (m/f)' 'Data Engineer II'
 'Staff Data Engineer (m/f/d)' 'Data Engineer (M/F) – Lisboa'
 'Data Engineer | Cloud & Big Data' 'Data Software Engineer - Senior'
 'Big Data Engineer' 'Data Engineer / BI Developer'
 'Data Engineer (Spark/Databricks)' 'Azure Senior Data Engineer'
 'Senior Data Engineer (all

In [90]:
dfs[selector] = df

##### 7.16 Romania

In [91]:
selector = 'Romania'

In [92]:
show_unique_and_its_len(dfs[selector]['Job_title'])

107 :
['Data Engineer' 'Supplier Quality Engineer Inspector'
 'Reliability Engineer' 'Automation engineer' 'Sales Support Engineer'
 'Software QA Test Engineer' 'Lead Quality Assurance Engineer'
 'Sr. Backend Software Engineer - Python, Go, OpenSearch a.k.a Elasticsearch, MySQL, AWS (Romania)'
 'Cloud DevOps Lead Engineer' 'Backend Software Engineer (Remote, ROU)'
 'IT Lead Test Engineer - Edge Computing'
 'Cloud Services Engineer (Based in Germany)'
 'Software Development Engineer in Test (Java) - Sandbox | Romania (Remote)'
 'Backend Software Engineer (Go, AWS, Cassandra) - Cloud Security (Remote or Hybrid, Romania)'
 'Data Engineer, Analytics' 'Functional Safety Engineer (m/f/d)'
 'Software Test Engineer' 'OutSystems Software Engineer'
 'Frontend Engineer - Partner Data' 'Back-End Software Engineer'
 'Middle Big Data Engineer'
 'Internship Data & Access Engineer - PokerStars'
 'Data Engineer (C#, Python, Elastic)' 'PHP Data Engineer' 'Junior Tester'
 'Software Development Engineer, 

In [93]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_ROM, specializations_HU],
        [data_terms_ROM, data_terms_HU]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

51 :
['Data Engineer' 'Reliability Engineer' 'Cloud DevOps Lead Engineer'
 'Cloud Services Engineer (Based in Germany)'
 'Backend Software Engineer (Go, AWS, Cassandra) - Cloud Security (Remote or Hybrid, Romania)'
 'Data Engineer, Analytics' 'Frontend Engineer - Partner Data'
 'Middle Big Data Engineer'
 'Internship Data & Access Engineer - PokerStars'
 'Data Engineer (C#, Python, Elastic)' 'PHP Data Engineer'
 'Software Development Engineer, SDO Privacy, Data Access, SDO Privacy - AEDU team'
 'Data Solution Engineer' 'Data Engineer (Google Cloud Platform)'
 'Data Engineer with Python' 'Data Engineer on AWS Cloud'
 'ETL Engineer|Unite Data Management @ ING Hubs Romania'
 'Data Platform Engineer |Data Analytics Platform @ING Hubs Romania'
 'Senior Data Engineer' 'Site Reliability Engineer'
 'Tableau/Elastic Search/Hadoop ( Data lake) Engineer for...'
 'Data Center Infrastructure Engineer Level II' 'Sr Big Data Engineer'
 'Data Engineer-Big Data' 'Data Engineer (Python)' 'Python Data En

In [94]:
dfs[selector] = df

##### 7.17 Spain

In [95]:
selector = 'Spain'

specializations_gl = ["Inxeniero", "Enxeñería", "Consultor", "arquitecto", "ESPECIALISTA", "Xestor", "Desenvolvedor", "Arquitectura", "Administrador"]

data_terms_gl = ["Datos", "ETL", "Nube", "Analítica", "Analítica de datos", "BI", "Intelixencia de negocios", "Analítica de negocios", "Base de datos", "Pipeline", "Metadatos", "Monitorización", "Centro de datos"]

specializations_ES = ["Ingeniero", "Consultor", "Arquitecto", "Especialista", "Gerente", "Desarrollador", "Arquitectura", "Administrador"]

data_terms_ES = ["Datos", "ETL", "Nube", "Analítico", "Análisis", "BI", "Inteligencia de Negocios", "Análisis de Negocios", "Base de datos", "Pipeline", "Metadatos", "Monitoreo", "Centro de datos"]


In [96]:
show_unique_and_its_len(dfs[selector]['Job_title'])

146 :
['Project Manager / Engineer Smart Cities & Smart Grids (m/f/x)'
 'Research Engineer (m/f/x/d)' 'Technical Service Engineer'
 'Sales & Order Operations Specialist' 'Data DevOps'
 'Data Engineer, Asturias' 'Customer Support Engineer - Madrid'
 'Field Service Engineer, Barcelona' 'Data Engineer Intern'
 'M3 Functional Analyst (Remote Role)' 'Data Engineer AWS, 100% En remoto'
 'Data Engineer' 'Python Data Engineer (Remote)'
 'Data Engineer (Shadow Team)' '(Mid) Data Engineer' nan
 'Data Engineer entry level (20 - 30K)' 'Data Engineer (H/M)'
 'Data Engineer Junior' 'Data Engineer / 100% Remoto, 100% En remoto'
 'GDC Process Mining Data Engineer (f/m/d)'
 'Junior Data Engineer (OF0523)' 'Junior Data Engineer'
 'Data Engineer - PagoNxt' 'Senior Cloud Engineer, Alicante'
 'Data Engineer and Analytics'
 'Consultor/a MuleSoft Remoto, 100% En remoto'
 'Senior TELCO BSS Architect' 'Data Engineer (Remote)'
 'Engineering Services Manager' 'Data Engineer exp +2 (30-40K)'
 'Operations Data Ana

In [97]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_ROM, specializations_HU],
        [data_terms_ROM, data_terms_HU]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

88 :
['Data Engineer, Asturias' 'Data Engineer Intern'
 'Data Engineer AWS, 100% En remoto' 'Data Engineer'
 'Python Data Engineer (Remote)' 'Data Engineer (Shadow Team)'
 '(Mid) Data Engineer' 'Data Engineer entry level (20 - 30K)'
 'Data Engineer (H/M)' 'Data Engineer Junior'
 'Data Engineer / 100% Remoto, 100% En remoto'
 'GDC Process Mining Data Engineer (f/m/d)'
 'Junior Data Engineer (OF0523)' 'Junior Data Engineer'
 'Data Engineer - PagoNxt' 'Senior Cloud Engineer, Alicante'
 'Data Engineer and Analytics' 'Data Engineer (Remote)'
 'Data Engineer exp +2 (30-40K)' 'Operations Data Analytics Engineer'
 'Data Engineer - Accelerator' 'JUNIOR DATA ENGINEER'
 'Head of Product Data - 100% Remote' 'Senior Data Engineer'
 'AIRWORTHINESS TECHNICAL ENGINEER FOR OPERATIONAL SUITABILITY DATA (OSD)'
 'Data Engineer for HR (f/m/d)'
 'Data Engineer - 100% remoto, 100% En remoto' 'Data Engineer DataStage'
 'Senior Data Engineer (40k-50k)'
 'Lead Data Engineer (Google Cloud, Scala, CI/CD)'
 'Data 

In [98]:
dfs[selector] = df

##### 7.18 Sweden

In [99]:
selector = 'Sweden'

specializations_SV = ["Ingenjör", "Konsult", "Arkitekt", "Specialist", "Chef", "Utvecklare", "Arkitektur", "Administratör"]

data_terms_SV = ["Data", "ETL", "Moln", "Analys", "Analytik", "BI", "Affärsinriktad Analys", "Business Intelligence", "Databas", "Pipeline", "Metadata", "Övervakning", "Datacenter"]

In [100]:
show_unique_and_its_len(dfs[selector]['Job_title'])

95 :
['Data Governance Engineer (m/f/d) 80% Homeoffice' 'Network Data Engineer'
 'Process Manager (m/f/d) 80% Homeoffice'
 'Datadriven Sälj- & Verksamhetsplanerare'
 'Battery Module Safety Engineer'
 'Data Engineer | Sjunde AP-fonden | Stockholm'
 'System Engineer within Energy Storage System at Electromobility'
 'Data Engineer to Avinode!' 'BI & Data Analytics Manager' 'Data Engineer'
 'Rock mechanical engineer at Zinkgruvan Mining' 'ECG Technical Advisor'
 nan 'Data/ cloud engineer for a global telecom company!'
 'IT solution engineer / Cloud engineer till QD'
 'Data Engineer to Business Intelligence & Analytics...'
 'Analytics Data Engineer' 'Back-end/Data Software Engineer'
 'DevOps Engineer to Zebware' 'SIEM Engineer' 'Data Engineer - Stockholm'
 'Junior Data Engineer' 'Graduate Data Engineer'
 'Data Engineer (Google Looker) - Fully Remote (LATAM)'
 'Head of Azure till Bitlog' 'Sustainability Engineer' 'Data Science Lead'
 'Firmware & Electronics Engineer' 'Customer Experience Eng

In [101]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_SV, specializations_FI],
        [data_terms_SV, data_terms_FI]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

54 :
['Data Governance Engineer (m/f/d) 80% Homeoffice' 'Network Data Engineer'
 'Data Engineer | Sjunde AP-fonden | Stockholm'
 'System Engineer within Energy Storage System at Electromobility'
 'Data Engineer to Avinode!' 'BI & Data Analytics Manager' 'Data Engineer'
 'Data/ cloud engineer for a global telecom company!'
 'IT solution engineer / Cloud engineer till QD'
 'Data Engineer to Business Intelligence & Analytics...'
 'Analytics Data Engineer' 'Back-end/Data Software Engineer'
 'Data Engineer - Stockholm' 'Junior Data Engineer'
 'Graduate Data Engineer'
 'Data Engineer (Google Looker) - Fully Remote (LATAM)'
 'Head of Azure till Bitlog' 'Sustainability Engineer' 'Data Science Lead'
 'Data Engineer sökes till Gränges' 'DevOps Cloud Engineer for AWS'
 'CI/CD Engineer till Tutus Data AB!'
 'Devops Engineer with Cloud Experience' 'Azure AI/Data Engineer'
 'Data Engineer to Data Platform and Innovation team'
 'Data Engineer (Engagement team)' 'GCP Data Engineer'
 'Data Engineer to 

In [102]:
dfs[selector] = df

##### 7.19 Switzerland

In [103]:
selector = 'Switzerland'

In [104]:
show_unique_and_its_len(dfs[selector]['Job_title'])

134 :
['Big Data & Platform Engineer 80-100%'
 'Data Engineer (w/m) | 80% oder mehr' 'System Integration Engineer'
 'Leiter Data Analytics & Data Science (a)' 'DWH Engineer (a) 80 - 100%'
 'Junior Data Engineer / Consultant (100%)'
 'Senior Data Engineer / Consultant (100%)'
 'Software Systems Engineer - Client Services Technology'
 'System Engineer im Cloud Bereich'
 'Platform-Engineer Information Management (w/m/d)'
 'Software Systems Engineer – Business Support Services Technology'
 'Data Engineer Messtechnik (m/w/d)' 'IT Cyber Security Engineer'
 'Data Engineer / Analyst (m/f/d)' 'Azure Data Engineer - Spark'
 'Data Engineer - DWH Entwicklung (w/m/d) (8581)'
 'Data Management Engineer (m/f)' 'IT Test Engineer'
 'Research Software Engineer' 'Manufacturing Engineer - Medical Device'
 'M365 Engineer (w/m/d) (8666)' 'Data Management Engineer (w/m/d)'
 'ETL / DWH Engineer (m/w/d)'
 'Freelance Hardware/ Data Centre Field Engineer'
 'Internship Organoid Genome Editing'
 'Data- & Integrati

In [105]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DE, specializations_FR, specializations_IT],
        [data_terms_DE, data_terms_FR, data_terms_IT]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

64 :
['Big Data & Platform Engineer 80-100%'
 'Data Engineer (w/m) | 80% oder mehr'
 'Junior Data Engineer / Consultant (100%)'
 'Senior Data Engineer / Consultant (100%)'
 'System Engineer im Cloud Bereich' 'Data Engineer Messtechnik (m/w/d)'
 'Data Engineer / Analyst (m/f/d)' 'Azure Data Engineer - Spark'
 'Data Engineer - DWH Entwicklung (w/m/d) (8581)'
 'Data Management Engineer (m/f)' 'Data Management Engineer (w/m/d)'
 'ETL / DWH Engineer (m/w/d)'
 'Freelance Hardware/ Data Centre Field Engineer'
 'Data- & Integration Engineer (w/m/d)' 'Senior Power BI Engineer (m/w/d)'
 'Managed Service Engineer Data Infrastructure'
 'Big Data - (Senior) Data Scientist and Machine Learning Engineer (m/w)'
 'Big Data-Engineer (m/w) 80-100%' 'Azure Data Engineer'
 'Data Engineer / Analyst' 'Business Intelligence Data Engineer (d/f/w)'
 'DevOps Data Engineer 100%' 'Mobile Software Engineer Flutter'
 'Data Scientist / AI Engineer (w/m/d)' 'Big Data Engineer (m/w/d)'
 'Data Engineer / Data Scientist'

In [106]:
dfs[selector] = df

##### 7.20 Turkey

In [107]:
dfs[selector] = df

In [108]:
selector = 'Turkey'

specializations_TR= ["Mühendis", "Danışman", "Mimar", "Uzman", "Yönetici", "Geliştirici", "Mimarlık", "Yönetici"]
data_terms_TR = ["Veri", "ETL", "Bulut", "Analitik", "Analiz", "BI", "İş Zekası", "İş Analizi", "Veritabanı", "Boru Hattı", "Meta Veri", "İzleme", "Veri Merkezi"]

specializations_KU = ["Mûhandis", "Mûhendisî", "Pêşkêşker", "pargîdaniyar", "XWESER", "Manajer", "Pêşgir", "Arkîtektur", "Peywendkar"]
data_terms_KU = ["Zanist", "ETL", "Pirsgirêk", "Analytîk", "Analîz", "BI", "Zanistên Kar", "Analîzên Kar", "Bingehbazî", "Pîpelya", "Meta-Data", "Pêşwazî", "Navenda Zanistê"]

In [109]:
show_unique_and_its_len(filtered_df['Job_title'])

64 :
['Big Data & Platform Engineer 80-100%'
 'Data Engineer (w/m) | 80% oder mehr'
 'Junior Data Engineer / Consultant (100%)'
 'Senior Data Engineer / Consultant (100%)'
 'System Engineer im Cloud Bereich' 'Data Engineer Messtechnik (m/w/d)'
 'Data Engineer / Analyst (m/f/d)' 'Azure Data Engineer - Spark'
 'Data Engineer - DWH Entwicklung (w/m/d) (8581)'
 'Data Management Engineer (m/f)' 'Data Management Engineer (w/m/d)'
 'ETL / DWH Engineer (m/w/d)'
 'Freelance Hardware/ Data Centre Field Engineer'
 'Data- & Integration Engineer (w/m/d)' 'Senior Power BI Engineer (m/w/d)'
 'Managed Service Engineer Data Infrastructure'
 'Big Data - (Senior) Data Scientist and Machine Learning Engineer (m/w)'
 'Big Data-Engineer (m/w) 80-100%' 'Azure Data Engineer'
 'Data Engineer / Analyst' 'Business Intelligence Data Engineer (d/f/w)'
 'DevOps Data Engineer 100%' 'Mobile Software Engineer Flutter'
 'Data Scientist / AI Engineer (w/m/d)' 'Big Data Engineer (m/w/d)'
 'Data Engineer / Data Scientist'

In [110]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_DE, specializations_FR, specializations_IT],
        [data_terms_DE, data_terms_FR, data_terms_IT]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

15 :
['Data Engineer' 'Machine Learning Engineer With Data Science Experience'
 'Software & Big Data Engineer' 'Big Data & AI Engineer'
 'SENIOR DATA ENGINEER (PYTHON/SCALA, AWS)' 'GCP Data Engineer'
 'Data Analysis / Data Engineering / Data Science (m/f/d)'
 'Network Data Solutions Delivery Engineer' 'Senior Data Engineer'
 'Planning Engineer @Siemens Mobility Türkiye'
 'Mid / Senior Data Engineer'
 'Application and Design Engineer @Siemens Mobility Türkiye'
 'senior data engineer'
 'Technology Engineer - Site Reliability Engineer(SRE)'
 'Database Engineer - PostgreSQL']


In [111]:
dfs[selector] = df

##### 7.21 United_Kingdom

In [112]:
selector = 'United_Kingdom'

In [113]:
show_unique_and_its_len(dfs[selector]['Job_title'])

54 :
['Data Engineer' 'Data Engineer (Junior)' 'Junior Power BI Engineer'
 'Senior Data Engineer' 'Reporting Data Engineer' 'Azure Data Engineer'
 'Data Engineer - London' 'BI & Data Engineer'
 'Python Data Engineer (Remote)' 'Data Engineer Hybrid London'
 'Data Engineer | UK/Europe | Remote | £50-70K' 'Data Pipeline Engineer'
 'Microsoft Azure Data engineer' 'Apprentice Data Engineer'
 'Data Engineer (GIS/Geospatial)' 'Graduate Data Engineer - Bristol'
 'Experienced Data Engineer' 'MLOPS & Data Engineer (12 Months)'
 'Placement Student – Clinical Data Engineer'
 'Data Processor / Data Engineer' 'Junior Data Engineer'
 'Senior Data Engineer Full Time Permanent' 'Data Cabling Engineer'
 'Junior SQL Data Engineer' 'Trading Desk - Data Engineer'
 'Data Software Engineer | Python' 'BI Engineer'
 'Data Engineer, MS Partner, Redhill Full Time Permanent'
 'Data Analytics Engineer' 'Data Engineer - Remote' 'Power BI Engineer'
 'Data Engineer - SQL / ETL / AWS - Remote' 'Data Model Engineer'
 '

In [114]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title']),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

53 :
['Data Engineer' 'Data Engineer (Junior)' 'Junior Power BI Engineer'
 'Senior Data Engineer' 'Reporting Data Engineer' 'Azure Data Engineer'
 'Data Engineer - London' 'BI & Data Engineer'
 'Python Data Engineer (Remote)' 'Data Engineer Hybrid London'
 'Data Engineer | UK/Europe | Remote | £50-70K' 'Data Pipeline Engineer'
 'Microsoft Azure Data engineer' 'Apprentice Data Engineer'
 'Data Engineer (GIS/Geospatial)' 'Graduate Data Engineer - Bristol'
 'Experienced Data Engineer' 'MLOPS & Data Engineer (12 Months)'
 'Placement Student – Clinical Data Engineer'
 'Data Processor / Data Engineer' 'Junior Data Engineer'
 'Senior Data Engineer Full Time Permanent' 'Data Cabling Engineer'
 'Junior SQL Data Engineer' 'Trading Desk - Data Engineer'
 'Data Software Engineer | Python' 'BI Engineer'
 'Data Engineer, MS Partner, Redhill Full Time Permanent'
 'Data Analytics Engineer' 'Data Engineer - Remote' 'Power BI Engineer'
 'Data Engineer - SQL / ETL / AWS - Remote' 'Data Model Engineer'
 '

In [115]:
dfs[selector] = df

##### 7.22 USA 🦅

In [116]:
selector = 'United_States'

In [117]:
show_unique_and_its_len(dfs[selector]['Job_title'])

65 :
['Data Engineer' 'Data Engineer - Terraform' 'Snowflake Data Engineer'
 'Data Engineer (MDM)' 'AWS Data Engineer' 'DATA ENGINEER'
 'Big Data Engineer' 'Sr. Data Engineer' 'Data Engineer - Flink'
 'Jr. Data Engineer' 'Data Engineer - Remote' 'Data Engineer (L5)'
 'Software Data Engineer' 'GCP Data Engineer' 'Senior Data Engineer'
 'Azure Cloud Data Engineer' 'GCP DATA ENGINEER' 'Data Test Engineer'
 'Azure Data Engineer' 'Senior Azure Data Bricks Engineer'
 'Data Analytics Engineer' 'Data Engineer (W2 and onsite)'
 'Senior Big Data Engineer' 'Data Engineer- Google Cloud'
 'Data Engineer (ETL & System Administration concentration)'
 'Data Engineer/Data Analyst' 'Data Engineer/Data Scientist'
 'ETL Data Engineer' 'Lead Data Engineer'
 'Sr. Data Engineer with Snowflake' 'Junior Data Engineer'
 'Senior Data Engineer - Remote' 'Data Engineer Level 3'
 'Cloud Data Engineer (Azure)' 'Senior Software Engineer, Data'
 'Technical Support Engineer (L5) - Data Platform, Big Data / Analytics'
 

In [118]:
df = dfs[selector]
filtered_df = df[df.apply(
    lambda row: is_data_engineering_job(
        row['Job_title'],
        [specializations_ES],
        [data_terms_ES]
    ),
    axis=1)
]

show_unique_and_its_len(filtered_df['Job_title'])

65 :
['Data Engineer' 'Data Engineer - Terraform' 'Snowflake Data Engineer'
 'Data Engineer (MDM)' 'AWS Data Engineer' 'DATA ENGINEER'
 'Big Data Engineer' 'Sr. Data Engineer' 'Data Engineer - Flink'
 'Jr. Data Engineer' 'Data Engineer - Remote' 'Data Engineer (L5)'
 'Software Data Engineer' 'GCP Data Engineer' 'Senior Data Engineer'
 'Azure Cloud Data Engineer' 'GCP DATA ENGINEER' 'Data Test Engineer'
 'Azure Data Engineer' 'Senior Azure Data Bricks Engineer'
 'Data Analytics Engineer' 'Data Engineer (W2 and onsite)'
 'Senior Big Data Engineer' 'Data Engineer- Google Cloud'
 'Data Engineer (ETL & System Administration concentration)'
 'Data Engineer/Data Analyst' 'Data Engineer/Data Scientist'
 'ETL Data Engineer' 'Lead Data Engineer'
 'Sr. Data Engineer with Snowflake' 'Junior Data Engineer'
 'Senior Data Engineer - Remote' 'Data Engineer Level 3'
 'Cloud Data Engineer (Azure)' 'Senior Software Engineer, Data'
 'Technical Support Engineer (L5) - Data Platform, Big Data / Analytics'
 

In [119]:
dfs[selector] = df

### 8. Add job title seniority

In [None]:
def get_seniority(job_title:str):

    seniority = {
        'Junior' : ["Jr.", "Junior"],
        'Mid' : ["Mid", "Middle"],
        'Senior': ["Sr.", "Senior"],
        'Lead': "Lead",
        'Principle' : "Principle"
    }
    
    if seniority['Junior'][0] in job_title or seniority['Junior'][1] in job_title :
        return "Junior"
    elif seniority['Mid'][0] in job_title or seniority['Mid'][1] in job_title :
        return "Mid"
    elif seniority['Senior'][0] in job_title or seniority['Senior'][1] in job_title :
        return "Senior"
    elif seniority['Lead'] in job_title:
        return "Lead"
    elif seniority['Principle'] in job_title:
        return "Principle"
    else:
        return np.nan
    
df['Seniority'] = df['Job_title'].apply(get_seniority)

del get_seniority

df['Seniority'].value_counts()

Add non-standard seniority

In [None]:
def apply_seniority_level(df, job_title, company_name, seniority_level):
    df['Seniority'] = df.apply(
        lambda row: seniority_level if row['Job_title'] == job_title and row['Company_name'] == company_name else row['Seniority'],
        axis=1
    )

apply_seniority_level(df, "Data Engineer (L5)", "Netflix", "Senior")
apply_seniority_level(df, "Technical Support Engineer (L5) - Data Platform, Big Data / Analytics", "Netflix", "Senior")
apply_seniority_level(df, "Data Engineer Level 3", "Infoorigin Inc", "Mid")
apply_seniority_level(df, "Data Engineer IC4 - US ONLY", "Braintrust", "Lead")
apply_seniority_level(df, "ETL Engineer/ Data Analyst - Software Engineer III", "JPMorgan Chase Bank, N.A.", "Senior")
apply_seniority_level(df, "Software Engineer III (AI, Data, Python)", "JPMorgan Chase Bank, N.A.", "Senior")
apply_seniority_level(df, "Data Engineer 925", "Certec Consulting", "Senior")

del apply_seniority_level

df['Seniority'].value_counts()


### 8. Parse salary

#### 8.1 Employer provided salary

In [None]:
df['Salary_employer_provided'] = df['Salary'].apply(lambda salary : True if isinstance(salary, str) and "Employer Provided Salary" in salary else False)
df['Salary_employer_provided'].value_counts()

#### 8.2 Salary per hour

In [None]:
df['Salary_hourly'] = df['Salary'].apply(lambda salary : True if isinstance(salary, str) and "Per Hour" in salary else False)
df['Salary_hourly'].value_counts()

#### 8.3 Salary min

In [None]:
def get_salary_min(salary):

    if isinstance(salary, str):

        pattern_salary = r"(\d+(\.\d+)?K?)"
        match_min: str = re.findall(pattern_salary, salary)[0][0]

        if "K" in match_min:
            match_min = float(match_min.replace("K", ""))
            match_min *= 1000

        return float(match_min)

    else:

        return salary
    
def calculate_yearly_income(hourly_rate):

    hours_per_week = 40
    WEEKS_PER_YEAR = 52
    HOURS_PER_YEAR = WEEKS_PER_YEAR * hours_per_week
    gross_income = hourly_rate * HOURS_PER_YEAR
    return gross_income

df['Salary_min'] = df['Salary'].apply(get_salary_min)
df['Salary_min'] = df.apply(
        lambda row: calculate_yearly_income(row['Salary_min']) if row['Salary_hourly'] == True else row['Salary_min'],
        axis=1
    )

del get_salary_min

df['Salary_min']

#### 8.4 Salary max

In [None]:
def get_salary_max(salary):

    if isinstance(salary, str):

        pattern_salary = r"(\d+(\.\d+)?K?)"
        match_max: str = re.findall(pattern_salary, salary)[-1][0]

        if "K" in match_max:
            match_max = float(match_max.replace("K", ""))
            match_max *= 1000

        return float(match_max)

    else:

        return salary

df['Salary_max'] = df['Salary'].apply(get_salary_max)
df['Salary_max'] = df.apply(
        lambda row: calculate_yearly_income(row['Salary_max']) if row['Salary_hourly'] == True else row['Salary_max'],
        axis=1
    )

del get_salary_max

df['Salary_max']

In [None]:
# Cleanup

del calculate_yearly_income

#### 8.5 Salary currency 

In [None]:
def get_currency(salary: str):

    if isinstance(salary, str):

        pattern_currency = r"(.+?(?=\d))"

        if "Employer Provided Salary" in salary:
            pattern_currency = r"(\:.+?(?=\d))"

        matched = re.search(pattern_currency, salary)

        currency = matched.group(1).strip().replace(":", "")

        return currency

    else:

        return salary
    
df['Salary_currency'] = df['Salary'].apply(get_currency)
    
del get_currency
    
df['Salary_currency'].value_counts()

In [None]:
del df['Salary']

#### 8.6 Salary average

In [None]:
df['Salary_avg'] = (df['Salary_max']+df['Salary_min'])/2
df['Salary_avg']

### 9. Employees

In [None]:
df['Employees'].value_counts()

### 10. Type of ownership

In [None]:
df['Type_of_ownership'].value_counts()

### 11. Sector

In [None]:
df['Sector'].value_counts()

### 12. Industry

In [None]:
df['Industry'].value_counts()

### 13. Company age

In [None]:
import datetime

year = datetime.date.today().year

df['Company_age'] = df['Founded'].apply(lambda x: x if np.isnan(x) else int(year - x))
df['Company_age'] = df['Company_age']

del df['Founded'], year

df['Company_age'].value_counts()

### 14. Job age

In [None]:
np.sort(df['Job_age'].unique())

In [None]:
def clean_job_age(job_age):

    if job_age == "24h":
        job_age = "1d"
    elif job_age == "30d+":
        job_age = "31d"

    return int(job_age.replace("d", ""))

df['Job_age'] = df['Job_age'].apply(clean_job_age)

del clean_job_age
df['Job_age'].value_counts()


### 15. Revenue

In [None]:
df['Revenue_USD'].value_counts()

### 16. Preview columns so far

In [None]:
df.dtypes

### 17. Change columns order

##### 17.1 move salary values

In [None]:
def move_column__to_index(column_name: str, index: int):
    df.insert(index, column_name, df.pop(column_name))


def move_columns_to_index(column_names: list[str], index: int):
    for col in column_names:
        df.insert(index, col, df.pop(col))
        index += 1

move_columns_to_index([
    'Salary_min', 
    'Salary_max', 
    'Salary_avg', 
    'Salary_currency',
    'Salary_employer_provided', 
    'Salary_hourly'
    ], 3
    )

df.dtypes

##### 17.2 Move Seniority

In [None]:
move_column__to_index('Seniority', 3)
df.dtypes

##### 17.3 Move City, State

In [None]:
move_columns_to_index(['City', 'State'], 11)
df.dtypes

##### 17.4 Move Company age

In [None]:
move_column__to_index('Company_age', 19)
df.dtypes

##### 17.5 Move Work/Life_balance 

In [None]:
move_columns_to_index(['Senior_management', 'Work/Life_balance'], 25)
df.dtypes

## 18. Technology requirements - parsing the job description

##### 19 Git and code repositories

In [None]:
def check_repo(job_description: str):

    git_platforms = [
        r"Github", 
        r"GitLab", 
        r"Bitbucket", 
        r"SourceForge", 
        r"Launchpad", 
        r"Google Cloud Source Repositories",
        r"AWS CodeCommit",
        r"GitBucket",
        r"Gogs",
        r"Gitea",
        r"Apache Allura",
        r"RhodeCode",
        r"ONEDEV",
        r"Codeberg",
        r"Git" # IMPORTANT, it has to be last!
        ]
    
    for platform in git_platforms:
        if re.search((r"\b" + platform + r"\b"), job_description, re.IGNORECASE):
            return platform
        
    return np.nan
        
df['Git'] = df['Description'].apply(check_repo)

del check_repo

df['Git'].value_counts()


In [None]:
def make_is_tech(cloud_names: list[str]):

    def is_tech(job_description: str):

        
        for cloud in cloud_names:
            if re.search((r"\b" + cloud + r"\b"), job_description, re.IGNORECASE):
                return True
            
        return False
    
    return is_tech

In [None]:
def add_is_needed_column_to_df(column_name: str, tech_names: list[str]):

    df[column_name] = df['Description'].apply(make_is_tech(tech_names))


#### 20. Cloud Platforms

##### 20.1 AWS


Provides on-demand cloud computing platforms and APIs to individuals, companies, and governments, on a metered, pay-as-you-go basis. Often times, clients will use this in combination with autoscaling.

In [None]:
cloud_names = [
    r"Amazon Web Services", 
    r"AWS",
    ]

column_name = 'AWS'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.2 Microsoft Azure

A cloud computing platform operated by Microsoft that provides access, management, and development of applications and services via around the world-distributed data centers.

In [None]:
cloud_names = [
    r"Microsoft Azure", 
    r"Azure",
    ]

column_name = 'Microsoft_Azure'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.3 GCP

A suite of cloud computing services that runs on the same infrastructure that Google uses internally for its end-user products, such as Google Search, Gmail, Google Drive, and YouTube.

In [None]:
cloud_names = [
    r"Google Cloud Platform", 
    r"GCP",
    ]

column_name = 'GPC'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.4 Alibaba Cloud

Alibaba Cloud provides cloud computing services to online businesses and Alibaba's own e-commerce ecosystem.

In [None]:
cloud_names = [
    r"Alibaba Cloud", 
    r"Aliyun",
    ]

column_name = 'Alibaba_Cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.4 Oracle Cloud

Providing servers, storage, network, applications and services through a global network of Oracle Corporation managed data centers

In [None]:
cloud_names = [
    r"Oracle Cloud", 
    r"OCI",
    ]

column_name = 'Oracle_Cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.5 IBM Cloud

A set of cloud computing services for business

In [None]:
cloud_names = [
    r"IBM Cloud", 
    r"Kyndryl",
    r"Bluemix"
    ]

column_name = 'IBM_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.6 Tencent Cloud

Tencent Cloud provides businesses across the globe with stable and secure industry-leading cloud products and services, leveraging technological advancements such as cloud computing, Big Data, AI, IoT and network security.

In [None]:
cloud_names = [
    r"Tencent Cloud",
    ]

column_name = 'Tencent_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.8 OVHcloud

A French cloud computing company which offers VPS, dedicated servers and other web services

In [None]:
cloud_names = [
    r"OVHcloud",
    r"OVH"
    ]

column_name = 'OVHcloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.9 DigitalOcean

A cloud hosting provider that offers cloud computing services and Infrastructure as a Service (IaaS). Known for pricing and scalability

In [None]:
cloud_names = [
    r"DigitalOcean"
    ]

column_name = 'DigitalOcean_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.10 Linode

An American cloud hosting provider that focused on providing Linux-based virtual machines, cloud infrastructure, and managed services.

In [None]:
cloud_names = [
    r"Linode",
    r"Akamai"
    ]

column_name = 'Lincode_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

In [None]:
del cloud_names

#### 21. Relational Database Management Systems (RDBMS)

##### 21.1 PostgreSQL
Can be used as a data store for big data solutions.
Postgres, is a free and open-source relational database management system (RDBMS) emphasizing extensibility and SQL compliance. <br>
PostgreSQL features transactions with Atomicity, Consistency, Isolation, Durability (ACID) properties, automatically updatable views, materialized views, triggers, foreign keys, and stored procedures. <br> It is designed to handle a range of workloads, from single machines to data warehouses or Web services with many concurrent users. 

In [None]:
tool_names = [
    r"PostgreSQL",
    r"Postgres"
    ]

column_name = 'PostgreSQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.2 Microsoft SQL Server


A software product with the primary function of storing and retrieving data as requested by other software applications—which may run either on the same computer or on another computer across a network (including the Internet).

In [None]:
tool_names = [
    r"Microsoft SQL",
    r"SQL Server"
    ]

column_name = 'Microsoft_SQL_Server'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.3 MySQL

An open-source relational database management system.

In [None]:
tool_names = [
    r"MySQL"
    ]

column_name = 'MySQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.4 IBM Db2 warehouse

A family of data management products, including database servers, developed by IBM. It initially supported the relational model, but was extended to support object–relational features and non-relational structures like JSON and XML.

In [None]:
tool_names = [
    r"Db2",
    r"IBMDb2"
    ]

column_name = 'IBM_Db2'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.5. Oracle PL/SQL

 A procedural language designed specifically to embrace SQL statements within its syntax. PL/SQL program units are compiled by the Oracle Database server and stored inside the database. And at run-time, both PL/SQL and SQL run within the same server process, bringing optimal efficiency

In [None]:
tool_names = [
    r"PL/SQL",
    r"PL / SQL",
    r"Procedural Language for SQL"
    ]

column_name = 'Oracle_PL_SQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 22. NoSQL Database Management Systems

##### 22.1 MongoDB

A source-available cross-platform document-oriented database program. Classified as a NoSQL database program, MongoDB uses JSON-like documents with optional schemas

In [None]:
tool_names = [
    r"MongoDB",
    r"Mongo DB",
    ]

column_name = 'MongoDB'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.2 Cassandra

A free and open-source, distributed, wide-column store, NoSQL database management system designed to handle large amounts of data across many commodity servers, providing high availability with no single point of failure

In [None]:
tool_names = [
    r"Cassandra",
    ]

column_name = 'Cassandra'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.3 Amazon DynamoDB

A proprietary NoSQL database service that supports key–value and document data structures and is offered by Amazon.com as part of the Amazon Web Services portfolio.

In [None]:
tool_names = [
    r"DynamoDB",
    r"Dynamo DB",
    r"SimpleDB"
    ]

column_name = 'Amazon_DynamoDB'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 22.4 Neo4j

A graph database management system developed by Neo4j, Inc. Described by its developers as an ACID-compliant transactional database with native graph storage and processing

In [None]:
tool_names = [
    r"Neo4j"
    ]

column_name = 'Neo4j'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.5 Apache Solr

An open-source enterprise-search platform, written in Java. Its major features include full-text search, hit highlighting, faceted search, real-time indexing, dynamic clustering, database integration, NoSQL features[2] and rich document (e.g., Word, PDF) handling.

In [None]:
tool_names = [
    r"Solr"
    ]

column_name = 'Apache_Solr'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 22. Data warehousing and Analytics

##### 22.1 Amazon Redshift

A data warehouse product which forms part of the larger cloud-computing platform Amazon Web Services. It is built on top of technology from the massive parallel processing data warehouse company ParAccel, to handle large scale data sets and database migrations.

In [None]:
tool_names = [
    r"Redshift",
    ]

column_name = 'Amazon_Redshift'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.2 Google BigQuery

A serverless data warehouse that enables scalable analysis over petabytes of data. It is a Platform as a Service that supports querying using ANSI SQL. It also has built-in machine learning capabilities.

In [None]:
tool_names = [
    r"BigQuery",
    ]

column_name = 'Google_BigQuery'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.3 Snowflake

Snowflake enables data storage, processing, and analytic solutions.

In [None]:
tool_names = [
    r"Snowflake"
    ]

column_name = 'Snowflake'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.4 Oracle Exadata

Designed to run Oracle Database workloads, such as an OLTP application running simultaneously with Analytics processing. Historically, specialized database computing platforms were designed for a particular workload, such as Data Warehousing, and poor or unusable for other workloads, such as OLTP. 

In [None]:
tool_names = [
    r"Exadata"
    ]

column_name = 'Oracle_Exadata'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.5 SAP HANA

A multi-model database that stores data in its memory instead of keeping it on a disk.

In [None]:
tool_names = [
    r"HANA"
    ]

column_name = 'SAP_HANA'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.6 Teradata

It is mainly suitable for building large scale data warehousing applications.

In [None]:
tool_names = [
    r"Teradata"
    ]

column_name = 'Teradata'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 23. Data Integration and Processing

##### 23.1 Informatica PowerCenter - Data integration tool


Used extensively for ETL operations, data quality, data masking, data replication, data virtualization, and master data management services.

In [None]:
tool_names = [
    r"PowerCenter",
    r"Power Center",
    ]

column_name = 'Informatica_PowerCenter'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 23.2 DataBricks - Data processing and analytics platform

A unified set of tools for building, deploying, sharing, and maintaining enterprise-grade data solutions at scale. 

In [None]:
tool_names = [
    r"Data Bricks",
    r"Databricks"
    ]

column_name = 'Databricks'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 23.3 Presto - Query engine

 A distributed query engine for big data using the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows use of multiple data sources

In [None]:
tool_names = [
    r"Presto",
    r"PrestoDB",
    r"PrestoSQL"
    ]

column_name = 'Presto'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 24. Stream processing tools

##### 24.1 Apache Kafka

An open-source system, distributed event store and stream-processing platform. The project aims to provide a unified, high-throughput, low-latency platform for handling real-time data feeds.

In [None]:
tool_names = [
    r"Kafka",
    ]

column_name = 'Apache_Kafka'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.2 Apache Flink

Process data streams at a large scale and to deliver real-time analytical insights about your processed data with your streaming application.

In [None]:
tool_names = [
    r"Flink",
    ]

column_name = 'Apache_Flink'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.3 Dataflow


Dataflow is a managed service provided by Google Cloud for building and executing data processing pipelines. It enables developers to create scalable and efficient batch and streaming data pipelines using a simple programming model.

In [None]:
tool_names = [
    r"Dataflow",
    ]

column_name = 'Dataflow'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 25 Workflow orchestration tools

##### 25.1 Apache Airflow

Apache Airflow is an open-source platform used for programmatically creating, scheduling, and monitoring complex workflows or data pipelines. It allows users to define and execute a sequence of tasks or operations, while providing tools for tracking and troubleshooting workflow executions.

In [None]:
tool_names = [
    r"Airflow",
    ]

column_name = 'Apache_Airflow'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 25.2 Luigi

Luigi is a Python-based open-source workflow management system that helps to build complex pipelines of batch jobs. It provides a flexible and extensible architecture to create and manage complex data workflows.

In [None]:
tool_names = [
    r"Luigi",
    ]

column_name = 'Luigi'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 25.3 SSIS

SQL Server Integration Services (SSIS) is a Microsoft tool used for building data integration and ETL (extract, transform, load) workflows. It allows users to perform a range of tasks such as data extraction, transformation, and loading from various sources to different destinations.

In [None]:
tool_names = [
    r"SSIS",
    r"SQL Server Integration Services"
    ]

column_name = 'SSIS'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 26. Big Data processing

##### 24.1 Apache Hadoop

Apache Hadoop is an open-source framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It provides a distributed file system and supports various distributed computing models, such as MapReduce and Spark, for processing and analyzing large data sets.

In [None]:
tool_names = [
    r"Hadoop",
    ]

column_name = 'Apache_Hadoop'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.2 Apache Hive


Apache Hive is a data warehouse software that facilitates querying and managing large datasets stored in Hadoop file systems using a SQL-like language called HiveQL. It provides a high-level interface for data analysts and developers to analyze, transform, and summarize data stored in Hadoop Distributed File System (HDFS) and other compatible storage systems.

In [None]:
tool_names = [
    r"Hive",
    ]

column_name = 'Apache_Hive'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.3 Apache Spark

Apache Spark is a distributed computing framework designed to process large-scale data processing and analysis workloads in parallel. It can be used for batch processing, real-time stream processing, machine learning, and graph processing, among other things.

In [None]:
tool_names = [
    r"Spark",
    r"PySpark"
    ]

column_name = 'Apache_Spark'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

In [None]:
df.dtypes

#### 25. Linux

Family of Unix-like operating systems.

In [None]:
tool_names = [
    r"Linux",
    ]

column_name = 'Linux'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 26. Programming languages

##### 26.1 Python

Python is a high-level, interpreted programming language used for various purposes such as web development, data analysis, artificial intelligence, and more.

In [None]:
tool_names = [
    r"Python",
    ]

column_name = 'Python'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.2 R

A programming language and environment for statistical graphics and computing.

In [None]:
tool_names = [
    r"R",
    r"RStudio"
    ]

column_name = 'R'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.3 Scala

Scala is a high-level, statically typed programming language designed for functional programming and scalable, concurrent applications.

In [None]:
tool_names = [
    r"Scala"
    ]

column_name = 'Scala'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.4 SQL

A programming language used to manage and manipulate relational databases.

In [None]:
tool_names = [
    r"SQL",
    r"MySQL",
    r"PostgreSQL",
    r"Postgres",
    r"SQLite",
    r"MariaDB",
    r"IBM DB2",
    r"Oracle Database",
    r"Db2",
    ]

column_name = 'SQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.5 Java

Java is a high-level, object-oriented programming language widely used for developing robust and scalable enterprise applications.

In Data Science, Java can be used for developing machine learning models, data analysis, and data processing applications, as well as for building large-scale distributed systems for big data processing and management.

In [None]:
tool_names = [
    r"Java",
    ]

column_name = 'Java'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.6 C++

A general-purpose programming language designed for systems and application programming, and it is used in Data Science for building high-performance libraries and applications that require intensive computational tasks.

In [None]:
tool_names = [
    r"C\+\+",
    ]

column_name = 'C++'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.7 Go

A statically typed programming language designed for building simple, efficient, and reliable software, and it can be used in data engineering for building scalable, distributed systems for data processing and analysis.

In [None]:
tool_names = [
    r"Go language", # Go as separate word is too common in English
    r"Golang",
    ]

column_name = 'Go'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.8 Bash

A shell scripting language used for automating repetitive tasks and managing the operating system, including data processing tasks, in the command-line interface (CLI) on Unix and Unix-like systems.

In [None]:
tool_names = [
    r"Bash",
    ]

column_name = 'Bash'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.9 Powershell

A task automation and configuration management framework from Microsoft, which can be used in Data Science for automating various data processing tasks on Windows machines in the command-line interface (CLI).

In [None]:
tool_names = [
    r"PowerShell",
    r"DOS Shell"
    ]

column_name = 'PowerShell'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.10 CLI

CLI stands for Command Line Interface, which is a way to interact with a computer program through text commands, and it is commonly used in Data Science for running scripts, automating tasks, and managing software packages.

In [None]:
tool_names = [
    r"CLI",
    r"Command Line Interface"
    ]

column_name = 'CLI'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 27. Virtualization Tools

Business intelligence and data visualization tools used for analyzing and visualizing data.

##### 27.1 Tableau

In [None]:
tool_names = [
    r"Tableau"
    ]

column_name = 'Tableau'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.2 Power BI

In [None]:
tool_names = [
    r"Power BI"
    ]

column_name = 'Power_BI'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.3 Google Analytics

In [None]:
tool_names = [
    r"Google Analytics"
    ]

column_name = 'Google_Analytics'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.4 QlikView

In [None]:
tool_names = [
    r"QlikView",
    r"Qlik"
    ]

column_name = 'QlikView'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.5 Oracle BI server

In [None]:
tool_names = [
    r"Oracle Business Intelligence Enterprise Edition",
    r"OBIEE",
    r"Oracle BI server",
    ]

column_name = 'Oracle_BI_server'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.6 SAS Analytics

In [None]:
tool_names = [
    r"SAS Analytics",
    r"Statistical Analysis System",
    ]

column_name = 'SAS_Analytics'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.7 Lumira

In [None]:
tool_names = [
    r"Lumira",
    ]

column_name = 'Lumira'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.8 IBM Cognos Impromptu

In [None]:
tool_names = [
    r"Cognos Impromptu",
    ]

column_name = 'Cognos_Impromptu'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.9 MicroStrategy

In [None]:
tool_names = [
    r"MicroStrategy",
    ]

column_name = 'MicroStrategy'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.10 InsightSquared

In [None]:
tool_names = [
    r"InsightSquared",
    ]

column_name = 'InsightSquared'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.11 Sisense

In [None]:
tool_names = [
    r"Sisense",
    ]

column_name = 'Sisense'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.12 Dundas BI

In [None]:
tool_names = [
    r"Dundas BI",
    ]

column_name = 'Dundas_BI'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.13 Domo

In [None]:
tool_names = [
    r"Domo",
    ]

column_name = 'Domo'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 27.14 Looker

In [None]:
tool_names = [
    r"Looker",
    ]

column_name = 'Looker'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 28. Microsoft Excel

In [None]:
tool_names = [
    r"Excel",
    ]

column_name = 'Excel'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 29. Certifications

Checking if there is a need for any certification.

In [None]:
# Coursera, Udemy, Datacamp etc. list
tool_names = [
    r"Certificates",
    r"Certificate",
    r"Data Engineering, Big Data, and Machine Learning on GCP",
    r"Google Professional Data Engineer",
    r"Microsoft Azure Data Engineering",
    r"Data Engineer.+Nanodegree",
    r"DataCamp",
    r"Data Engineering, Big Data, and Machine Learning on GCP",
    r"Python, Bash and SQL Essentials for Data Engineering Specialization",
    r"Data Engineering ETL, Web Scraping, and Automation",
    r"Big Data Engineering with Hadoop and Spark"
    ]

column_name = 'Is_certificate'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 30. Needed education level

##### 30.1 BA

In [None]:
tool_names = [
    r"BA",
    r"Bachelor",
    r"BSc",
    r"Bachelors"
    ]

column_name = 'BA'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 30.2 MS

In [None]:
tool_names = [
    r"MS",
    r"MSc",
    r"Master",
    r"Masters",
    r"master\'s"
    ]

column_name = 'MS'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 30.3 Phd

In [None]:
tool_names = [
    r"Phd",
    r"Ph\.D",
    r"DPhil",
    r"Doctor of Philosophy",
    ]

column_name = 'Phd'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

### Overview

In [None]:
df.shape

In [None]:
columns_names = df.columns
columns_names

In [None]:
del columns_names, column_name, tool_names

### 31. Final cleanup

##### 31.1 Rename columns

In [None]:
df = df.rename({
    'Company_name': 'Name',
    'Job_title': 'Title',
    'Salary_min': 'Min',
    'Salary_max': 'Max',
    'Salary_avg': 'Avg',
    'Salary_currency': 'Currency',
    'Salary_employer_provided': 'Employer_provided',
    'Salary_hourly': 'Is_hourly',
    'Alibaba_Cloud': 'Alibaba',
    'Oracle_Cloud': 'Oracle',
    'IBM_cloud': 'IBM',
    'Tencent_cloud': 'Tencent',
    'DigitalOcean_cloud': 'DigitalOcean',
    'Lincode_cloud': 'Lincode'
    }, axis=1)


##### 31.2 Change columns order

In [None]:
def move_column__to_index(column_name: str, index: int):
    df.insert(index, column_name, df.pop(column_name))


def move_columns_to_index(column_names: list[str], index: int):
    for col in column_names:
        df.insert(index, col, df.pop(col))
        index += 1

move_columns_to_index([                       
                    'Title',
                    'Description',
                    'Seniority',
                    'City',
                    'State',
                    'Job_age',
                    'Easy_apply',
                    'Min',
                    'Max',
                    'Avg',
                    'Currency',
                    'Employer_provided',
                    'Is_hourly',
                    'Name',
                    'Rating',
                    'Employees',
                    'Type_of_ownership',
                    'Sector',
                    'Industry',
                    'Company_age',
                    'Revenue_USD',
                    'Friend_recommend',
                    'CEO_approval',
                    'Career_opportunities',
                    'Comp_&_benefits',
                    'Senior_management',
                    'Work/Life_balance',
                    'Culture_&_values',
                    'Pros',
                    'Cons',
                    'Benefits_rating',
                    'Benefits_reviews',
                    'BA',
                    'MS',
                    'Phd',
                    'Is_certificate',
                    'Git',
                    'AWS',
                    'Microsoft_Azure',
                    'GPC',
                    'Alibaba',
                    'Oracle',
                    'IBM',
                    'Tencent',
                    'OVHcloud',
                    'DigitalOcean',
                    'Lincode',
                    'PostgreSQL',
                    'Microsoft_SQL_Server',
                    'IBM_Db2',
                    'MySQL',
                    'Oracle_PL_SQL',
                    'MongoDB',
                    'Cassandra',
                    'Amazon_DynamoDB',
                    'Neo4j',
                    'Apache_Solr',
                    'Amazon_Redshift',
                    'Google_BigQuery',
                    'Snowflake',
                    'Oracle_Exadata',
                    'SAP_HANA',
                    'Teradata',
                    'Informatica_PowerCenter',
                    'Databricks',
                    'Presto',
                    'Apache_Kafka',
                    'Apache_Flink',
                    'Dataflow',
                    'Apache_Airflow',
                    'Luigi',
                    'SSIS',
                    'Apache_Hadoop',
                    'Apache_Hive',
                    'Apache_Spark',
                    'Linux',
                    'Python',
                    'R',
                    'Scala',
                    'SQL',
                    'Java',
                    'C++',
                    'Go',
                    'Bash',
                    'PowerShell',
                    'CLI',
                    'Tableau',
                    'Power_BI',
                    'Google_Analytics',
                    'QlikView',
                    'Oracle_BI_server',
                    'SAS_Analytics',
                    'Lumira',
                    'Cognos_Impromptu',
                    'MicroStrategy',
                    'InsightSquared', 
                    'Sisense', 
                    'Dundas_BI',
                    'Domo', 
                    'Looker', 
                    'Excel'
                    ],0
    )

df.dtypes

##### 31.3 Add multiindex

In [None]:
df.columns = pd.MultiIndex.from_tuples([
                                        ('Job_details', 'Title'),
                                        ('Job_details', 'Description'),
                                        ('Job_details', 'Seniority'),
                                        ('Job_details', 'City'),
                                        ('Job_details', 'State'),
                                        ('Job_details', 'Job_age'),
                                        ('Job_details', 'Easy_apply'),
                                        ('Salary', 'Min'),
                                        ('Salary', 'Max'),
                                        ('Salary', 'Avg'),
                                        ('Salary', 'Currency'),
                                        ('Salary', 'Employer_provided'),
                                        ('Salary', 'Is_hourly'),
                                        ('Company_info', 'Name'),
                                        ('Company_info', 'Rating'),
                                        ('Company_info', 'Employees'),
                                        ('Company_info', 'Type_of_ownership'),
                                        ('Company_info', 'Sector'),
                                        ('Company_info', 'Industry'),
                                        ('Company_info', 'Company_age'),
                                        ('Company_info', 'Revenue_USD'),
                                        ('Company_info', 'Friend_recommend'),
                                        ('Company_info', 'CEO_approval'),
                                        ('Company_info', 'Career_opportunities'),
                                        ('Company_info', 'Comp_&_benefits'),
                                        ('Company_info', 'Senior_management'),
                                        ('Company_info', 'Work/Life_balance'),
                                        ('Company_info', 'Culture_&_values'),
                                        ('Company_info', 'Pros'),
                                        ('Company_info', 'Cons'),
                                        ('Company_info', 'Benefits_rating'),
                                        ('Company_info', 'Benefits_reviews'),
                                        ('Education', 'BA'),
                                        ('Education', 'MS'),
                                        ('Education', 'Phd'),
                                        ('Education', 'Is_certificate'),
                                        ('Version_control', 'Git'),
                                        ('Cloud_platforms', 'AWS'),
                                        ('Cloud_platforms', 'Microsoft_Azure'),
                                        ('Cloud_platforms', 'GPC'),
                                        ('Cloud_platforms', 'Alibaba'),
                                        ('Cloud_platforms', 'Oracle'),
                                        ('Cloud_platforms', 'IBM'),
                                        ('Cloud_platforms', 'Tencent'),
                                        ('Cloud_platforms', 'OVHcloud'),
                                        ('Cloud_platforms', 'DigitalOcean'),
                                        ('Cloud_platforms', 'Lincode'),
                                        ('RDBMS', 'PostgreSQL'),
                                        ('RDBMS', 'Microsoft_SQL_Server'),
                                        ('RDBMS', 'IBM_Db2'),
                                        ('RDBMS', 'MySQL'),
                                        ('RDBMS', 'Oracle_PL_SQL'),
                                        ('NOSQL', 'MongoDB'),
                                        ('NOSQL', 'Cassandra'),
                                        ('NOSQL', 'Amazon_DynamoDB'),
                                        ('NOSQL', 'Neo4j'),
                                        ('Search_&_Analytics', 'Apache_Solr'),
                                        ('Search_&_Analytics', 'Amazon_Redshift'),
                                        ('Search_&_Analytics', 'Google_BigQuery'),
                                        ('Search_&_Analytics', 'Snowflake'),
                                        ('Search_&_Analytics', 'Oracle_Exadata'),
                                        ('Search_&_Analytics', 'SAP_HANA'),
                                        ('Search_&_Analytics', 'Teradata'),
                                        ('Data_integration_and_processing', 'Informatica_PowerCenter'),
                                        ('Data_integration_and_processing', 'Databricks'),
                                        ('Data_integration_and_processing', 'Presto'),
                                        ('Stream_processing_tools', 'Apache_Kafka'),
                                        ('Stream_processing_tools', 'Apache_Flink'),
                                        ('Stream_processing_tools', 'Dataflow'),
                                        ('Workflow_orchestration_tools', 'Apache_Airflow'),
                                        ('Workflow_orchestration_tools', 'Luigi'),
                                        ('Workflow_orchestration_tools', 'SSIS'),
                                        ('Big_Data_processing', 'Apache_Hadoop'),
                                        ('Big_Data_processing', 'Apache_Hive'),
                                        ('Big_Data_processing', 'Apache_Spark'),
                                        ('OS', 'Linux'),
                                        ('Programming_languages', 'Python'),
                                        ('Programming_languages', 'R'),
                                        ('Programming_languages', 'Scala'),
                                        ('Programming_languages', 'SQL'),
                                        ('Programming_languages', 'Java'),
                                        ('Programming_languages', 'C++'),
                                        ('Programming_languages', 'Go'),
                                        ('Programming_languages', 'Bash'),
                                        ('Programming_languages', 'PowerShell'),
                                        ('Programming_languages', 'CLI'),
                                        ('Business_Intelligence_Tools', 'Tableau'),
                                        ('Business_Intelligence_Tools', 'Power_BI'),
                                        ('Business_Intelligence_Tools', 'Google_Analytics'),
                                        ('Business_Intelligence_Tools', 'QlikView'),
                                        ('Business_Intelligence_Tools', 'Oracle_BI_server'),
                                        ('Business_Intelligence_Tools', 'SAS_Analytics'),
                                        ('Business_Intelligence_Tools', 'Lumira'),
                                        ('Business_Intelligence_Tools', 'Cognos_Impromptu'),
                                        ('Business_Intelligence_Tools', 'MicroStrategy'),
                                        ('Business_Intelligence_Tools', 'InsightSquared'), 
                                        ('Business_Intelligence_Tools', 'Sisense'), 
                                        ('Business_Intelligence_Tools', 'Dundas_BI'),
                                        ('Business_Intelligence_Tools', 'Domo'), 
                                        ('Business_Intelligence_Tools', 'Looker'), 
                                        ('Business_Intelligence_Tools', 'Excel'),                   
                                        ])


In [None]:
df['Company_info']['Name'].head()

In [None]:
df['Business_Intelligence_Tools']['Excel'].head()

#### 32. Save CSV

##### 32.1 Save

In [None]:
import os
from pathlib import Path
from scraper.config.get import get_config

config = get_config()

local_path = os.path.join(
    config['output_path']['main'],
    config['output_path']['clean'],
    "Data_Engineer"
    )

file_name = "Data_Engineer_United_States_06-03-2023_23-41.csv"
file_path = Path(f"{local_path}/{file_name}")

folder = os.path.dirname(file_path)
if not os.path.exists(folder):
    os.mkdir(folder)


df.to_csv(file_path, index=True)

##### 32.2 Check save

In [None]:
df_check = pd.read_csv(file_path, index_col=0, header=[0, 1])
df_check.head()

In [None]:
df_check.shape == df.shape