## Clean data

### 1. Importing all packages

In [1]:
# Standard library
import datetime
import os
import re

# External
import numpy as np
import pandas as pd

### 2. Importing the CSV files

In [2]:
CSVs_folder = "data/RAW/Data Engineer"

def get_dfs_from_CSVs_in_folder(
        directory: str,
        skip_countries: tuple[str, ...] = ("United_States",),
        ) -> dict[str, pd.DataFrame]:
    """Load every per-country "Data Engineer" CSV found under ``directory``.

    File names must contain
    ``Data_Engineer_<Country>_NN-NN-NNNN_NN-NN.csv`` (N = digit); the
    ``<Country>`` part (letters and underscores) becomes the dict key.

    Args:
        directory: Folder that is walked recursively for CSV files.
        skip_countries: Country keys that are matched but not loaded.

    Returns:
        Mapping of country name to the DataFrame read from its CSV. If
        several files share a country, the one visited last wins.
    """
    dfs = {}

    # https://regex101.com/r/QYuVDf/1
    # The dot before "csv" is escaped so it only matches a literal ".".
    pattern = r"Data_Engineer_([a-zA-Z_]+)_\d{2}-\d{2}-\d{4}_\d{2}-\d{2}\.csv"

    for root, _, files in os.walk(directory):

        # Sorted so the visiting order inside a folder is reproducible.
        for file in sorted(files):
            if not file.endswith('.csv'):
                continue

            match = re.search(pattern, file)
            if not match:
                continue

            country = match.group(1)
            if country in skip_countries:
                continue

            # Join with the folder os.walk is currently in, so both the
            # `directory` argument and nested sub-folders are honoured.
            file_path = os.path.join(root, file)
            dfs[country] = pd.read_csv(file_path)

    return dfs
                

dfs = get_dfs_from_CSVs_in_folder(CSVs_folder)
dfs

{'Austria':                          Company_name  Rating            Location  \
 0               Riverty Services GmbH     3.9              Vienna   
 1            Infineon Technologies AG     4.2             Villach   
 2            Infineon Technologies AG     4.2             Villach   
 3            Infineon Technologies AG     4.2             Villach   
 4            Infineon Technologies AG     4.2             Villach   
 ..                                ...     ...                 ...   
 865             Infineon Technologies     4.2             Villach   
 866                 Wien Energie GmbH     4.3  Gerasdorf bei Wien   
 867  Trenkwalder Personaldienste GmbH     3.9             Austria   
 868              Silicon Austria Labs     3.6             Villach   
 869                   Noir Consulting     5.0              Vienna   
 
                                              Job_title  \
 0               Process Manager (m/f/d) 80% Homeoffice   
 1    Senior Staff Engineer D

In [3]:
dfs['Austria']

Unnamed: 0,Company_name,Rating,Location,Job_title,Description,Salary,Job_age,Easy_apply,Employees,Type_of_ownership,...,CEO_approval,Career_opportunities,Comp_&_benefits,Culture_&_values,Senior_management,Work/Life_balance,Pros,Cons,Benefits_rating,Benefits_reviews
0,Riverty Services GmbH,3.9,Vienna,Process Manager (m/f/d) 80% Homeoffice,"Everything we do, starts with you.\r\nTogether...",,30d+,False,5001 to 10000,Company - Private,...,1.00,3.7,3.2,3.9,3.5,4.0,"['""Good colleagues and overall atmosphere"" (in...","['""New strategy process is taking ages and peo...",4.0,
1,Infineon Technologies AG,4.2,Villach,Senior Staff Engineer Digital Verification (f/...,You are looking for a new challenge to bring i...,,25d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
2,Infineon Technologies AG,4.2,Villach,Senior Staff Engineer Product Development for ...,Do you want to get to know the development of ...,,18d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
3,Infineon Technologies AG,4.2,Villach,Component Verification and Product Characteriz...,You are looking for a new challenge to bring i...,,25d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
4,Infineon Technologies AG,4.2,Villach,Product Application Engineer (f/m/div)*,"You enjoy working in an international team, in...",,26d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,Infineon Technologies,4.2,Villach,International Graduate Program: Senior Enginee...,Are you ready to start your career in a fast-m...,,30d,False,10000+,Company - Public,...,0.95,3.9,3.8,4.1,3.7,4.1,"['""Good work life balance and ambitious"" (in 1...","['""Descent pay, work life balance"" (in 152 rev...",3.9,"['Health Insurance (22 comments)\n""Great cover..."
866,Wien Energie GmbH,4.3,Gerasdorf bei Wien,Business Intelligence Specialist (w/m/d),Business Intelligence Specialist (w/m/d)\r\n==...,,25d,False,,,...,1.00,3.9,3.8,3.7,3.5,4.4,['No Pros have been reported by the Glassdoor ...,"['""Zu viele low performer, welche in die Pensi...",,
867,Trenkwalder Personaldienste GmbH,3.9,Austria,C++ Entwickler (m/w/d) mit Schwerpunkt Machine...,Unser Kunde (KV EMG) ist ein international tät...,,11d,False,10000+,Company - Private,...,1.00,3.5,3.1,3.8,3.5,3.7,"['""ilfe balance and career growth"" (in 3 revie...","['""Horrible communication between departments,...",,
868,Silicon Austria Labs,3.6,Villach,Cleanroom Process Engineer - Wet Processing (f...,Your future responsibilities\r\nThe process en...,,30d+,False,1 to 50,Self-employed,...,,3.2,3.2,3.5,3.4,3.9,"['""Good work environment."" (in 4 reviews)', '""...","['""Poor management."" (in 3 reviews)', '""there ...",,


### 3. Remove rows that contain only NaNs

In [6]:
# Drop rows that are entirely NaN from every country frame and record, per
# country, the (rows_before, rows_after) pair whenever rows were removed.
# NOTE(review): the preview above shows an "Unnamed: 0" column that looks like
# a saved index; if it is always filled, no row can ever be all-NaN — confirm.
differences = {}

for country_name, country_df in dfs.items():

    rows_before = country_df.shape[0]
    cleaned_df = country_df.dropna(how='all')

    # Store the cleaned frame back: rebinding only the loop variable would
    # leave `dfs` untouched. Replacing the value of an existing key keeps the
    # dict size unchanged, so it is safe while iterating.
    dfs[country_name] = cleaned_df

    rows_after = cleaned_df.shape[0]
    if rows_before != rows_after:
        differences[country_name] = (rows_before, rows_after)


differences

{}

### 4. Remove duplicates

In [None]:
# NOTE(review): `df` is not assigned by any cell above this one (the loader
# only builds the `dfs` dict), so on a fresh kernel this raises NameError.
# Confirm which frame(s) from `dfs` this and the following cells should use.
df = df.drop_duplicates()
df.shape

There is a huge number of duplicates, but this is a feature of Glassdoor.

### 5. Remove empty columns

In [None]:
df = df.dropna(axis=1, how='all')
df.shape

There are no empty columns.

#### 6. Now we will split `Location` column into `State` and `City`.

In [None]:
df['Location'].head()

In [None]:
# City = text before the first comma, trimmed. Values without a comma are kept
# whole; non-string values (NaN, None) pass through unchanged. The isinstance
# test replaces the former `x is not np.nan` identity check, which misses
# None and NaN objects other than `np.nan`.
df['City'] = df['Location'].apply(
    lambda location: location.split(',')[0].strip() if isinstance(location, str) else location
)
df['City'].head()

In [None]:
# State = text between the first and second comma, trimmed so "City, ST" gives
# "ST" rather than " ST". Locations without a comma fall back to the whole
# value, and non-string values (NaN, None) pass through unchanged.
df['State'] = df['Location'].apply(
    lambda location: location.split(',')[1].strip()
    if isinstance(location, str) and "," in location
    else location
)
df['State'].head()

In [None]:
# Cleanup
del df['Location']

### 7. Add job title seniority

In [None]:
df['Job_title'].unique()

In [None]:
def get_seniority(job_title: str):
    """Derive a seniority label from keywords contained in a job title.

    Args:
        job_title: Raw job title; non-string values (e.g. NaN) are accepted.

    Returns:
        "Junior", "Mid", "Senior", "Lead" or "Principal" for the first level
        (in that order) with a keyword in the title, otherwise ``np.nan``.
    """
    # Missing titles cannot be classified (and `in` would raise on a float).
    if not isinstance(job_title, str):
        return np.nan

    # Case-sensitive substring keywords, checked in insertion order.
    # "Principle" is kept as a tolerated misspelling of "Principal".
    seniority = {
        'Junior': ("Jr.", "Junior"),
        'Mid': ("Mid", "Middle"),
        'Senior': ("Sr.", "Senior"),
        'Lead': ("Lead",),
        'Principal': ("Principal", "Principle"),
    }

    for level, keywords in seniority.items():
        if any(keyword in job_title for keyword in keywords):
            return level

    return np.nan
    
df['Seniority'] = df['Job_title'].apply(get_seniority)

del get_seniority

df['Seniority'].value_counts()

Add non-standard seniority

In [None]:
def apply_seniority_level(df, job_title, company_name, seniority_level):
    """Override ``Seniority`` for one exact job title at one exact company.

    Args:
        df: Frame with 'Job_title', 'Company_name' and 'Seniority' columns;
            its 'Seniority' column is replaced in place.
        job_title: Job title that must match exactly.
        company_name: Company name that must match exactly.
        seniority_level: Value written to 'Seniority' on matching rows.
    """
    # Vectorized boolean mask instead of a Python-level row-by-row apply.
    matches = (df['Job_title'] == job_title) & (df['Company_name'] == company_name)

    # Rows outside the mask keep their current value.
    df['Seniority'] = df['Seniority'].mask(matches, seniority_level)

apply_seniority_level(df, "Data Engineer (L5)", "Netflix", "Senior")
apply_seniority_level(df, "Technical Support Engineer (L5) - Data Platform, Big Data / Analytics", "Netflix", "Senior")
apply_seniority_level(df, "Data Engineer Level 3", "Infoorigin Inc", "Mid")
apply_seniority_level(df, "Data Engineer IC4 - US ONLY", "Braintrust", "Lead")
apply_seniority_level(df, "ETL Engineer/ Data Analyst - Software Engineer III", "JPMorgan Chase Bank, N.A.", "Senior")
apply_seniority_level(df, "Software Engineer III (AI, Data, Python)", "JPMorgan Chase Bank, N.A.", "Senior")
apply_seniority_level(df, "Data Engineer 925", "Certec Consulting", "Senior")

del apply_seniority_level

df['Seniority'].value_counts()


### 8. Parse salary

#### 8.1 Employer provided salary

In [None]:
# True where the salary text states that the figure was provided by the
# employer; non-string values (e.g. NaN) give False.
df['Salary_employer_provided'] = df['Salary'].apply(
    lambda salary: isinstance(salary, str) and "Employer Provided Salary" in salary
)
df['Salary_employer_provided'].value_counts()

#### 8.2 Salary per hour

In [None]:
# True where the salary text is quoted per hour; non-string values (e.g. NaN)
# give False.
df['Salary_hourly'] = df['Salary'].apply(
    lambda salary: isinstance(salary, str) and "Per Hour" in salary
)
df['Salary_hourly'].value_counts()

#### 8.3 Salary min

In [None]:
def get_salary_min(salary):
    """Return the first amount in a salary string as a float.

    A trailing "K" multiplies the number by 1000 ("85K" -> 85000.0).

    Args:
        salary: Salary text, or a non-string placeholder such as NaN.

    Returns:
        The first amount as a float, ``np.nan`` when the text contains no
        number, or ``salary`` itself when it is not a string.
    """
    if not isinstance(salary, str):
        # Missing values pass through untouched.
        return salary

    # Non-capturing group, so findall yields plain strings ("85K", "50.00").
    amounts = re.findall(r"\d+(?:\.\d+)?K?", salary)
    if not amounts:
        # Text without any number: nothing to parse.
        return np.nan

    lowest = amounts[0]
    if lowest.endswith("K"):
        return float(lowest[:-1]) * 1000

    return float(lowest)
    
def calculate_yearly_income(hourly_rate, hours_per_week=40, weeks_per_year=52):
    """Convert an hourly rate into a gross yearly income.

    Args:
        hourly_rate: Pay per hour.
        hours_per_week: Working hours per week (default 40).
        weeks_per_year: Paid weeks per year (default 52).

    Returns:
        ``hourly_rate`` multiplied by the number of working hours in a year.
    """
    hours_per_year = weeks_per_year * hours_per_week
    return hourly_rate * hours_per_year

df['Salary_min'] = df['Salary'].apply(get_salary_min)
df['Salary_min'] = df.apply(
        lambda row: calculate_yearly_income(row['Salary_min']) if row['Salary_hourly'] == True else row['Salary_min'],
        axis=1
    )

del get_salary_min

df['Salary_min']

#### 8.4 Salary max

In [None]:
def get_salary_max(salary):
    """Return the last amount in a salary string as a float.

    A trailing "K" multiplies the number by 1000 ("120K" -> 120000.0).

    Args:
        salary: Salary text, or a non-string placeholder such as NaN.

    Returns:
        The last amount as a float, ``np.nan`` when the text contains no
        number, or ``salary`` itself when it is not a string.
    """
    if not isinstance(salary, str):
        # Missing values pass through untouched.
        return salary

    # Non-capturing group, so findall yields plain strings ("120K", "60.00").
    amounts = re.findall(r"\d+(?:\.\d+)?K?", salary)
    if not amounts:
        # Text without any number: nothing to parse.
        return np.nan

    highest = amounts[-1]
    if highest.endswith("K"):
        return float(highest[:-1]) * 1000

    return float(highest)

df['Salary_max'] = df['Salary'].apply(get_salary_max)
df['Salary_max'] = df.apply(
        lambda row: calculate_yearly_income(row['Salary_max']) if row['Salary_hourly'] == True else row['Salary_max'],
        axis=1
    )

del get_salary_max

df['Salary_max']

In [None]:
# Cleanup

del calculate_yearly_income

#### 8.5 Salary currency 

In [None]:
def get_currency(salary: str):
    """Extract the currency marker that precedes the first amount.

    For "Employer Provided Salary" texts the marker is taken after the first
    colon; otherwise it is whatever precedes the first digit.

    Args:
        salary: Salary text, or a non-string placeholder such as NaN.

    Returns:
        The trimmed currency text, ``np.nan`` when no currency can be found,
        or ``salary`` itself when it is not a string.
    """
    if not isinstance(salary, str):
        # Missing values pass through untouched.
        return salary

    if "Employer Provided Salary" in salary:
        # Non-digits between the first colon and the first digit after it.
        pattern_currency = r":(\D*)\d"
    else:
        # Non-digits from the start of the text up to the first digit.
        pattern_currency = r"^(\D*)\d"

    matched = re.search(pattern_currency, salary)
    if matched is None:
        # No amount in the text, hence no currency to report.
        return np.nan

    # Remove colons first, then trim, so ": €" becomes "€" and not " €".
    currency = matched.group(1).replace(":", "").strip()

    return currency if currency else np.nan
    
df['Salary_currency'] = df['Salary'].apply(get_currency)
    
del get_currency
    
df['Salary_currency'].value_counts()

In [None]:
del df['Salary']

#### 8.6 Salary average

In [None]:
df['Salary_avg'] = (df['Salary_max']+df['Salary_min'])/2
df['Salary_avg']

### 9. Employees

In [None]:
df['Employees'].value_counts()

### 10. Type of ownership

In [None]:
df['Type_of_ownership'].value_counts()

### 11. Sector

In [None]:
df['Sector'].value_counts()

### 12. Industry

In [None]:
df['Industry'].value_counts()

### 13. Company age

In [None]:
# Company age in years relative to the current calendar year; rows with a
# missing founding year stay NaN. The result depends on the run date.
year = datetime.date.today().year

df['Company_age'] = year - df['Founded']

del df['Founded'], year

df['Company_age'].value_counts()

### 14. Job age

In [None]:
np.sort(df['Job_age'].unique())

In [None]:
def clean_job_age(job_age):
    """Convert a job-age label such as "5d" into a number of days.

    "24h" counts as 1 day and the open-ended "30d+" bucket as 31 days.

    Args:
        job_age: Label string, or a non-string placeholder such as NaN.

    Returns:
        The age in days as an int, or ``job_age`` itself when it is not a
        string.
    """
    if not isinstance(job_age, str):
        # Missing values pass through instead of failing on `.replace`.
        return job_age

    special_cases = {"24h": 1, "30d+": 31}
    if job_age in special_cases:
        return special_cases[job_age]

    return int(job_age.replace("d", ""))

df['Job_age'] = df['Job_age'].apply(clean_job_age)

del clean_job_age
df['Job_age'].value_counts()


### 15. Revenue

In [None]:
df['Revenue_USD'].value_counts()

### 16. Preview columns so far

In [None]:
df.dtypes

### 17. Change columns order

##### 17.1 move salary values

In [None]:
def move_column__to_index(column_name: str, index: int, frame: "pd.DataFrame | None" = None):
    """Move one column to position ``index``, modifying the frame in place.

    Args:
        column_name: Existing column to move.
        index: Target position of the column.
        frame: Frame to reorder; defaults to the notebook-level ``df``.
    """
    target = df if frame is None else frame
    target.insert(index, column_name, target.pop(column_name))


def move_columns_to_index(column_names: list[str], index: int, frame: "pd.DataFrame | None" = None):
    """Move several columns so they sit side by side starting at ``index``.

    Args:
        column_names: Existing columns, in the order they should end up.
        index: Target position of the first column; each later column goes
            one position further right.
        frame: Frame to reorder; defaults to the notebook-level ``df``.
    """
    for offset, col in enumerate(column_names):
        move_column__to_index(col, index + offset, frame)

move_columns_to_index([
    'Salary_min', 
    'Salary_max', 
    'Salary_avg', 
    'Salary_currency',
    'Salary_employer_provided', 
    'Salary_hourly'
    ], 3
    )

df.dtypes

##### 17.2 Move Seniority

In [None]:
move_column__to_index('Seniority', 3)
df.dtypes

##### 17.3 Move City, State

In [None]:
move_columns_to_index(['City', 'State'], 11)
df.dtypes

##### 17.4 Move Company age

In [None]:
move_column__to_index('Company_age', 19)
df.dtypes

##### 17.5 Move Work/Life_balance 

In [None]:
move_columns_to_index(['Senior_management', 'Work/Life_balance'], 25)
df.dtypes

## 18. Technology requirements - parsing the job description

##### 19 Git and code repositories

In [None]:
def check_repo(job_description: str):
    """Return the first code-hosting platform mentioned in a description.

    Platforms are tried in list order as whole words, ignoring case.

    Args:
        job_description: Description text; non-string values (e.g. NaN) are
            accepted.

    Returns:
        The platform name as spelled in the list below, or ``np.nan`` when
        none is mentioned or the description is not a string.
    """
    # Missing descriptions would make re.search raise TypeError.
    if not isinstance(job_description, str):
        return np.nan

    git_platforms = [
        r"Github",
        r"GitLab",
        r"Bitbucket",
        r"SourceForge",
        r"Launchpad",
        r"Google Cloud Source Repositories",
        r"AWS CodeCommit",
        r"GitBucket",
        r"Gogs",
        r"Gitea",
        r"Apache Allura",
        r"RhodeCode",
        r"ONEDEV",
        r"Codeberg",
        r"Git" # IMPORTANT, it has to be last!
        ]

    for platform in git_platforms:
        if re.search((r"\b" + platform + r"\b"), job_description, re.IGNORECASE):
            return platform

    return np.nan
        
df['Git'] = df['Description'].apply(check_repo)

del check_repo

df['Git'].value_counts()


In [None]:
def make_is_tech(cloud_names: list[str]):
    """Build a predicate that tells whether a text mentions any given name.

    Each name is a regular-expression fragment, matched case-insensitively
    as a whole term.

    Args:
        cloud_names: Regex fragments such as ``r"AWS"`` or ``r"C\\+\\+"``.

    Returns:
        A function mapping a job description to True/False; non-string
        descriptions (e.g. NaN) give False.
    """
    # A plain `\b` only works next to a word character, so `\bC\+\+\b` can
    # never match "C++ ". These boundaries constrain the neighbour only when
    # the edge of the match is a word character; for names that start and end
    # with a word character they behave exactly like `\b`.
    starts_on_boundary = r"(?:(?<!\w)(?=\w)|(?=\W))"
    ends_on_boundary = r"(?:(?<=\w)(?!\w)|(?<!\w))"

    # Compiled once here instead of on every row.
    compiled_names = [
        re.compile(starts_on_boundary + r"(?:" + name + r")" + ends_on_boundary, re.IGNORECASE)
        for name in cloud_names
    ]

    def is_tech(job_description: str):

        # Missing descriptions would make the search raise TypeError.
        if not isinstance(job_description, str):
            return False

        return any(name.search(job_description) for name in compiled_names)

    return is_tech

In [None]:
def add_is_needed_column_to_df(column_name: str, tech_names: list[str]):
    """Add a flag column to the notebook-level ``df``.

    The column is the result of applying the predicate built by
    ``make_is_tech(tech_names)`` to every value of ``df['Description']``.
    """
    mentions_tech = make_is_tech(tech_names)
    flags = df['Description'].apply(mentions_tech)
    df[column_name] = flags


#### 20. Cloud Platforms

##### 20.1 AWS


Provides on-demand cloud computing platforms and APIs to individuals, companies, and governments, on a metered, pay-as-you-go basis. Often times, clients will use this in combination with autoscaling.

In [None]:
cloud_names = [
    r"Amazon Web Services", 
    r"AWS",
    ]

column_name = 'AWS'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.2 Microsoft Azure

A cloud computing platform operated by Microsoft that provides access to, management of, and development of applications and services through data centers distributed around the world.

In [None]:
cloud_names = [
    r"Microsoft Azure", 
    r"Azure",
    ]

column_name = 'Microsoft_Azure'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.3 GCP

A suite of cloud computing services that runs on the same infrastructure that Google uses internally for its end-user products, such as Google Search, Gmail, Google Drive, and YouTube.

In [None]:
cloud_names = [
    r"Google Cloud Platform", 
    r"GCP",
    ]

# NOTE(review): the column is named 'GPC' although the patterns above search
# for "GCP" — looks like a typo. Confirm no other code relies on 'GPC' before
# renaming.
column_name = 'GPC'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.4 Alibaba Cloud

Alibaba Cloud provides cloud computing services to online businesses and Alibaba's own e-commerce ecosystem.

In [None]:
cloud_names = [
    r"Alibaba Cloud", 
    r"Aliyun",
    ]

column_name = 'Alibaba_Cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.4 Oracle Cloud

Providing servers, storage, network, applications and services through a global network of Oracle Corporation managed data centers

In [None]:
cloud_names = [
    r"Oracle Cloud", 
    r"OCI",
    ]

column_name = 'Oracle_Cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.5 IBM Cloud

A set of cloud computing services for business

In [None]:
cloud_names = [
    r"IBM Cloud", 
    r"Kyndryl",
    r"Bluemix"
    ]

column_name = 'IBM_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.6 Tencent Cloud

Tencent Cloud provides businesses across the globe with stable and secure industry-leading cloud products and services, leveraging technological advancements such as cloud computing, Big Data, AI, IoT and network security.

In [None]:
cloud_names = [
    r"Tencent Cloud",
    ]

column_name = 'Tencent_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.8 OVHcloud

A French cloud computing company which offers VPS, dedicated servers and other web services

In [None]:
cloud_names = [
    r"OVHcloud",
    r"OVH"
    ]

column_name = 'OVHcloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.9 DigitalOcean

A cloud hosting provider that offers cloud computing services and Infrastructure as a Service (IaaS). Known for pricing and scalability

In [None]:
cloud_names = [
    r"DigitalOcean"
    ]

column_name = 'DigitalOcean_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

##### 20.10 Linode

An American cloud hosting provider that focused on providing Linux-based virtual machines, cloud infrastructure, and managed services.

In [None]:
cloud_names = [
    r"Linode",
    r"Akamai"
    ]

# NOTE(review): the column is named 'Lincode_cloud' although the pattern above
# is "Linode" — looks like a typo. Confirm no other code relies on this name
# before renaming.
column_name = 'Lincode_cloud'

add_is_needed_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

In [None]:
del cloud_names

#### 21. Relational Database Management Systems (RDBMS)

##### 21.1 PostgreSQL
Can be used as a data store for big data solutions.
Postgres, is a free and open-source relational database management system (RDBMS) emphasizing extensibility and SQL compliance. <br>
PostgreSQL features transactions with Atomicity, Consistency, Isolation, Durability (ACID) properties, automatically updatable views, materialized views, triggers, foreign keys, and stored procedures. <br> It is designed to handle a range of workloads, from single machines to data warehouses or Web services with many concurrent users. 

In [None]:
tool_names = [
    r"PostgreSQL",
    r"Postgres"
    ]

column_name = 'PostgreSQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.2 Microsoft SQL Server


A software product with the primary function of storing and retrieving data as requested by other software applications—which may run either on the same computer or on another computer across a network (including the Internet).

In [None]:
tool_names = [
    r"Microsoft SQL",
    r"SQL Server"
    ]

column_name = 'Microsoft_SQL_Server'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.3 MySQL

An open-source relational database management system.

In [None]:
tool_names = [
    r"MySQL"
    ]

column_name = 'MySQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.4 IBM Db2 warehouse

A family of data management products, including database servers, developed by IBM. It initially supported the relational model, but was extended to support object–relational features and non-relational structures like JSON and XML.

In [None]:
tool_names = [
    r"Db2",
    r"IBMDb2"
    ]

column_name = 'IBM_Db2'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 21.5. Oracle PL/SQL

 A procedural language designed specifically to embrace SQL statements within its syntax. PL/SQL program units are compiled by the Oracle Database server and stored inside the database. And at run-time, both PL/SQL and SQL run within the same server process, bringing optimal efficiency

In [None]:
tool_names = [
    r"PL/SQL",
    r"PL / SQL",
    r"Procedural Language for SQL"
    ]

column_name = 'Oracle_PL_SQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 22. NoSQL Database Management Systems

##### 22.1 MongoDB

A source-available cross-platform document-oriented database program. Classified as a NoSQL database program, MongoDB uses JSON-like documents with optional schemas

In [None]:
tool_names = [
    r"MongoDB",
    r"Mongo DB",
    ]

column_name = 'MongoDB'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.2 Cassandra

A free and open-source, distributed, wide-column store, NoSQL database management system designed to handle large amounts of data across many commodity servers, providing high availability with no single point of failure

In [None]:
tool_names = [
    r"Cassandra",
    ]

column_name = 'Cassandra'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.3 Amazon DynamoDB

A proprietary NoSQL database service that supports key–value and document data structures and is offered by Amazon.com as part of the Amazon Web Services portfolio.

In [None]:
tool_names = [
    r"DynamoDB",
    r"Dynamo DB",
    r"SimpleDB"
    ]

column_name = 'Amazon_DynamoDB'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 22.4 Neo4j

A graph database management system developed by Neo4j, Inc. Described by its developers as an ACID-compliant transactional database with native graph storage and processing

In [None]:
tool_names = [
    r"Neo4j"
    ]

column_name = 'Neo4j'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.5 Apache Solr

An open-source enterprise-search platform, written in Java. Its major features include full-text search, hit highlighting, faceted search, real-time indexing, dynamic clustering, database integration, NoSQL features[2] and rich document (e.g., Word, PDF) handling.

In [None]:
tool_names = [
    r"Solr"
    ]

column_name = 'Apache_Solr'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 22. Data warehousing and Analytics

##### 22.1 Amazon Redshift

A data warehouse product which forms part of the larger cloud-computing platform Amazon Web Services. It is built on top of technology from the massive parallel processing data warehouse company ParAccel, to handle large scale data sets and database migrations.

In [None]:
tool_names = [
    r"Redshift",
    ]

column_name = 'Amazon_Redshift'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.2 Google BigQuery

A serverless data warehouse that enables scalable analysis over petabytes of data. It is a Platform as a Service that supports querying using ANSI SQL. It also has built-in machine learning capabilities.

In [None]:
tool_names = [
    r"BigQuery",
    ]

column_name = 'Google_BigQuery'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.3 Snowflake

Snowflake enables data storage, processing, and analytic solutions.

In [None]:
tool_names = [
    r"Snowflake"
    ]

column_name = 'Snowflake'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.4 Oracle Exadata

Designed to run Oracle Database workloads, such as an OLTP application running simultaneously with Analytics processing. Historically, specialized database computing platforms were designed for a particular workload, such as Data Warehousing, and poor or unusable for other workloads, such as OLTP. 

In [None]:
tool_names = [
    r"Exadata"
    ]

column_name = 'Oracle_Exadata'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.5 SAP HANA

A multi-model database that stores data in its memory instead of keeping it on a disk.

In [None]:
tool_names = [
    r"HANA"
    ]

column_name = 'SAP_HANA'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 22.6 Teradata

It is mainly suitable for building large scale data warehousing applications.

In [None]:
tool_names = [
    r"Teradata"
    ]

column_name = 'Teradata'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 23. Data Integration and Processing

##### 23.1 Informatica PowerCenter - Data integration tool


Used extensively for ETL operations, data quality, data masking, data replication, data virtualization, and master data management services.

In [None]:
tool_names = [
    r"PowerCenter",
    r"Power Center",
    ]

column_name = 'Informatica_PowerCenter'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 23.2 DataBricks - Data processing and analytics platform

A unified set of tools for building, deploying, sharing, and maintaining enterprise-grade data solutions at scale. 

In [None]:
tool_names = [
    r"Data Bricks",
    r"Databricks"
    ]

column_name = 'Databricks'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 23.3 Presto - Query engine

 A distributed query engine for big data using the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows use of multiple data sources

In [None]:
tool_names = [
    r"Presto",
    r"PrestoDB",
    r"PrestoSQL"
    ]

column_name = 'Presto'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 24. Stream processing tools

##### 24.1 Apache Kafka

An open-source system, distributed event store and stream-processing platform. The project aims to provide a unified, high-throughput, low-latency platform for handling real-time data feeds.

In [None]:
tool_names = [
    r"Kafka",
    ]

column_name = 'Apache_Kafka'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.2 Apache Flink

Process data streams at a large scale and to deliver real-time analytical insights about your processed data with your streaming application.

In [None]:
tool_names = [
    r"Flink",
    ]

column_name = 'Apache_Flink'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.3 Dataflow


Dataflow is a managed service provided by Google Cloud for building and executing data processing pipelines. It enables developers to create scalable and efficient batch and streaming data pipelines using a simple programming model.

In [None]:
tool_names = [
    r"Dataflow",
    ]

column_name = 'Dataflow'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 25 Workflow orchestration tools

##### 25.1 Apache Airflow

Apache Airflow is an open-source platform used for programmatically creating, scheduling, and monitoring complex workflows or data pipelines. It allows users to define and execute a sequence of tasks or operations, while providing tools for tracking and troubleshooting workflow executions.

In [None]:
tool_names = [
    r"Airflow",
    ]

column_name = 'Apache_Airflow'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 25.2 Luigi

Luigi is a Python-based open-source workflow management system that helps to build complex pipelines of batch jobs. It provides a flexible and extensible architecture to create and manage complex data workflows.

In [None]:
tool_names = [
    r"Luigi",
    ]

column_name = 'Luigi'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 25.3 SSIS

SQL Server Integration Services (SSIS) is a Microsoft tool used for building data integration and ETL (extract, transform, load) workflows. It allows users to perform a range of tasks such as data extraction, transformation, and loading from various sources to different destinations.

In [None]:
tool_names = [
    r"SSIS",
    r"SQL Server Integration Services"
    ]

column_name = 'SSIS'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 26. Big Data processing

##### 24.1 Apache Hadoop

Apache Hadoop is an open-source framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It provides a distributed file system and supports various distributed computing models, such as MapReduce and Spark, for processing and analyzing large data sets.

In [None]:
tool_names = [
    r"Hadoop",
    ]

column_name = 'Apache_Hadoop'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.2 Apache Hive


Apache Hive is a data warehouse software that facilitates querying and managing large datasets stored in Hadoop file systems using a SQL-like language called HiveQL. It provides a high-level interface for data analysts and developers to analyze, transform, and summarize data stored in Hadoop Distributed File System (HDFS) and other compatible storage systems.

In [None]:
tool_names = [
    r"Hive",
    ]

column_name = 'Apache_Hive'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 24.3 Apache Spark

Apache Spark is a distributed computing framework designed to process large-scale data processing and analysis workloads in parallel. It can be used for batch processing, real-time stream processing, machine learning, and graph processing, among other things.

In [None]:
tool_names = [
    r"Spark",
    r"PySpark"
    ]

column_name = 'Apache_Spark'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

In [None]:
df.dtypes

#### 25. Linux

Family of Unix-like operating systems.

In [None]:
tool_names = [
    r"Linux",
    ]

column_name = 'Linux'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

#### 26. Programming languages

##### 26.1 Python

Python is a high-level, interpreted programming language used for various purposes such as web development, data analysis, artificial intelligence, and more.

In [None]:
tool_names = [
    r"Python",
    ]

column_name = 'Python'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.2 R

A programming language and environment for statistical graphics and computing.

In [None]:
# NOTE(review): the predicate built by make_is_tech matches case-insensitively
# on word boundaries, so r"R" also hits any stand-alone "r"/"R" (e.g. the "R"
# in "R&D"). This column probably over-counts — verify on a sample.
tool_names = [
    r"R",
    r"RStudio"
    ]

column_name = 'R'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.3 Scala

Scala is a high-level, statically typed programming language designed for functional programming and scalable, concurrent applications.

In [None]:
tool_names = [
    r"Scala"
    ]

column_name = 'Scala'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.4 SQL

A programming language used to manage and manipulate relational databases.

In [None]:
tool_names = [
    r"SQL",
    r"MySQL",
    r"PostgreSQL",
    r"Postgres",
    r"SQLite",
    r"MariaDB",
    r"IBM DB2",
    r"Oracle Database",
    r"Db2",
    ]

column_name = 'SQL'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.5 Java

Java is a high-level, object-oriented programming language widely used for developing robust and scalable enterprise applications.

In Data Science, Java can be used for developing machine learning models, data analysis, and data processing applications, as well as for building large-scale distributed systems for big data processing and management.

In [None]:
tool_names = [
    r"Java",
    ]

column_name = 'Java'

add_is_needed_column_to_df(column_name, tool_names)

df[column_name].value_counts()

##### 26.6 C++

A general-purpose programming language designed for systems and application programming, and it is used in Data Science for building high-performance libraries and applications that require intensive computational tasks.

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "C++"
# The plus signs are escaped because "+" is a regex quantifier.
tool_names = [r"C\+\+"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 26.7 Go

Go is a statically typed programming language designed for building simple, efficient, and reliable software. In data engineering it can be used for building scalable, distributed systems for data processing and analysis.

In [None]:
# Target column for the add_is_needed_column_to_df helper (defined outside
# this cell; presumably flags rows mentioning any of the terms below).
column_name = "Go"

# A bare "Go" is deliberately left out: as a separate word it is too common
# in ordinary English text.
tool_names = [r"Go language", r"Golang"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 26.8 Bash

A shell scripting language used for automating repetitive tasks and managing the operating system, including data processing tasks, in the command-line interface (CLI) on Unix and Unix-like systems.

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Bash"
tool_names = [r"Bash"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 26.9 PowerShell

A task automation and configuration management framework from Microsoft, which can be used in Data Science for automating various data processing tasks on Windows machines in the command-line interface (CLI).

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "PowerShell"
tool_names = [r"PowerShell", r"DOS Shell"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 26.10 CLI

CLI stands for Command Line Interface, which is a way to interact with a computer program through text commands, and it is commonly used in Data Science for running scripts, automating tasks, and managing software packages.

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "CLI"
tool_names = [r"CLI", r"Command Line Interface"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

#### 27. Visualization Tools

Business intelligence and data visualization tools used for analyzing and visualizing data.

##### 27.1 Tableau

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Tableau"
tool_names = [r"Tableau"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.2 Power BI

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Power_BI"
tool_names = [r"Power BI"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.3 Google Analytics

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Google_Analytics"
tool_names = [r"Google Analytics"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.4 QlikView

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "QlikView"
tool_names = [r"QlikView", r"Qlik"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.5 Oracle BI server

In [None]:
# Target column for the add_is_needed_column_to_df helper (defined outside
# this cell; presumably flags rows mentioning any of the terms below).
column_name = "Oracle_BI_server"

# Full product name, its acronym, and the short server name.
tool_names = [
    r"Oracle Business Intelligence Enterprise Edition",
    r"OBIEE",
    r"Oracle BI server",
]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.6 SAS Analytics

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "SAS_Analytics"
tool_names = [r"SAS Analytics", r"Statistical Analysis System"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.7 Lumira

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Lumira"
tool_names = [r"Lumira"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.8 IBM Cognos Impromptu

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Cognos_Impromptu"
tool_names = [r"Cognos Impromptu"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.9 MicroStrategy

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "MicroStrategy"
tool_names = [r"MicroStrategy"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.10 InsightSquared

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "InsightSquared"
tool_names = [r"InsightSquared"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.11 Sisense

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Sisense"
tool_names = [r"Sisense"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.12 Dundas BI

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Dundas_BI"
tool_names = [r"Dundas BI"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.13 Domo

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Domo"
tool_names = [r"Domo"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 27.14 Looker

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Looker"
tool_names = [r"Looker"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

#### 28. Microsoft Excel

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Excel"
tool_names = [r"Excel"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

#### 29. Certifications

Checking if there is a need for any certification.

In [None]:
# Search terms for certificates: generic wording plus named courses and
# certifications from Coursera, Udemy, DataCamp etc.
# Each term is listed exactly once; the original list repeated
# "Data Engineering, Big Data, and Machine Learning on GCP".
# NOTE(review): assumes add_is_needed_column_to_df (defined outside this cell)
# treats the list as alternatives, so a repeated term adds nothing - confirm.
tool_names = [
    r"Certificates",
    r"Certificate",
    r"Data Engineering, Big Data, and Machine Learning on GCP",
    r"Google Professional Data Engineer",
    r"Microsoft Azure Data Engineering",
    r"Data Engineer.+Nanodegree",  # regex: any text between the two words
    r"DataCamp",
    r"Python, Bash and SQL Essentials for Data Engineering Specialization",
    r"Data Engineering ETL, Web Scraping, and Automation",
    r"Big Data Engineering with Hadoop and Spark"
    ]

# Column of df the helper is asked to populate.
column_name = 'Is_certificate'

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

#### 30. Needed education level

##### 30.1 BA

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "BA"
tool_names = [r"BA", r"Bachelor", r"BSc", r"Bachelors"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 30.2 MS

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "MS"
tool_names = [r"MS", r"MSc", r"Master", r"Masters", r"master\'s"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

##### 30.3 PhD

In [None]:
# Target column and search terms for the add_is_needed_column_to_df helper
# (defined outside this cell; presumably flags rows mentioning any term).
column_name = "Phd"
# The dot in "Ph.D" is escaped so it only matches a literal period.
tool_names = [r"Phd", r"Ph\.D", r"DPhil", r"Doctor of Philosophy"]

add_is_needed_column_to_df(column_name, tool_names)

# Value distribution of df[column_name] after the helper call.
df[column_name].value_counts()

### Overview

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

In [None]:
# Keep the column index in a variable and display it as the cell output.
columns_names = df.columns
columns_names

In [None]:
# Remove the scratch names assigned by the preceding cells from the namespace.
del columns_names, column_name, tool_names

### 31. Final cleanup

##### 31.1 Rename columns

In [None]:
# Shorten the verbose column labels. DataFrame.rename skips labels that are
# not present in df (default errors="ignore").
df = df.rename(
    columns={
        # Company and job posting
        "Company_name": "Name",
        "Job_title": "Title",
        # Salary fields
        "Salary_min": "Min",
        "Salary_max": "Max",
        "Salary_avg": "Avg",
        "Salary_currency": "Currency",
        "Salary_employer_provided": "Employer_provided",
        "Salary_hourly": "Is_hourly",
        # Cloud platforms
        "Alibaba_Cloud": "Alibaba",
        "Oracle_Cloud": "Oracle",
        "IBM_cloud": "IBM",
        "Tencent_cloud": "Tencent",
        "DigitalOcean_cloud": "DigitalOcean",
        "Lincode_cloud": "Lincode",
    }
)


##### 31.2 Change column order

In [None]:
def move_column__to_index(column_name: str, index: int):
    """Move one column of the global ``df`` to position ``index``, in place.

    Args:
        column_name: Label of the column to move; it must exist in ``df``.
        index: Position at which the column is re-inserted, counted after
            the column has been removed from its old place.
    """
    # Take the column out first, then put it back at the requested position.
    column = df.pop(column_name)
    df.insert(index, column_name, column)


def move_columns_to_index(column_names: list[str], index: int):
    """Move several columns of the global ``df`` so they sit consecutively from ``index``.

    The columns end up in the order given by ``column_names``: the first at
    ``index``, the second at ``index + 1``, and so on. ``df`` is modified in
    place.

    Args:
        column_names: Labels of the columns to move, in the desired order.
        index: Position for the first moved column.

    Raises:
        KeyError: If any label is missing from ``df``. All labels are checked
            before anything is moved, so a typo in a long list cannot leave
            ``df`` half reordered, and every missing label is reported at once.
    """
    missing = [name for name in column_names if name not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in df: {missing}")

    for position, name in enumerate(column_names, start=index):
        # pop() removes the column before insert() places it at its new slot.
        df.insert(position, name, df.pop(name))

# Bring the listed columns to the front of df, in exactly this order.
move_columns_to_index(
    column_names=[
        # Job details
        "Title",
        "Description",
        "Seniority",
        "City",
        "State",
        "Job_age",
        "Easy_apply",
        # Salary
        "Min",
        "Max",
        "Avg",
        "Currency",
        "Employer_provided",
        "Is_hourly",
        # Company info
        "Name",
        "Rating",
        "Employees",
        "Type_of_ownership",
        "Sector",
        "Industry",
        "Company_age",
        "Revenue_USD",
        "Friend_recommend",
        "CEO_approval",
        "Career_opportunities",
        "Comp_&_benefits",
        "Senior_management",
        "Work/Life_balance",
        "Culture_&_values",
        "Pros",
        "Cons",
        "Benefits_rating",
        "Benefits_reviews",
        # Education
        "BA",
        "MS",
        "Phd",
        "Is_certificate",
        # Version control
        "Git",
        # Cloud platforms
        "AWS",
        "Microsoft_Azure",
        "GPC",
        "Alibaba",
        "Oracle",
        "IBM",
        "Tencent",
        "OVHcloud",
        "DigitalOcean",
        "Lincode",
        # Relational databases
        "PostgreSQL",
        "Microsoft_SQL_Server",
        "IBM_Db2",
        "MySQL",
        "Oracle_PL_SQL",
        # NoSQL databases
        "MongoDB",
        "Cassandra",
        "Amazon_DynamoDB",
        "Neo4j",
        # Search and analytics
        "Apache_Solr",
        "Amazon_Redshift",
        "Google_BigQuery",
        "Snowflake",
        "Oracle_Exadata",
        "SAP_HANA",
        "Teradata",
        # Data integration and processing
        "Informatica_PowerCenter",
        "Databricks",
        "Presto",
        # Stream processing
        "Apache_Kafka",
        "Apache_Flink",
        "Dataflow",
        # Workflow orchestration
        "Apache_Airflow",
        "Luigi",
        "SSIS",
        # Big Data processing
        "Apache_Hadoop",
        "Apache_Hive",
        "Apache_Spark",
        # Operating system
        "Linux",
        # Programming languages and shells
        "Python",
        "R",
        "Scala",
        "SQL",
        "Java",
        "C++",
        "Go",
        "Bash",
        "PowerShell",
        "CLI",
        # Business intelligence tools
        "Tableau",
        "Power_BI",
        "Google_Analytics",
        "QlikView",
        "Oracle_BI_server",
        "SAS_Analytics",
        "Lumira",
        "Cognos_Impromptu",
        "MicroStrategy",
        "InsightSquared",
        "Sisense",
        "Dundas_BI",
        "Domo",
        "Looker",
        "Excel",
    ],
    index=0,
)

# Show the dtype of every column in the new order.
df.dtypes

##### 31.3 Add multiindex

In [None]:
# Replace the flat column labels with a two-level (group, column) index.
# The assignment is positional: the k-th pair below labels the k-th column
# of df, so the order here has to follow the current column order.
df.columns = pd.MultiIndex.from_tuples([
    (group, name)
    for group, names in {
        "Job_details": [
            "Title", "Description", "Seniority", "City", "State",
            "Job_age", "Easy_apply",
        ],
        "Salary": [
            "Min", "Max", "Avg", "Currency", "Employer_provided", "Is_hourly",
        ],
        "Company_info": [
            "Name", "Rating", "Employees", "Type_of_ownership", "Sector",
            "Industry", "Company_age", "Revenue_USD", "Friend_recommend",
            "CEO_approval", "Career_opportunities", "Comp_&_benefits",
            "Senior_management", "Work/Life_balance", "Culture_&_values",
            "Pros", "Cons", "Benefits_rating", "Benefits_reviews",
        ],
        "Education": ["BA", "MS", "Phd", "Is_certificate"],
        "Version_control": ["Git"],
        "Cloud_platforms": [
            "AWS", "Microsoft_Azure", "GPC", "Alibaba", "Oracle", "IBM",
            "Tencent", "OVHcloud", "DigitalOcean", "Lincode",
        ],
        "RDBMS": [
            "PostgreSQL", "Microsoft_SQL_Server", "IBM_Db2", "MySQL",
            "Oracle_PL_SQL",
        ],
        "NOSQL": ["MongoDB", "Cassandra", "Amazon_DynamoDB", "Neo4j"],
        "Search_&_Analytics": [
            "Apache_Solr", "Amazon_Redshift", "Google_BigQuery", "Snowflake",
            "Oracle_Exadata", "SAP_HANA", "Teradata",
        ],
        "Data_integration_and_processing": [
            "Informatica_PowerCenter", "Databricks", "Presto",
        ],
        "Stream_processing_tools": [
            "Apache_Kafka", "Apache_Flink", "Dataflow",
        ],
        "Workflow_orchestration_tools": ["Apache_Airflow", "Luigi", "SSIS"],
        "Big_Data_processing": [
            "Apache_Hadoop", "Apache_Hive", "Apache_Spark",
        ],
        "OS": ["Linux"],
        "Programming_languages": [
            "Python", "R", "Scala", "SQL", "Java", "C++", "Go", "Bash",
            "PowerShell", "CLI",
        ],
        "Business_Intelligence_Tools": [
            "Tableau", "Power_BI", "Google_Analytics", "QlikView",
            "Oracle_BI_server", "SAS_Analytics", "Lumira", "Cognos_Impromptu",
            "MicroStrategy", "InsightSquared", "Sisense", "Dundas_BI",
            "Domo", "Looker", "Excel",
        ],
    }.items()
    for name in names
])


In [None]:
# Select the 'Name' column under the 'Company_info' group and preview its first rows.
df['Company_info']['Name'].head()

In [None]:
# Select the 'Excel' column under the 'Business_Intelligence_Tools' group and preview its first rows.
df['Business_Intelligence_Tools']['Excel'].head()

#### 32. Save CSV

##### 32.1 Save

In [None]:
import os
from pathlib import Path
from scraper.config.get import get_config

config = get_config()

# Output folder: <output_path.main>/<output_path.clean>/Data_Engineer,
# with the first two parts read from the project configuration.
local_path = os.path.join(
    config['output_path']['main'],
    config['output_path']['clean'],
    "Data_Engineer"
    )

# NOTE(review): the file name is hard-coded; confirm it matches the dataset
# currently held in df.
file_name = "Data_Engineer_United_States_06-03-2023_23-41.csv"
file_path = Path(f"{local_path}/{file_name}")

# The target is several levels deep, so create any missing parent folders as
# well. os.mkdir would raise FileNotFoundError when a parent does not exist;
# exist_ok=True makes re-running the cell safe.
folder = os.path.dirname(file_path)
os.makedirs(folder, exist_ok=True)

# index=True keeps the row index as the first CSV column.
df.to_csv(file_path, index=True)

##### 32.2 Check save

In [None]:
# Read the saved file back: the first CSV column becomes the row index and the
# first two rows become the two column-header levels.
df_check = pd.read_csv(file_path, index_col=0, header=[0, 1])
df_check.head()

In [None]:
# True when the reloaded frame has the same (rows, columns) as df.
df_check.shape == df.shape