## Clean data

### 1. Importing all packages

In [246]:
# Standard library
import datetime
import re

# External
import numpy as np
import pandas as pd

### 2. Importing a CSV file

In [247]:
# Load the raw job-postings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/RAW/Data_Engineer_06-03-2023_23-41.csv")
df.head()

Unnamed: 0,Company_name,Rating,Location,Job_title,Description,Salary,Job_age,Easy_apply,Employees,Type_of_ownership,...,CEO_approval,Career_opportunities,Comp_&_benefits,Culture_&_values,Senior_management,Work/Life_balance,Pros,Cons,Benefits_rating,Benefits_reviews
0,Infoway solutions LLC,3.9,"Santa Clara, CA",Data Engineer,Need min 10+ Years exp\nData Engineer\nBay Are...,Employer Provided Salary:$68.00 Per Hour,3d,True,,Company - Private,...,0.84,4.0,3.9,4.0,3.7,3.9,"['""Nice and friendly work environment"" (in 1 r...",['No Cons have been reported by the Glassdoor ...,2.2,
1,Optimal Inc.,3.6,"Dearborn, MI",Data Engineer - Terraform,Position Description:\nThe GDIA Data Factory P...,$63K - $90K (Glassdoor est.),12d,True,1 to 50,Nonprofit Organization,...,0.78,3.2,3.7,3.3,2.6,4.5,['No Pros have been reported by the Glassdoor ...,"['""Antisocial and downright rude CEO, callous ...",5.0,
2,Strivernet RPO Services Ltd,,"Santa Clara, CA",Data Engineer,"(W2 CANDIDATES ONLY) (SANTA CLARA, CA)\nPLEASE...",Employer Provided Salary:$90.00 - $95.00 Per Hour,5d,True,,Company - Public,...,,,,,,,,,,
3,Futuretech Consultants LLC,,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,Employer Provided Salary:$40.00 - $45.00 Per Hour,30d+,True,,,...,,,,,,,,,,
4,Clairvoyant,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,Employer Provided Salary:$65.00 - $70.00 Per Hour,12d,True,51 to 200,Company - Private,...,0.87,4.1,4.2,3.9,4.1,4.0,"['""Benefits, compensation, clean work environm...",['No Cons have been reported by the Glassdoor ...,,


In [248]:
df.columns

Index(['Company_name', 'Rating', 'Location', 'Job_title', 'Description',
       'Salary', 'Job_age', 'Easy_apply', 'Employees', 'Type_of_ownership',
       'Sector', 'Founded', 'Industry', 'Revenue_USD', 'Friend_recommend',
       'CEO_approval', 'Career_opportunities', 'Comp_&_benefits',
       'Culture_&_values', 'Senior_management', 'Work/Life_balance', 'Pros',
       'Cons', 'Benefits_rating', 'Benefits_reviews'],
      dtype='object')

### 3. Remove rows only with NaNs

In [249]:
# Keep only the rows that hold at least one non-missing value.
df = df.dropna(axis='index', how='all')
df.shape

(900, 25)

There are no empty rows.

### 4. Remove duplicates

In [250]:
# Discard rows that exactly repeat an earlier row; the first occurrence is kept.
df = df.drop_duplicates(keep='first')
df.shape

(220, 25)

There is a huge number of duplicates (900 rows reduced to 220), but this is a feature of Glassdoor.

### 5. Remove empty columns

In [251]:
# Keep only the columns that hold at least one non-missing value.
df = df.dropna(axis='columns', how='all')
df.shape

(220, 25)

There are no empty columns.

### 6. Split the `Location` column into `City` and `State`

In [252]:
df['Location'].head()

0    Santa Clara, CA
1       Dearborn, MI
2    Santa Clara, CA
3         Newton, MS
4             Remote
Name: Location, dtype: object

In [253]:
# The text before the first comma is the city; values without a comma
# (e.g. "Remote") are kept whole. Non-string values such as NaN are passed
# through unchanged instead of raising a TypeError on the `in` test.
df['City'] = df['Location'].apply(
    lambda location: location.split(',')[0].strip()
    if isinstance(location, str) and ',' in location
    else location
)
df['City'].head()

0    Santa Clara
1       Dearborn
2    Santa Clara
3         Newton
4         Remote
Name: City, dtype: object

In [254]:
# The text after the first comma is the state. It is stripped because
# "Santa Clara, CA".split(',') yields " CA" with a leading space, which would
# otherwise end up in the column. Values without a comma (e.g. "Remote") are
# kept whole, and non-string values such as NaN are passed through unchanged.
df['State'] = df['Location'].apply(
    lambda location: location.split(',')[1].strip()
    if isinstance(location, str) and ',' in location
    else location
)
df['State'].head()

0        CA
1        MI
2        CA
3        MS
4    Remote
Name: State, dtype: object

In [255]:
# Cleanup: `City` and `State` were derived from `Location` in the cells above,
# so the original column is removed from the frame.
del df['Location']

### 7. Add job title seniority

In [256]:
df['Job_title'].unique()

array(['Data Engineer', 'Data Engineer - Terraform',
       'Snowflake Data Engineer', 'Data Engineer (MDM)',
       'AWS Data Engineer', 'DATA ENGINEER', 'Big Data Engineer',
       'Sr. Data Engineer', 'Data Engineer - Flink', 'Jr. Data Engineer',
       'Data Engineer - Remote', 'Data Engineer (L5)',
       'Software Data Engineer', 'GCP Data Engineer',
       'Senior Data Engineer', 'Azure Cloud Data Engineer',
       'GCP DATA ENGINEER', 'Data Test Engineer', 'Azure Data Engineer',
       'Senior Azure Data Bricks Engineer', 'Data Analytics Engineer',
       'Data Engineer (W2 and onsite)', 'Senior Big Data Engineer',
       'Data Engineer- Google Cloud',
       'Data Engineer (ETL & System Administration concentration)',
       'Data Engineer/Data Analyst', 'Data Engineer/Data Scientist',
       'ETL Data Engineer', 'Lead Data Engineer',
       'Sr. Data Engineer with Snowflake', 'Junior Data Engineer',
       'Senior Data Engineer - Remote', 'Data Engineer Level 3',
       'Clou

In [257]:
def get_seniority(job_title: str):
    """Derive a seniority label from keywords in a job title.

    Matching is case-insensitive and anchored on word boundaries, so
    "SENIOR DATA ENGINEER" is recognised while words that merely contain a
    keyword ("Midwest", "Leader") are not. "Jr"/"Sr" match with or without
    the trailing period.

    Args:
        job_title: Raw job title. Non-string values (e.g. NaN) are tolerated.

    Returns:
        "Junior", "Mid", "Senior", "Lead" or "Principle" for the first level
        (checked in that order) whose keyword occurs in the title; ``np.nan``
        when nothing matches or the title is not a string.
    """
    if not isinstance(job_title, str):
        return np.nan

    # Insertion order is the priority order: the first level with a hit wins.
    # NOTE(review): the label "Principle" is kept for backward compatibility;
    # the usual job-title spelling "Principal" is now matched as well.
    seniority = {
        'Junior': r"\b(?:Jr|Junior)\b",
        'Mid': r"\b(?:Mid|Middle)\b",
        'Senior': r"\b(?:Sr|Senior)\b",
        'Lead': r"\bLead\b",
        'Principle': r"\b(?:Principle|Principal)\b",
    }

    for level, pattern in seniority.items():
        if re.search(pattern, job_title, re.IGNORECASE):
            return level

    return np.nan
    
# Label every posting, then remove the helper from the notebook namespace.
df['Seniority'] = df['Job_title'].map(get_seniority)
del get_seniority

df['Seniority'].value_counts()

Senior    45
Junior     4
Lead       4
Name: Seniority, dtype: int64

Add non-standard seniority

In [258]:
def apply_seniority_level(df, job_title, company_name, seniority_level):
    """Overwrite ``Seniority`` for one exact (job title, company) pair.

    Args:
        df: Frame with ``Job_title``, ``Company_name`` and ``Seniority`` columns.
            It is modified in place.
        job_title: Exact job title to match.
        company_name: Exact company name to match.
        seniority_level: Value written to ``Seniority`` on matching rows.

    Returns:
        None. Rows that do not match both values keep their current ``Seniority``.
    """
    # A boolean mask selects the target rows in one vectorised step instead of
    # rebuilding the whole column row by row on every call.
    is_target = (df['Job_title'] == job_title) & (df['Company_name'] == company_name)
    df.loc[is_target, 'Seniority'] = seniority_level

# (job title, company, seniority) overrides, applied in the listed order.
for _title, _company, _level in [
    ("Data Engineer (L5)", "Netflix", "Senior"),
    ("Technical Support Engineer (L5) - Data Platform, Big Data / Analytics", "Netflix", "Senior"),
    ("Data Engineer Level 3", "Infoorigin Inc", "Mid"),
    ("Data Engineer IC4 - US ONLY", "Braintrust", "Lead"),
    ("ETL Engineer/ Data Analyst - Software Engineer III", "JPMorgan Chase Bank, N.A.", "Senior"),
    ("Software Engineer III (AI, Data, Python)", "JPMorgan Chase Bank, N.A.", "Senior"),
    ("Data Engineer 925", "Certec Consulting", "Senior"),
]:
    apply_seniority_level(df, _title, _company, _level)

# Remove the helper and the loop variables from the notebook namespace.
del apply_seniority_level, _title, _company, _level

df['Seniority'].value_counts()


Senior    50
Lead       5
Junior     4
Mid        1
Name: Seniority, dtype: int64

### 8. Parse salary

#### 8.1 Employer provided salary

In [259]:
# True only when the salary is text that carries the "Employer Provided Salary" marker.
df['Salary_employer_provided'] = df['Salary'].apply(
    lambda text: isinstance(text, str) and "Employer Provided Salary" in text
)
df['Salary_employer_provided'].value_counts()

True     128
False     92
Name: Salary_employer_provided, dtype: int64

#### 8.2 Salary per hour

In [260]:
# True only when the salary is text that contains "Per Hour".
df['Salary_hourly'] = df['Salary'].apply(
    lambda text: isinstance(text, str) and "Per Hour" in text
)
df['Salary_hourly'].value_counts()

False    139
True      81
Name: Salary_hourly, dtype: int64

#### 8.3 Salary min

In [261]:
def get_salary_min(salary):
    """Extract the lower bound of a salary string as a number.

    The first number in the text is used; a trailing "K" multiplies it by
    1000, e.g. "$63K - $90K (Glassdoor est.)" -> 63000.0. The value is
    returned in whatever unit the text uses (hourly amounts are not
    annualised here).

    Args:
        salary: Salary text, or a non-string placeholder such as NaN.

    Returns:
        The first amount as a float; ``np.nan`` when the text contains no
        number; the input unchanged when it is not a string.
    """
    if not isinstance(salary, str):
        return salary

    # Drop thousands separators so "90,000" is read as one number, not "90" and "000".
    text = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", salary)

    # Each hit is (number, "K" or "").
    amounts = re.findall(r"(\d+(?:\.\d+)?)(K?)", text)
    if not amounts:
        # Text without any figure used to raise IndexError.
        return np.nan

    number, thousands = amounts[0]
    value = float(number)
    return value * 1000 if thousands else value
    
def calculate_yearly_income(hourly_rate):
    """Annualise an hourly rate, assuming 40 hours a week for 52 weeks."""
    full_time_hours = 52 * 40
    return hourly_rate * full_time_hours

# Parse the lower bound, then annualise it on the rows flagged as hourly.
df['Salary_min'] = df['Salary'].map(get_salary_min)
df['Salary_min'] = df.apply(
    lambda posting: posting['Salary_min']
    if posting['Salary_hourly'] != True
    else calculate_yearly_income(posting['Salary_min']),
    axis=1,
)

del get_salary_min

df['Salary_min']

0      141440.0
1       63000.0
2      187200.0
3       83200.0
4      135200.0
         ...   
624         NaN
755         NaN
778     81000.0
825    120640.0
895         NaN
Name: Salary_min, Length: 220, dtype: float64

#### 8.4 Salary max

In [262]:
def get_salary_max(salary):
    """Extract the upper bound of a salary string as a number.

    The last number in the text is used; a trailing "K" multiplies it by
    1000, e.g. "$63K - $90K (Glassdoor est.)" -> 90000.0. When the text
    holds a single amount, that amount is returned. The value is returned in
    whatever unit the text uses (hourly amounts are not annualised here).

    Args:
        salary: Salary text, or a non-string placeholder such as NaN.

    Returns:
        The last amount as a float; ``np.nan`` when the text contains no
        number; the input unchanged when it is not a string.
    """
    if not isinstance(salary, str):
        return salary

    # Drop thousands separators so "100,000" is read as one number, not "100" and "000".
    text = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", salary)

    # Each hit is (number, "K" or "").
    amounts = re.findall(r"(\d+(?:\.\d+)?)(K?)", text)
    if not amounts:
        # Text without any figure used to raise IndexError.
        return np.nan

    number, thousands = amounts[-1]
    value = float(number)
    return value * 1000 if thousands else value

# Parse the upper bound, then annualise it on the rows flagged as hourly.
df['Salary_max'] = df['Salary'].map(get_salary_max)
df['Salary_max'] = df.apply(
    lambda posting: posting['Salary_max']
    if posting['Salary_hourly'] != True
    else calculate_yearly_income(posting['Salary_max']),
    axis=1,
)

del get_salary_max

df['Salary_max']

0      141440.0
1       90000.0
2      197600.0
3       93600.0
4      145600.0
         ...   
624         NaN
755         NaN
778    115000.0
825    131040.0
895         NaN
Name: Salary_max, Length: 220, dtype: float64

In [263]:
# Cleanup

del calculate_yearly_income

#### 8.5 Salary currency 

In [264]:
def get_currency(salary: str):
    """Extract the text that precedes the first amount in a salary string.

    For "Employer Provided Salary:..." strings only the part between the
    colon and the first digit is used; otherwise everything before the first
    digit is used.

    Args:
        salary: Salary text, or a non-string placeholder such as NaN.

    Returns:
        The extracted prefix with colons and surrounding whitespace removed
        (e.g. "$"); ``np.nan`` when the text has no such prefix followed by a
        digit; the input unchanged when it is not a string.
    """
    if not isinstance(salary, str):
        return salary

    if "Employer Provided Salary" in salary:
        pattern_currency = r"(\:.+?(?=\d))"
    else:
        pattern_currency = r"(.+?(?=\d))"

    matched = re.search(pattern_currency, salary)
    if matched is None:
        # Text without any digit used to raise AttributeError on `.group`.
        return np.nan

    # Remove the colon before trimming so ": $" becomes "$" rather than " $".
    return matched.group(1).replace(":", "").strip()
    
# Derive the currency marker for every posting, then drop the helper.
df['Salary_currency'] = df['Salary'].map(get_currency)
del get_currency

df['Salary_currency'].value_counts()

$    199
Name: Salary_currency, dtype: int64

In [265]:
del df['Salary']

#### 8.6 Salary average

In [266]:
# Midpoint of the salary range; NaN wherever either bound is missing.
df['Salary_avg'] = df['Salary_max'].add(df['Salary_min']).div(2)
df['Salary_avg']

0      141440.0
1       76500.0
2      192400.0
3       88400.0
4      140400.0
         ...   
624         NaN
755         NaN
778     98000.0
825    125840.0
895         NaN
Name: Salary_avg, Length: 220, dtype: float64

### 9. Employees

In [267]:
df['Employees'].value_counts()

1 to 50          56
51 to 200        50
10000+           23
1001 to 5000     20
201 to 500       15
501 to 1000      10
5001 to 10000     4
Name: Employees, dtype: int64

### 10. Type of ownership

In [268]:
df['Type_of_ownership'].value_counts()

Company - Private                 119
Company - Public                   57
Nonprofit Organization              7
Contract                            5
Subsidiary or Business Segment      4
Self-employed                       2
Private Practice / Firm             2
Name: Type_of_ownership, dtype: int64

### 11. Sector

In [269]:
df['Sector'].value_counts()

Information Technology                         77
Financial Services                             16
Human Resources & Staffing                      7
Management & Consulting                         7
Insurance                                       5
Manufacturing                                   5
Education                                       4
Healthcare                                      4
Energy, Mining & Utilities                      4
Media & Communication                           3
Pharmaceutical & Biotechnology                  3
Retail & Wholesale                              2
Nonprofit & NGO                                 1
Agriculture                                     1
Transportation & Logistics                      1
Arts, Entertainment & Recreation                1
Personal Consumer Services                      1
Aerospace & Defense                             1
Construction, Repair & Maintenance Services     1
Name: Sector, dtype: int64

### 12. Industry

In [270]:
df['Industry'].value_counts()

Information Technology Support Services    49
Computer Hardware Development              10
Enterprise Software & Network Solutions     9
Business Consulting                         7
Banking & Lending                           6
Internet & Web Services                     6
Investment & Asset Management               5
Insurance Carriers                          5
HR Consulting                               5
Health Care Services & Hospitals            4
Energy & Utilities                          4
Financial Transaction Processing            3
Software Development                        3
Biotech & Pharmaceuticals                   3
Education & Training Services               3
Accounting & Tax                            2
Advertising & Public Relations              2
Staffing & Subcontracting                   2
Commercial Printing                         2
Wholesale                                   1
Construction                                1
Aerospace & Defense               

### 13. Company age

In [271]:
# Company age in years, measured against the year in which the notebook runs.
# NOTE(review): the result therefore depends on the run date; pin the year if
# the column must be reproducible. `datetime` is imported in the top import cell.
year = datetime.date.today().year

# Vectorised subtraction; rows with a missing `Founded` value stay NaN.
df['Company_age'] = year - df['Founded']

del df['Founded'], year

df['Company_age'].value_counts()

16.0     8
15.0     8
8.0      7
10.0     6
5.0      5
39.0     5
9.0      5
7.0      5
24.0     5
27.0     4
12.0     4
19.0     4
20.0     4
4.0      3
17.0     3
25.0     3
85.0     3
26.0     3
6.0      2
11.0     2
224.0    2
14.0     2
18.0     2
41.0     2
13.0     2
53.0     1
239.0    1
76.0     1
160.0    1
50.0     1
122.0    1
47.0     1
35.0     1
81.0     1
22.0     1
128.0    1
211.0    1
170.0    1
23.0     1
171.0    1
158.0    1
52.0     1
97.0     1
89.0     1
54.0     1
77.0     1
37.0     1
28.0     1
106.0    1
173.0    1
29.0     1
3.0      1
Name: Company_age, dtype: int64

### 14. Job age

In [272]:
np.sort(df['Job_age'].unique())

array(['10d', '11d', '12d', '13d', '14d', '16d', '17d', '18d', '19d',
       '20d', '22d', '24d', '24h', '25d', '26d', '28d', '2d', '30d+',
       '3d', '4d', '5d', '6d', '7d', '9d'], dtype=object)

In [273]:
def clean_job_age(job_age):
    """Convert a posting age such as "12d" into a number of days.

    "24h" is counted as 1 day and the open-ended "30d+" as 31 days.

    Args:
        job_age: Age text such as "3d", or a non-string value.

    Returns:
        The age in days as an int. Non-string input (NaN, or an int left
        behind by an earlier run of the conversion cell) is returned
        unchanged instead of raising AttributeError.
    """
    if not isinstance(job_age, str):
        return job_age

    job_age = job_age.strip()

    special_cases = {"24h": 1, "30d+": 31}
    if job_age in special_cases:
        return special_cases[job_age]

    return int(job_age.replace("d", ""))

# Convert every posting age to days, then drop the helper.
df['Job_age'] = df['Job_age'].map(clean_job_age)
del clean_job_age

df['Job_age'].value_counts()


31    91
1     18
3     14
12    14
6     13
13     9
2      9
5      8
4      7
18     6
10     5
24     4
17     4
20     3
19     2
9      2
7      2
25     2
14     2
16     1
28     1
11     1
22     1
26     1
Name: Job_age, dtype: int64

### 15. Revenue

In [274]:
df['Revenue_USD'].value_counts()

$5 to $25 million             23
$10+ billion                  17
$25 to $100 million           16
$1 to $5 million              15
Less than $1 million           9
$100 to $500 million           8
$1 to $5 billion               7
$5 to $10 billion              5
$500 million to $1 billion     4
Name: Revenue_USD, dtype: int64

### 16. Preview columns so far

In [275]:
df.dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           float64
Pros                         object
Cons                         object
Benefits_rating             float64
Benefits_reviews             object
City                         object
State                        object
Seniority                    object
Salary_employer_provided       bool
Salary_hourly                  bool
Salary_min                  

### 17. Change columns order

##### 17.1 Move salary values

In [276]:
def move_column__to_index(column_name: str, index: int):
    """Move one column of the notebook-level ``df`` to position ``index``.

    The column is popped first, so ``index`` refers to positions in the
    frame without that column.
    """
    column = df.pop(column_name)
    df.insert(index, column_name, column)


def move_columns_to_index(column_names: list[str], index: int):
    """Move several columns of the notebook-level ``df`` to consecutive positions.

    The first name lands at ``index``, the next at ``index + 1``, and so on,
    preserving the order of ``column_names``.
    """
    for offset, name in enumerate(column_names):
        move_column__to_index(name, index + offset)

# Place the salary columns, in this order, starting at position 3.
move_columns_to_index(
    column_names=[
        'Salary_min',
        'Salary_max',
        'Salary_avg',
        'Salary_currency',
        'Salary_employer_provided',
        'Salary_hourly',
    ],
    index=3,
)

df.dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                  bool
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           float64
Pros                         object
Cons                         object
Benefits_rating             float64
Benefits_reviews            

##### 17.2 Move Seniority

In [277]:
# Place `Seniority` at position 3.
move_column__to_index(column_name='Seniority', index=3)
df.dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                  bool
Description                  object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           float64
Pros                         object
Cons                         object
Benefits_rating             

##### 17.3 Move City, State

In [278]:
# Place `City` and `State` at positions 11 and 12.
move_columns_to_index(column_names=['City', 'State'], index=11)
df.dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                  bool
Description                  object
City                         object
State                        object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           float64
Pros                        

##### 17.4 Move Company age

In [279]:
# Place `Company_age` at position 19.
move_column__to_index(column_name='Company_age', index=19)
df.dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                  bool
Description                  object
City                         object
State                        object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Company_age                 float64
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Culture_&_values            float64
Senior_management           float64
Work/Life_balance           

##### 17.5 Move Senior_management and Work/Life_balance

In [280]:
# Place `Senior_management` and `Work/Life_balance` at positions 25 and 26.
move_columns_to_index(column_names=['Senior_management', 'Work/Life_balance'], index=25)
df.dtypes

Company_name                 object
Rating                      float64
Job_title                    object
Seniority                    object
Salary_min                  float64
Salary_max                  float64
Salary_avg                  float64
Salary_currency              object
Salary_employer_provided       bool
Salary_hourly                  bool
Description                  object
City                         object
State                        object
Job_age                       int64
Easy_apply                     bool
Employees                    object
Type_of_ownership            object
Sector                       object
Industry                     object
Company_age                 float64
Revenue_USD                  object
Friend_recommend            float64
CEO_approval                float64
Career_opportunities        float64
Comp_&_benefits             float64
Senior_management           float64
Work/Life_balance           float64
Culture_&_values            

## 18. Technology requirements - parsing the job description

##### 19 Git and code repositories

In [281]:
def check_repo(job_description: str):
    """Return the first code-hosting platform (or plain Git) named in a description.

    Platform names are searched case-insensitively as whole words, in the
    order of the list below.

    Args:
        job_description: Job description text. Non-string values (e.g. NaN)
            are tolerated.

    Returns:
        The matching entry exactly as spelled in ``git_platforms``, or
        ``np.nan`` when none is mentioned or the description is not a string.
    """
    if not isinstance(job_description, str):
        # A missing description used to raise TypeError inside re.search.
        return np.nan

    git_platforms = [
        r"Github",
        r"GitLab",
        r"Bitbucket",
        r"SourceForge",
        r"Launchpad",
        r"Google Cloud Source Repositories",
        r"AWS CodeCommit",
        r"GitBucket",
        r"Gogs",
        r"Gitea",
        r"Apache Allura",
        r"RhodeCode",
        r"ONEDEV",
        r"Codeberg",
        r"Git"  # IMPORTANT: must stay last so a specific platform wins over plain "Git".
    ]

    for platform in git_platforms:
        if re.search(r"\b" + platform + r"\b", job_description, re.IGNORECASE):
            return platform

    return np.nan
        
# Record the detected platform for every posting, then drop the helper.
df['Git'] = df['Description'].map(check_repo)
del check_repo

df['Git'].value_counts()


Git       4
GitLab    1
Name: Git, dtype: int64

In [282]:
def make_is_tech(cloud_names: list[str]):
    """Build a predicate that tells whether a description mentions a technology.

    Args:
        cloud_names: Names (regex fragments) of one technology. Each is
            searched case-insensitively between word boundaries.

    Returns:
        A function ``check_tech(job_description)`` that returns True when any
        of the names occurs in the description and False otherwise, including
        for non-string input such as NaN.
    """
    # Compile once per technology rather than once per description.
    patterns = [
        re.compile(r"\b" + name + r"\b", re.IGNORECASE)
        for name in cloud_names
    ]

    def check_tech(job_description: str):
        if not isinstance(job_description, str):
            # A missing description used to raise TypeError inside re.search.
            return False

        return any(pattern.search(job_description) for pattern in patterns)

    return check_tech

In [283]:
def add_is_tech_column_to_df(column_name: str, tech_names: list[str]):
    """Add a True/False column to the notebook-level ``df``.

    The column ``column_name`` is True for rows whose ``Description`` matches
    any of ``tech_names`` according to ``make_is_tech``.
    """
    mentions_tech = make_is_tech(tech_names)
    df[column_name] = df['Description'].apply(mentions_tech)


#### 20. Cloud Platforms

##### 20.1 AWS


Provides on-demand cloud computing platforms and APIs to individuals, companies, and governments, on a metered, pay-as-you-go basis. Often times, clients will use this in combination with autoscaling.

In [284]:
# Flag postings whose description mentions AWS under either name.
column_name = 'AWS'
cloud_names = [r"Amazon Web Services", r"AWS"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    196
True      24
Name: AWS, dtype: int64

##### 20.2 Microsoft Azure

A cloud computing platform operated by Microsoft that provides access to, and management and development of, applications and services through data centers distributed around the world.

In [285]:
# Flag postings whose description mentions Azure.
column_name = 'Microsoft_Azure'
cloud_names = [r"Microsoft Azure", r"Azure"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    196
True      24
Name: Microsoft_Azure, dtype: int64

##### 20.3 GCP

A suite of cloud computing services that runs on the same infrastructure that Google uses internally for its end-user products, such as Google Search, Gmail, Google Drive, and YouTube.

In [286]:
# Flag postings whose description mentions Google Cloud Platform.
cloud_names = [
    r"Google Cloud Platform",
    r"GCP",
    ]

# The column used to be named 'GPC' (letters transposed).
# NOTE(review): update any later cell or consumer that still reads 'GPC'.
column_name = 'GCP'

add_is_tech_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

False    204
True      16
Name: GPC, dtype: int64

##### 20.4 Alibaba Cloud

Alibaba Cloud provides cloud computing services to online businesses and Alibaba's own e-commerce ecosystem.

In [287]:
# Flag postings whose description mentions Alibaba Cloud under either name.
column_name = 'Alibaba_Cloud'
cloud_names = [r"Alibaba Cloud", r"Aliyun"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    220
Name: Alibaba_Cloud, dtype: int64

##### 20.4 Oracle Cloud

Providing servers, storage, network, applications and services through a global network of Oracle Corporation managed data centers

In [288]:
# Flag postings whose description mentions Oracle Cloud or its abbreviation.
column_name = 'Oracle_Cloud'
cloud_names = [r"Oracle Cloud", r"OCI"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    220
Name: Oracle_Cloud, dtype: int64

##### 20.5 IBM Cloud

A set of cloud computing services for business

In [289]:
# Flag postings whose description matches any of the IBM Cloud search terms.
column_name = 'IBM_cloud'
cloud_names = [r"IBM Cloud", r"Kyndryl", r"Bluemix"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    220
Name: IBM_cloud, dtype: int64

##### 20.6 Tencent Cloud

Tencent Cloud provides businesses across the globe with stable and secure industry-leading cloud products and services, leveraging technological advancements such as cloud computing, Big Data, AI, IoT and network security.

In [290]:
# Flag postings whose description mentions Tencent Cloud.
column_name = 'Tencent_cloud'
cloud_names = [r"Tencent Cloud"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    220
Name: Tencent_cloud, dtype: int64

##### 20.8 OVHcloud

A French cloud computing company which offers VPS, dedicated servers and other web services

In [291]:
# Flag postings whose description mentions OVHcloud under either name.
column_name = 'OVHcloud'
cloud_names = [r"OVHcloud", r"OVH"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    220
Name: OVHcloud, dtype: int64

##### 20.9 DigitalOcean

A cloud hosting provider that offers cloud computing services and Infrastructure as a Service (IaaS). Known for pricing and scalability

In [292]:
# Flag postings whose description mentions DigitalOcean.
column_name = 'DigitalOcean_cloud'
cloud_names = [r"DigitalOcean"]

add_is_tech_column_to_df(column_name=column_name, tech_names=cloud_names)

df[column_name].value_counts()

False    220
Name: DigitalOcean_cloud, dtype: int64

##### 20.10 Linode

An American cloud hosting provider that focused on providing Linux-based virtual machines, cloud infrastructure, and managed services.

In [293]:
# Flag postings whose description mentions Linode or Akamai.
cloud_names = [
    r"Linode",
    r"Akamai"
    ]

# The column used to be named 'Lincode_cloud' (misspelling of Linode).
# NOTE(review): update any later cell or consumer that still reads 'Lincode_cloud'.
column_name = 'Linode_cloud'

add_is_tech_column_to_df(column_name, cloud_names)

df[column_name].value_counts()

False    220
Name: Lincode_cloud, dtype: int64

In [294]:
del cloud_names

#### 21. Relational Database Management Systems (RDBMS)

##### 21.1 PostgreSQL
Can be used as a data store for big data solutions.
PostgreSQL, also known as Postgres, is a free and open-source relational database management system (RDBMS) emphasizing extensibility and SQL compliance. <br>
PostgreSQL features transactions with Atomicity, Consistency, Isolation, Durability (ACID) properties, automatically updatable views, materialized views, triggers, foreign keys, and stored procedures. <br> It is designed to handle a range of workloads, from single machines to data warehouses or Web services with many concurrent users. 

In [295]:
# Flag postings whose description mentions PostgreSQL under either name.
column_name = 'PostgreSQL'
tool_names = [r"PostgreSQL", r"Postgres"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    219
True       1
Name: PostgreSQL, dtype: int64

##### 21.2 Microsoft SQL Server


A software product with the primary function of storing and retrieving data as requested by other software applications—which may run either on the same computer or on another computer across a network (including the Internet).

In [296]:
# Flag postings whose description mentions Microsoft SQL Server.
column_name = 'Microsoft_SQL_Server'
tool_names = [r"Microsoft SQL", r"SQL Server"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    213
True       7
Name: Microsoft_SQL_Server, dtype: int64

##### 21.3 MySQL

An open-source relational database management system.

In [297]:
# Flag postings whose description mentions MySQL.
column_name = 'MySQL'
tool_names = [r"MySQL"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    218
True       2
Name: MySQL, dtype: int64

##### 21.4 IBM Db2 warehouse

A family of data management products, including database servers, developed by IBM. It initially supported the relational model, but was extended to support object–relational features and non-relational structures like JSON and XML.

In [298]:
# Flag postings whose description mentions Db2, standalone or written as "IBMDb2".
column_name = 'IBM_Db2'
tool_names = [r"Db2", r"IBMDb2"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    217
True       3
Name: IBM_Db2, dtype: int64

##### 21.5. Oracle PL/SQL

A procedural language designed specifically to embrace SQL statements within its syntax. PL/SQL program units are compiled by the Oracle Database server and stored inside the database. At run time, both PL/SQL and SQL run within the same server process, bringing optimal efficiency.

In [299]:
# Flag postings whose description mentions PL/SQL in any of these spellings.
column_name = 'Oracle_PL_SQL'
tool_names = [r"PL/SQL", r"PL / SQL", r"Procedural Language for SQL"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    219
True       1
Name: Oracle_PL_SQL, dtype: int64

#### 22. NoSQL Database Management Systems

##### 22.1 MongoDB

A source-available cross-platform document-oriented database program. Classified as a NoSQL database program, MongoDB uses JSON-like documents with optional schemas

In [300]:
# Flag postings whose description mentions MongoDB, with or without a space.
column_name = 'MongoDB'
tool_names = [r"MongoDB", r"Mongo DB"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    219
True       1
Name: MongoDB, dtype: int64

##### 22.2 Cassandra

A free and open-source, distributed, wide-column store, NoSQL database management system designed to handle large amounts of data across many commodity servers, providing high availability with no single point of failure

In [301]:
# Flag postings whose description mentions Cassandra.
column_name = 'Cassandra'
tool_names = [r"Cassandra"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    217
True       3
Name: Cassandra, dtype: int64

##### 22.3 Amazon DynamoDB

A proprietary NoSQL database service that supports key–value and document data structures and is offered by Amazon.com as part of the Amazon Web Services portfolio.

In [302]:
# Flag postings whose description mentions DynamoDB (either spelling) or SimpleDB.
column_name = 'Amazon_DynamoDB'
tool_names = [r"DynamoDB", r"Dynamo DB", r"SimpleDB"]

add_is_tech_column_to_df(column_name=column_name, tech_names=tool_names)

df[column_name].value_counts()

False    218
True       2
Name: Amazon_DynamoDB, dtype: int64

##### 22.4 Neo4j

A graph database management system developed by Neo4j, Inc. Described by its developers as an ACID-compliant transactional database with native graph storage and processing

In [303]:
# Flag postings whose description mentions Neo4j.
tool_names = [
    r"Neo4j"
    ]

# The column used to be named 'Neo4j ' with a trailing space, so df['Neo4j'] raised KeyError.
# NOTE(review): update any later cell or consumer that relies on the old name.
column_name = 'Neo4j'

add_is_tech_column_to_df(column_name, tool_names)

df[column_name].value_counts()

False    220
Name: Neo4j , dtype: int64

##### 22.5 Apache Solr

An open-source enterprise-search platform, written in Java. Its major features include full-text search, hit highlighting, faceted search, real-time indexing, dynamic clustering, database integration, NoSQL features, and rich document (e.g., Word, PDF) handling.

In [304]:
# Flag postings whose description mentions Solr.
tool_names = [
    r"Solr"
    ]

# The column used to be named 'Apache_Solr ' with a trailing space, so df['Apache_Solr'] raised KeyError.
# NOTE(review): update any later cell or consumer that relies on the old name.
column_name = 'Apache_Solr'

add_is_tech_column_to_df(column_name, tool_names)

df[column_name].value_counts()

False    220
Name: Apache_Solr , dtype: int64

#### 22. Data warehousing and Analytics

##### 22.1 Amazon Redshift

A data warehouse product which forms part of the larger cloud-computing platform Amazon Web Services. It is built on top of technology from the massive parallel processing data warehouse company ParAccel, to handle large scale data sets and database migrations.

In [305]:
# Single name variant passed to add_is_tech_column_to_df for the 'Amazon_Redshift' column.
column_name = 'Amazon_Redshift'
tool_names = [r"Redshift"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    215
True       5
Name: Amazon_Redshift, dtype: int64

##### 22.2 Google BigQuery

A serverless data warehouse that enables scalable analysis over petabytes of data. It is a Platform as a Service that supports querying using ANSI SQL. It also has built-in machine learning capabilities.

In [306]:
# Single name variant passed to add_is_tech_column_to_df for the 'Google_BigQuery' column.
column_name = 'Google_BigQuery'
tool_names = [r"BigQuery"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    213
True       7
Name: Google_BigQuery, dtype: int64

##### 22.3 Snowflake

Snowflake enables data storage, processing, and analytic solutions.

In [307]:
# Single name variant passed to add_is_tech_column_to_df for the 'Snowflake' column.
column_name = 'Snowflake'
tool_names = [r"Snowflake"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    207
True      13
Name: Snowflake, dtype: int64

##### 22.4 Oracle Exadata

Designed to run Oracle Database workloads, such as an OLTP application running simultaneously with analytics processing. Historically, specialized database computing platforms were designed for a particular workload, such as data warehousing, and were poor or unusable for other workloads, such as OLTP.

In [308]:
# Single name variant passed to add_is_tech_column_to_df for the 'Oracle_Exadata' column.
column_name = 'Oracle_Exadata'
tool_names = [r"Exadata"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    220
Name: Oracle_Exadata, dtype: int64

##### 22.5 SAP HANA

A multi-model database that stores data in its memory instead of keeping it on a disk.

In [309]:
# Single name variant passed to add_is_tech_column_to_df for the 'SAP_HANA' column.
column_name = 'SAP_HANA'
tool_names = [r"HANA"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    220
Name: SAP_HANA, dtype: int64

##### 22.6 Teradata

Teradata is mainly suited to building large-scale data warehousing applications.

In [310]:
# Single name variant passed to add_is_tech_column_to_df for the 'Teradata' column.
column_name = 'Teradata'
tool_names = [r"Teradata"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    216
True       4
Name: Teradata, dtype: int64

#### 23. Data Integration and Processing

##### 23.1 Informatica PowerCenter - Data integration tool


Used extensively for ETL operations, data quality, data masking, data replication, data virtualization, and master data management services.

In [311]:
# One-word and two-word spellings are both handed to add_is_tech_column_to_df.
column_name = 'Informatica_PowerCenter'
tool_names = [r"PowerCenter", r"Power Center"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    220
Name: Informatica_PowerCenter, dtype: int64

##### 23.2 DataBricks - Data processing and analytics platform

A unified set of tools for building, deploying, sharing, and maintaining enterprise-grade data solutions at scale. 

In [312]:
# One-word and two-word spellings are both handed to add_is_tech_column_to_df.
column_name = 'Databricks'
tool_names = [r"Data Bricks", r"Databricks"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    206
True      14
Name: Databricks, dtype: int64

##### 23.3 Presto - Query engine

A distributed query engine for big data using the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows the use of multiple data sources within a single query.

In [313]:
# Name variants handed to add_is_tech_column_to_df for the 'Presto' column.
# NOTE(review): if the helper matches substrings, "Presto" already covers the
# two longer variants; confirm against the helper's implementation.
column_name = 'Presto'
tool_names = [r"Presto", r"PrestoDB", r"PrestoSQL"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    220
Name: Presto, dtype: int64

#### 24. Stream processing tools

##### 24.1 Apache Kafka

An open-source system, distributed event store and stream-processing platform. The project aims to provide a unified, high-throughput, low-latency platform for handling real-time data feeds.

In [314]:
# Single name variant passed to add_is_tech_column_to_df for the 'Apache_Kafka' column.
column_name = 'Apache_Kafka'
tool_names = [r"Kafka"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    209
True      11
Name: Apache_Kafka, dtype: int64

##### 24.2 Apache Flink

A framework for processing data streams at large scale and delivering real-time analytical insights from your streaming application.

In [315]:
# Single name variant passed to add_is_tech_column_to_df for the 'Apache_Flink' column.
column_name = 'Apache_Flink'
tool_names = [r"Flink"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    218
True       2
Name: Apache_Flink, dtype: int64

##### 24.3 Dataflow


Dataflow is a managed service provided by Google Cloud for building and executing data processing pipelines. It enables developers to create scalable and efficient batch and streaming data pipelines using a simple programming model.

In [316]:
# Single name variant passed to add_is_tech_column_to_df for the 'Dataflow' column.
column_name = 'Dataflow'
tool_names = [r"Dataflow"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    216
True       4
Name: Dataflow, dtype: int64

#### 25. Workflow orchestration tools

##### 25.1 Apache Airflow

Apache Airflow is an open-source platform used for programmatically creating, scheduling, and monitoring complex workflows or data pipelines. It allows users to define and execute a sequence of tasks or operations, while providing tools for tracking and troubleshooting workflow executions.

In [317]:
# Single name variant passed to add_is_tech_column_to_df for the 'Apache_Airflow' column.
column_name = 'Apache_Airflow'
tool_names = [r"Airflow"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    211
True       9
Name: Apache_Airflow, dtype: int64

##### 25.2 Luigi

Luigi is a Python-based open-source workflow management system that helps to build complex pipelines of batch jobs. It provides a flexible and extensible architecture to create and manage complex data workflows.

In [318]:
# Single name variant passed to add_is_tech_column_to_df for the 'Luigi' column.
column_name = 'Luigi'
tool_names = [r"Luigi"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    220
Name: Luigi, dtype: int64

##### 25.3 SSIS

SQL Server Integration Services (SSIS) is a Microsoft tool used for building data integration and ETL (extract, transform, load) workflows. It allows users to perform a range of tasks such as data extraction, transformation, and loading from various sources to different destinations.

In [319]:
# The acronym and the spelled-out product name are both handed to add_is_tech_column_to_df.
column_name = 'SSIS'
tool_names = [r"SSIS", r"SQL Server Integration Services"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    215
True       5
Name: SSIS, dtype: int64

#### 26. Big Data processing

##### 26.1 Apache Hadoop

Apache Hadoop is an open-source framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It provides a distributed file system and supports various distributed computing models, such as MapReduce and Spark, for processing and analyzing large data sets.

In [320]:
# Single name variant passed to add_is_tech_column_to_df for the 'Apache_Hadoop' column.
column_name = 'Apache_Hadoop'
tool_names = [r"Hadoop"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    202
True      18
Name: Apache_Hadoop, dtype: int64

##### 26.2 Apache Hive


Apache Hive is a data warehouse software that facilitates querying and managing large datasets stored in Hadoop file systems using a SQL-like language called HiveQL. It provides a high-level interface for data analysts and developers to analyze, transform, and summarize data stored in Hadoop Distributed File System (HDFS) and other compatible storage systems.

In [321]:
# Single name variant passed to add_is_tech_column_to_df for the 'Apache_Hive' column.
column_name = 'Apache_Hive'
tool_names = [r"Hive"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    205
True      15
Name: Apache_Hive, dtype: int64

##### 26.3 Apache Spark

Apache Spark is a distributed computing framework designed to run large-scale data processing and analysis workloads in parallel. It can be used for batch processing, real-time stream processing, machine learning, and graph processing, among other things.

In [322]:
# Name variants handed to add_is_tech_column_to_df for the 'Apache_Spark' column.
# NOTE(review): if the helper matches substrings, "Spark" already covers
# "PySpark"; confirm against the helper's implementation.
column_name = 'Apache_Spark'
tool_names = [r"Spark", r"PySpark"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    185
True      35
Name: Apache_Spark, dtype: int64

In [323]:
# Display the dtype of every column currently in df.
df.dtypes

Company_name      object
Rating           float64
Job_title         object
Seniority         object
Salary_min       float64
                  ...   
Luigi               bool
SSIS                bool
Apache_Hadoop       bool
Apache_Hive         bool
Apache_Spark        bool
Length: 71, dtype: object

#### 25. Linux

Family of Unix-like operating systems.

In [324]:
# Single name variant passed to add_is_tech_column_to_df for the 'Linux' column.
column_name = 'Linux'
tool_names = [r"Linux"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    216
True       4
Name: Linux, dtype: int64

#### 26. Programming languages

##### 26.1 Python

Python is a high-level, interpreted programming language used for various purposes such as web development, data analysis, artificial intelligence, and more.

In [325]:
# Single name variant passed to add_is_tech_column_to_df for the 'Python' column.
column_name = 'Python'
tool_names = [r"Python"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    162
True      58
Name: Python, dtype: int64

##### 26.2 R

A programming language and environment for statistical graphics and computing.

In [326]:
# Name variants handed to add_is_tech_column_to_df for the 'R' column.
# NOTE(review): a one-letter pattern is only safe if the helper matches whole
# words; confirm against the helper's implementation.
column_name = 'R'
tool_names = [r"R", r"RStudio"]

add_is_tech_column_to_df(column_name, tool_names)

# Tally the values stored in df[column_name].
df[column_name].value_counts()

False    216
True       4
Name: R, dtype: int64