#### 1. Importing all packages

In [109]:
# External
import numpy as np
import pandas as pd

#### 2. Importing  a CSV file

In [99]:
df = pd.read_csv("data/RAW/Data_Engineer_06-03-2023_23-41.csv")
df.head()

Unnamed: 0,Company_name,Rating,Location,Job_title,Description,Salary,Job_age,Easy_apply,Employees,Type_of_ownership,...,CEO_approval,Career_opportunities,Comp_&_benefits,Culture_&_values,Senior_management,Work/Life_balance,Pros,Cons,Benefits_rating,Benefits_reviews
0,Infoway solutions LLC,3.9,"Santa Clara, CA",Data Engineer,Need min 10+ Years exp\nData Engineer\nBay Are...,Employer Provided Salary:$68.00 Per Hour,3d,True,,Company - Private,...,0.84,4.0,3.9,4.0,3.7,3.9,"['""Nice and friendly work environment"" (in 1 r...",['No Cons have been reported by the Glassdoor ...,2.2,
1,Optimal Inc.,3.6,"Dearborn, MI",Data Engineer - Terraform,Position Description:\nThe GDIA Data Factory P...,$63K - $90K (Glassdoor est.),12d,True,1 to 50,Nonprofit Organization,...,0.78,3.2,3.7,3.3,2.6,4.5,['No Pros have been reported by the Glassdoor ...,"['""Antisocial and downright rude CEO, callous ...",5.0,
2,Strivernet RPO Services Ltd,,"Santa Clara, CA",Data Engineer,"(W2 CANDIDATES ONLY) (SANTA CLARA, CA)\nPLEASE...",Employer Provided Salary:$90.00 - $95.00 Per Hour,5d,True,,Company - Public,...,,,,,,,,,,
3,Futuretech Consultants LLC,,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,Employer Provided Salary:$40.00 - $45.00 Per Hour,30d+,True,,,...,,,,,,,,,,
4,Clairvoyant,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,Employer Provided Salary:$65.00 - $70.00 Per Hour,12d,True,51 to 200,Company - Private,...,0.87,4.1,4.2,3.9,4.1,4.0,"['""Benefits, compensation, clean work environm...",['No Cons have been reported by the Glassdoor ...,,


In [100]:
df.columns

Index(['Company_name', 'Rating', 'Location', 'Job_title', 'Description',
       'Salary', 'Job_age', 'Easy_apply', 'Employees', 'Type_of_ownership',
       'Sector', 'Founded', 'Industry', 'Revenue_USD', 'Friend_recommend',
       'CEO_approval', 'Career_opportunities', 'Comp_&_benefits',
       'Culture_&_values', 'Senior_management', 'Work/Life_balance', 'Pros',
       'Cons', 'Benefits_rating', 'Benefits_reviews'],
      dtype='object')

#### 3. Remove rows only with NaNs

In [101]:
df = df.dropna(how='all')
df.shape

(900, 25)

There is no empty rows

#### 4. Remove duplicates

In [102]:
df = df.drop_duplicates()
df.shape

(220, 25)

There is huge amount of duplicates. But this is the feature of glassdoor

#### 5. Remove empty columns

In [103]:
df = df.dropna(axis=1, how='all')
df.shape

(220, 25)

There is no empty columns

#### 6. Now we will split `Location` column into `State` and `City`.

In [104]:
df['Location'].head()

0    Santa Clara, CA
1       Dearborn, MI
2    Santa Clara, CA
3         Newton, MS
4             Remote
Name: Location, dtype: object

In [105]:
df['City'] = df['Location'].apply(lambda x: x.split(',')[0] if "," in x else x)
df['City'].head()

0    Santa Clara
1       Dearborn
2    Santa Clara
3         Newton
4         Remote
Name: City, dtype: object

In [106]:
df['State'] = df['Location'].apply(lambda x: x.split(',')[1] if "," in x else x)
df['State'].head()

0        CA
1        MI
2        CA
3        MS
4    Remote
Name: State, dtype: object

#### 7. Add job title seniority

In [108]:
df['Job_title'].unique()

array(['Data Engineer', 'Data Engineer - Terraform',
       'Snowflake Data Engineer', 'Data Engineer (MDM)',
       'AWS Data Engineer', 'DATA ENGINEER', 'Big Data Engineer',
       'Sr. Data Engineer', 'Data Engineer - Flink', 'Jr. Data Engineer',
       'Data Engineer - Remote', 'Data Engineer (L5)',
       'Software Data Engineer', 'GCP Data Engineer',
       'Senior Data Engineer', 'Azure Cloud Data Engineer',
       'GCP DATA ENGINEER', 'Data Test Engineer', 'Azure Data Engineer',
       'Senior Azure Data Bricks Engineer', 'Data Analytics Engineer',
       'Data Engineer (W2 and onsite)', 'Senior Big Data Engineer',
       'Data Engineer- Google Cloud',
       'Data Engineer (ETL & System Administration concentration)',
       'Data Engineer/Data Analyst', 'Data Engineer/Data Scientist',
       'ETL Data Engineer', 'Lead Data Engineer',
       'Sr. Data Engineer with Snowflake', 'Junior Data Engineer',
       'Senior Data Engineer - Remote', 'Data Engineer Level 3',
       'Clou

In [121]:
def get_seniority(job_title:str):

    seniority = {
        'Junior' : ["Jr.", "Junior"],
        'Mid' : ["Mid", "Middle"],
        'Senior': ["Sr.", "Senior"],
        'Lead': "Lead",
        'Principle' : "Principle"
    }
    
    if seniority['Junior'][0] in job_title or seniority['Junior'][1] in job_title :
        return "Junior"
    elif seniority['Mid'][0] in job_title or seniority['Mid'][1] in job_title :
        return "Mid"
    elif seniority['Senior'][0] in job_title or seniority['Senior'][1] in job_title :
        return "Senior"
    elif seniority['Lead'] in job_title:
        return "Lead"
    elif seniority['Principle'] in job_title:
        return "Principle"
    else:
        return np.nan
    
df['Seniority'] = df['Job_title'].apply(get_seniority)

del get_seniority

df['Seniority'].value_counts()

Senior    45
Junior     4
Lead       4
Name: Seniority, dtype: int64

Add non-standard seniority

In [None]:
# Data Engineer (L5) - Netflix senior - Senior
# Infoorigin Inc - Data Engineer Level 3 - Principle
# Technical Support Engineer (L5) - Data Platform, Big Data / Analytics - Senior
# Data Engineer IC4 - Lead
# JPMorgan Chase Bank, N.A., ETL Engineer/ Data Analyst - Software Engineer III - Mid
# Software Engineer III (AI, Data, Python) - Mid
# Data Engineer 925 - Senior