In [11]:
# read files and combine
import pandas as pd
import re

In [5]:
#read files
files = ['glassdoor_Business_Analyst.csv', 'glassdoor_Data_Analyst.csv', 'glassdoor_Data_Architect.csv', 'glassdoor_Data_Engineer.csv', 'glassdoor_Data_Scientist.csv', 'glassdoor_Machine_Learning_Engineer.csv']
dfs = [pd.read_csv(file) for file in files]
data_df = pd.concat(dfs, ignore_index=True)
print(data_df.head())

                      Job Title                           Salary Estimate  \
0              Business Analyst              $61K - $85K (Glassdoor est.)   
1      Business Systems Analyst  $61.00 - $71.00 Per Hour (Employer est.)   
2     Business/Database Analyst              $58K - $92K (Glassdoor est.)   
3                  Data Analyst                      $60K (Employer est.)   
4  Junior Business/Data Analyst              $51K - $78K (Glassdoor est.)   

                  Company Name        Location  \
0         BCVS group Inc.5.0 ★       Plano, TX   
1                          SRP      Irvine, CA   
2     JSR Tech Consulting5.0 ★      Newark, NJ   
3                 Wildcat3.5 ★    New York, NY   
4  The Kenific Group, Inc2.6 ★  Washington, DC   

                                     Job Description  Rating  \
0  Hello,\r\nWe are hiring for Business System An...     5.0   
1                                                 -1     5.0   
2  JSR has an immediate opening for their di

In [8]:
data_df.shape

(1080, 12)

In [9]:
# clean duplicate and salary
data_clean = data_df.drop_duplicates()
data_clean = data_clean[data_clean['Salary Estimate'] != "-1"]
data_clean.shape

(563, 12)

In [10]:
data_clean['hourly'] = data_clean['Salary Estimate'].apply(lambda x: 1 if 'per hour' in x.lower() else 0)
data_clean['employer_est'] = data_clean['Salary Estimate'].apply(lambda x: 1 if 'employer est.' in x.lower() else 0)
data_clean['glassdoor_est'] = data_clean['Salary Estimate'].apply(lambda x: 1 if 'glassdoor est.' in x.lower() else 0)

In [12]:
def extract_min_max_salary(salary_string):
    # Remove the 'K' (thousand) sign and '$' sign
    salary_string = salary_string.replace('K', '').replace('$', '')

    # Extract the numbers (minimum and maximum salary)
    salary_range = re.findall("\d+\.\d+|\d+", salary_string)

    # If there is no range (only one number), then the minimum and maximum salary are the same
    if len(salary_range) == 1:
        min_salary = max_salary = float(salary_range[0])

    # If there is a range (two numbers), the first is the minimum salary and the second is the maximum salary
    elif len(salary_range) == 2:
        min_salary, max_salary = map(float, salary_range)

    else:
        min_salary = max_salary = None

    return min_salary, max_salary


In [13]:
data_clean['Min Salary'], data_clean['Max Salary'] = zip(*data_clean['Salary Estimate'].map(extract_min_max_salary))
data_clean.head()

Unnamed: 0,Job Title,Salary Estimate,Company Name,Location,Job Description,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_est,glassdoor_est,Min Salary,Max Salary
0,Business Analyst,$61K - $85K (Glassdoor est.),BCVS group Inc.5.0 ★,"Plano, TX","Hello,\r\nWe are hiring for Business System An...",5.0,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,0,0,1,61.0,85.0
1,Business Systems Analyst,$61.00 - $71.00 Per Hour (Employer est.),SRP,"Irvine, CA",-1,5.0,-1,-1,-1,-1,-1,-1,1,1,0,61.0,71.0
2,Business/Database Analyst,$58K - $92K (Glassdoor est.),JSR Tech Consulting5.0 ★,"Newark, NJ",JSR has an immediate opening for their direct ...,5.0,51 to 200 Employees,2015,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),0,0,1,58.0,92.0
3,Data Analyst,$60K (Employer est.),Wildcat3.5 ★,"New York, NY",Are you someone who loves crunching numbers an...,3.5,201 to 500 Employees,1972,Nonprofit Organization,Education & Training Services,Education,$5 to $25 million (USD),0,1,0,60.0,60.0
4,Junior Business/Data Analyst,$51K - $78K (Glassdoor est.),"The Kenific Group, Inc2.6 ★","Washington, DC","Company Overview:\r\nThe Kenific Group, Inc. (...",2.6,51 to 200 Employees,-1,Company - Private,Business Consulting,Management & Consulting,Less than $1 million (USD),0,0,1,51.0,78.0


In [14]:
#age of company 
data_clean['age'] = data_clean.Founded.apply(lambda x: x if x <1 else 2023 - x)

In [15]:
#Company name text only
data_clean['company_txt'] = data_clean.apply(lambda x: x['Company Name'] if x['Rating'] <0 else x['Company Name'][:-5], axis = 1)

In [16]:
data_clean.head()

Unnamed: 0,Job Title,Salary Estimate,Company Name,Location,Job Description,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_est,glassdoor_est,Min Salary,Max Salary,age,company_txt
0,Business Analyst,$61K - $85K (Glassdoor est.),BCVS group Inc.5.0 ★,"Plano, TX","Hello,\r\nWe are hiring for Business System An...",5.0,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,0,0,1,61.0,85.0,-1,BCVS group Inc.
1,Business Systems Analyst,$61.00 - $71.00 Per Hour (Employer est.),SRP,"Irvine, CA",-1,5.0,-1,-1,-1,-1,-1,-1,1,1,0,61.0,71.0,-1,
2,Business/Database Analyst,$58K - $92K (Glassdoor est.),JSR Tech Consulting5.0 ★,"Newark, NJ",JSR has an immediate opening for their direct ...,5.0,51 to 200 Employees,2015,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),0,0,1,58.0,92.0,8,JSR Tech Consulting
3,Data Analyst,$60K (Employer est.),Wildcat3.5 ★,"New York, NY",Are you someone who loves crunching numbers an...,3.5,201 to 500 Employees,1972,Nonprofit Organization,Education & Training Services,Education,$5 to $25 million (USD),0,1,0,60.0,60.0,51,Wildcat
4,Junior Business/Data Analyst,$51K - $78K (Glassdoor est.),"The Kenific Group, Inc2.6 ★","Washington, DC","Company Overview:\r\nThe Kenific Group, Inc. (...",2.6,51 to 200 Employees,-1,Company - Private,Business Consulting,Management & Consulting,Less than $1 million (USD),0,0,1,51.0,78.0,-1,"The Kenific Group, Inc"


In [24]:
(data_clean.company_txt.str.len() == 0).sum()

2

In [25]:
data_clean.columns

Index(['Job Title', 'Salary Estimate', 'Company Name', 'Location',
       'Job Description', 'Rating', 'Size', 'Founded', 'Type of ownership',
       'Industry', 'Sector', 'Revenue', 'hourly', 'employer_est',
       'glassdoor_est', 'Min Salary', 'Max Salary', 'age', 'company_txt'],
      dtype='object')

In [26]:
data_clean.to_csv('data_cleaned.csv', index = False)