In [26]:
# imports used throughout the notebook
import pandas as pd
import re
from IPython.core.display import display, HTML

In [2]:
#read files
# One Glassdoor CSV per job role; every file is read and the frames are stacked
# into a single frame with a fresh 0..n-1 index.
roles = [
    'Business_Analyst',
    'Data_Analyst',
    'Data_Architect',
    'Data_Engineer',
    'Data_Scientist',
    'Machine_Learning_Engineer',
]
files = [f'glassdoor_{role}.csv' for role in roles]
dfs = list(map(pd.read_csv, files))
data_df = pd.concat(dfs, ignore_index=True)
print(data_df.head())

                      Job Title                           Salary Estimate  \
0              Business Analyst              $61K - $85K (Glassdoor est.)   
1      Business Systems Analyst  $61.00 - $71.00 Per Hour (Employer est.)   
2     Business/Database Analyst              $58K - $92K (Glassdoor est.)   
3                  Data Analyst                      $60K (Employer est.)   
4  Junior Business/Data Analyst              $51K - $78K (Glassdoor est.)   

                  Company Name        Location  \
0         BCVS group Inc.5.0 ★       Plano, TX   
1                          SRP      Irvine, CA   
2     JSR Tech Consulting5.0 ★      Newark, NJ   
3                 Wildcat3.5 ★    New York, NY   
4  The Kenific Group, Inc2.6 ★  Washington, DC   

                                     Job Description  Rating  \
0  Hello,\r\nWe are hiring for Business System An...     5.0   
1                                                 -1     5.0   
2  JSR has an immediate opening for their di

In [3]:
# (rows, columns) of the combined frame, before any cleaning
data_df.shape

(1080, 12)

In [4]:
# clean duplicate and salary
# Drop exact duplicate rows, then drop rows whose salary is the "-1" placeholder.
# The trailing .copy() makes data_clean an independent frame: the following cells
# add columns to it, and assigning into a filtered result without a copy raises
# pandas' SettingWithCopyWarning.
data_clean = (
    data_df
    .drop_duplicates()
    .loc[lambda frame: frame['Salary Estimate'] != "-1"]
    .copy()
)
data_clean.shape

(563, 12)

In [5]:
# 0/1 indicator columns: does the salary text contain the given marker
# (case-insensitive, literal match)?
salary_text_lower = data_clean['Salary Estimate'].str.lower()
for flag_column, marker in (
    ('hourly', 'per hour'),
    ('employer_est', 'employer est.'),
    ('glassdoor_est', 'glassdoor est.'),
):
    data_clean[flag_column] = (
        salary_text_lower.str.contains(marker, regex=False).astype('int64')
    )

In [6]:
def extract_min_max_salary(salary_string):
    """Parse a salary estimate string into a ``(min_salary, max_salary)`` pair.

    The 'K' and '$' characters are removed and every number left in the text is
    collected. Numbers are returned as written, with no unit conversion:
    "$61K - $85K (Glassdoor est.)" gives (61.0, 85.0) and
    "$61.00 - $71.00 Per Hour (Employer est.)" gives (61.0, 71.0).

    Args:
        salary_string: Raw salary text. Non-string values (e.g. NaN from a
            missing cell) are accepted and treated as unparseable.

    Returns:
        A tuple ``(min_salary, max_salary)`` of floats. Both values are equal
        when the text holds a single number. ``(None, None)`` is returned when
        the input is not a string or does not contain exactly one or two numbers.
    """
    # Missing cells arrive as float NaN / None; route them to the same
    # "unparseable" result as malformed text instead of failing on .replace().
    if not isinstance(salary_string, str):
        return None, None

    # Remove the 'K' (thousand) sign and '$' sign
    cleaned = salary_string.replace('K', '').replace('$', '')

    # Raw string so that \d and \. reach the regex engine untouched (a plain
    # string literal with these escapes triggers an invalid-escape warning).
    # Decimals are listed first so "61.00" is one match rather than "61" and "00".
    numbers = re.findall(r"\d+\.\d+|\d+", cleaned)

    if len(numbers) == 1:
        # A single figure: minimum and maximum are the same value.
        min_salary = max_salary = float(numbers[0])
    elif len(numbers) == 2:
        # A range: first number is the minimum, second is the maximum.
        min_salary, max_salary = map(float, numbers)
    else:
        # No number, or more numbers than a simple range: give up.
        min_salary = max_salary = None

    return min_salary, max_salary


In [7]:
# Parse every salary string once, then split the (min, max) pairs into two columns.
salary_bounds = data_clean['Salary Estimate'].map(extract_min_max_salary)
data_clean['min_salary'] = [low for low, _ in salary_bounds]
data_clean['max_salary'] = [high for _, high in salary_bounds]
data_clean.head()

Unnamed: 0,Job Title,Salary Estimate,Company Name,Location,Job Description,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_est,glassdoor_est,min_salary,max_salary
0,Business Analyst,$61K - $85K (Glassdoor est.),BCVS group Inc.5.0 ★,"Plano, TX","Hello,\r\nWe are hiring for Business System An...",5.0,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,0,0,1,61.0,85.0
1,Business Systems Analyst,$61.00 - $71.00 Per Hour (Employer est.),SRP,"Irvine, CA",-1,5.0,-1,-1,-1,-1,-1,-1,1,1,0,61.0,71.0
2,Business/Database Analyst,$58K - $92K (Glassdoor est.),JSR Tech Consulting5.0 ★,"Newark, NJ",JSR has an immediate opening for their direct ...,5.0,51 to 200 Employees,2015,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),0,0,1,58.0,92.0
3,Data Analyst,$60K (Employer est.),Wildcat3.5 ★,"New York, NY",Are you someone who loves crunching numbers an...,3.5,201 to 500 Employees,1972,Nonprofit Organization,Education & Training Services,Education,$5 to $25 million (USD),0,1,0,60.0,60.0
4,Junior Business/Data Analyst,$51K - $78K (Glassdoor est.),"The Kenific Group, Inc2.6 ★","Washington, DC","Company Overview:\r\nThe Kenific Group, Inc. (...",2.6,51 to 200 Employees,-1,Company - Private,Business Consulting,Management & Consulting,Less than $1 million (USD),0,0,1,51.0,78.0


In [8]:
#hourly to annually
# Salaried rows hold figures in $K (e.g. 61.0 for "$61K"), so hourly rates are
# multiplied by 2 to land on the same scale.
# NOTE(review): presumably 2 stands for ~2,000 working hours per year - confirm.
# NOTE: this cell rescales in place; running it a second time doubles the
# hourly rows again.
is_hourly = data_clean['hourly'] == 1
for bound in ('min_salary', 'max_salary'):
    data_clean.loc[is_hourly, bound] = data_clean.loc[is_hourly, bound] * 2
data_clean.head()

Unnamed: 0,Job Title,Salary Estimate,Company Name,Location,Job Description,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_est,glassdoor_est,min_salary,max_salary
0,Business Analyst,$61K - $85K (Glassdoor est.),BCVS group Inc.5.0 ★,"Plano, TX","Hello,\r\nWe are hiring for Business System An...",5.0,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,0,0,1,61.0,85.0
1,Business Systems Analyst,$61.00 - $71.00 Per Hour (Employer est.),SRP,"Irvine, CA",-1,5.0,-1,-1,-1,-1,-1,-1,1,1,0,122.0,142.0
2,Business/Database Analyst,$58K - $92K (Glassdoor est.),JSR Tech Consulting5.0 ★,"Newark, NJ",JSR has an immediate opening for their direct ...,5.0,51 to 200 Employees,2015,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),0,0,1,58.0,92.0
3,Data Analyst,$60K (Employer est.),Wildcat3.5 ★,"New York, NY",Are you someone who loves crunching numbers an...,3.5,201 to 500 Employees,1972,Nonprofit Organization,Education & Training Services,Education,$5 to $25 million (USD),0,1,0,60.0,60.0
4,Junior Business/Data Analyst,$51K - $78K (Glassdoor est.),"The Kenific Group, Inc2.6 ★","Washington, DC","Company Overview:\r\nThe Kenific Group, Inc. (...",2.6,51 to 200 Employees,-1,Company - Private,Business Consulting,Management & Consulting,Less than $1 million (USD),0,0,1,51.0,78.0


In [9]:
# Inspect the rescaled salary bounds of the hourly postings only
data_clean.loc[data_clean['hourly'] == 1, ['min_salary', 'max_salary']]

Unnamed: 0,min_salary,max_salary
1,122.00,142.00
7,124.00,124.00
16,46.00,50.00
20,60.00,60.00
21,100.00,110.00
...,...,...
649,140.00,160.00
724,91.92,137.90
778,52.40,102.78
796,52.40,102.78


In [10]:
# Midpoint of the salary range
data_clean['avg_salary'] = data_clean['min_salary'].add(data_clean['max_salary']).div(2)

In [11]:
#age of company
# Founded values below 1 (the -1 placeholder) are carried over unchanged;
# otherwise the age is counted back from the fixed reference year.
REFERENCE_YEAR = 2023
founded = data_clean['Founded']
data_clean['age'] = founded.where(founded < 1, REFERENCE_YEAR - founded)

In [12]:
# Helper that strips the rating suffix appended to company names
def clean_company_name(name):
    """Return the company name without a trailing rating such as "5.0 ★".

    A name is treated as carrying a rating when it ends with "★" and the three
    characters starting five from the end consist of digits once dots are
    removed (e.g. "5.0"). In that case the last five characters are dropped.
    Surrounding whitespace is stripped in either case.

    Args:
        name: Raw company name. Non-string values (e.g. NaN from a missing
            cell) are returned unchanged instead of raising on slicing.

    Returns:
        The cleaned name as a string, or the input itself when it is not a string.
    """
    # Missing cells arrive as float NaN / None and cannot be sliced.
    if not isinstance(name, str):
        return name

    rating_digits = name[-5:-2].replace('.', '')
    if rating_digits.isdigit() and name.endswith("★"):
        # Drop the 5-character suffix: 3 rating characters, separator, star.
        return name[:-5].strip()
    return name.strip()

In [13]:
#Company name text only
# Run every raw company name through the cleaning helper.
raw_company_names = data_clean["Company Name"]
data_clean['company_txt'] = raw_company_names.map(clean_company_name)

In [14]:
# Preview the first rows, now including avg_salary, age and company_txt
data_clean.head()

Unnamed: 0,Job Title,Salary Estimate,Company Name,Location,Job Description,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_est,glassdoor_est,min_salary,max_salary,avg_salary,age,company_txt
0,Business Analyst,$61K - $85K (Glassdoor est.),BCVS group Inc.5.0 ★,"Plano, TX","Hello,\r\nWe are hiring for Business System An...",5.0,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,0,0,1,61.0,85.0,73.0,-1,BCVS group Inc.
1,Business Systems Analyst,$61.00 - $71.00 Per Hour (Employer est.),SRP,"Irvine, CA",-1,5.0,-1,-1,-1,-1,-1,-1,1,1,0,122.0,142.0,132.0,-1,SRP
2,Business/Database Analyst,$58K - $92K (Glassdoor est.),JSR Tech Consulting5.0 ★,"Newark, NJ",JSR has an immediate opening for their direct ...,5.0,51 to 200 Employees,2015,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),0,0,1,58.0,92.0,75.0,8,JSR Tech Consulting
3,Data Analyst,$60K (Employer est.),Wildcat3.5 ★,"New York, NY",Are you someone who loves crunching numbers an...,3.5,201 to 500 Employees,1972,Nonprofit Organization,Education & Training Services,Education,$5 to $25 million (USD),0,1,0,60.0,60.0,60.0,51,Wildcat
4,Junior Business/Data Analyst,$51K - $78K (Glassdoor est.),"The Kenific Group, Inc2.6 ★","Washington, DC","Company Overview:\r\nThe Kenific Group, Inc. (...",2.6,51 to 200 Employees,-1,Company - Private,Business Consulting,Management & Consulting,Less than $1 million (USD),0,0,1,51.0,78.0,64.5,-1,"The Kenific Group, Inc"


In [15]:
# The state is the text right after the first comma of Location; a location
# without a comma is kept whole (and, as before, not stripped).
location_parts = data_clean['Location'].str.split(',')
data_clean['State'] = [
    parts[0] if len(parts) <= 1 else parts[1].strip()
    for parts in location_parts
]

In [16]:
# Number of postings per extracted State value
data_clean.State.value_counts()

CA               105
Remote            89
NY                52
TX                51
NJ                21
WA                20
FL                19
VA                16
MI                15
NC                14
MA                14
MD                13
GA                13
DC                12
AZ                11
OH                11
MO                10
PA                 8
MN                 8
KY                 7
United States      6
IL                 5
AR                 4
CO                 4
KS                 4
AL                 4
IN                 2
NE                 2
California         2
IA                 2
UT                 2
LA                 2
OR                 2
AK                 2
WI                 2
TN                 1
DE                 1
Missouri           1
Pennsylvania       1
SC                 1
ND                 1
ME                 1
HI                 1
NM                 1
Name: State, dtype: int64

In [21]:
# Two-column table of State values and how often each occurs
counts_data = (
    data_clean['State']
    .value_counts()
    .rename_axis('values')
    .reset_index(name='counts')
)
counts_data

Unnamed: 0,values,counts
0,CA,105
1,Remote,89
2,NY,52
3,TX,51
4,NJ,21
5,WA,20
6,FL,19
7,VA,16
8,MI,15
9,NC,14


In [22]:
# Fold spelled-out state names into their two-letter codes, and count
# 'United States' together with 'Remote'.
state_aliases = {
    'Missouri': 'MO',
    'California': 'CA',
    'Pennsylvania': 'PA',
    'United States': 'Remote',
}
data_clean['State'] = data_clean['State'].replace(state_aliases)

In [27]:
# One-row table of postings per state, rendered inside a horizontally
# scrollable box so the wide row does not overflow the page.
state_counts = data_clean['State'].value_counts().to_frame().T
scroll_box_open = '<div style="max-width: 800px; overflow-x: auto; border: 1px solid #ccc;">'
scroll_box_close = '</div>'
state_counts_html = state_counts.to_html(index=False)
display(HTML(scroll_box_open + state_counts_html + scroll_box_close))


CA,Remote,NY,TX,NJ,WA,FL,VA,MI,NC,MA,MD,GA,DC,AZ,MO,OH,PA,MN,KY,IL,CO,KS,AR,AL,UT,AK,NE,IA,WI,LA,IN,OR,TN,SC,ME,ND,HI,NM,DE
107,95,52,51,21,20,19,16,15,14,14,13,13,12,11,11,11,9,8,7,5,4,4,4,4,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1


In [28]:
# Column inventory: the original columns followed by the derived ones
data_clean.columns

Index(['Job Title', 'Salary Estimate', 'Company Name', 'Location',
       'Job Description', 'Rating', 'Size', 'Founded', 'Type of ownership',
       'Industry', 'Sector', 'Revenue', 'hourly', 'employer_est',
       'glassdoor_est', 'min_salary', 'max_salary', 'avg_salary', 'age',
       'company_txt', 'State'],
      dtype='object')

In [29]:
# Final preview of the cleaned frame before it is written out
data_clean.head()

Unnamed: 0,Job Title,Salary Estimate,Company Name,Location,Job Description,Rating,Size,Founded,Type of ownership,Industry,...,Revenue,hourly,employer_est,glassdoor_est,min_salary,max_salary,avg_salary,age,company_txt,State
0,Business Analyst,$61K - $85K (Glassdoor est.),BCVS group Inc.5.0 ★,"Plano, TX","Hello,\r\nWe are hiring for Business System An...",5.0,Unknown,-1,Company - Public,-1,...,Unknown / Non-Applicable,0,0,1,61.0,85.0,73.0,-1,BCVS group Inc.,TX
1,Business Systems Analyst,$61.00 - $71.00 Per Hour (Employer est.),SRP,"Irvine, CA",-1,5.0,-1,-1,-1,-1,...,-1,1,1,0,122.0,142.0,132.0,-1,SRP,CA
2,Business/Database Analyst,$58K - $92K (Glassdoor est.),JSR Tech Consulting5.0 ★,"Newark, NJ",JSR has an immediate opening for their direct ...,5.0,51 to 200 Employees,2015,Company - Private,Information Technology Support Services,...,$5 to $25 million (USD),0,0,1,58.0,92.0,75.0,8,JSR Tech Consulting,NJ
3,Data Analyst,$60K (Employer est.),Wildcat3.5 ★,"New York, NY",Are you someone who loves crunching numbers an...,3.5,201 to 500 Employees,1972,Nonprofit Organization,Education & Training Services,...,$5 to $25 million (USD),0,1,0,60.0,60.0,60.0,51,Wildcat,NY
4,Junior Business/Data Analyst,$51K - $78K (Glassdoor est.),"The Kenific Group, Inc2.6 ★","Washington, DC","Company Overview:\r\nThe Kenific Group, Inc. (...",2.6,51 to 200 Employees,-1,Company - Private,Business Consulting,...,Less than $1 million (USD),0,0,1,51.0,78.0,64.5,-1,"The Kenific Group, Inc",DC


In [30]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file
data_clean.to_csv('data_cleaned.csv', index = False)