#### Load data

In [221]:
import pandas as pd
import numpy as np

# Read the sample CSV of former-employee reviews and report its raw size
source_csv = 'glassdoor_sample_former_employees_100k.csv'
df = pd.read_csv(source_csv)
print('Shape before: ', df.shape)

# Preview the first two rows
df.head(n=2)

Shape before:  (100000, 19)


Unnamed: 0,rating,title,status,pros,cons,advice,Recommend,CEO Approval,Business Outlook,Career Opportunities,Compensation and Benefits,Senior Management,Work/Life Balance,Culture & Values,Diversity & Inclusion,firm_link,date,job,index
0,3.0,Good,Former Employee,Schedule accommodation\r\nFree food during shift,Fast food and management was rough,,o,o,o,3,3.0,3.0,5.0,3.0,4.0,Reviews/Chipotle-Reviews-E1370610.htm,"Jul 18, 2022",Cashier,
1,1.0,It's Only About Data,"Former Employee, less than 1 year",The teachers at the school are great and really support each other during stressful times.,"The school and the district has disappointing leadership who don't listen to the teachers and staff.\r\n\r\nCommunication is terrible. Emails are sent on weekends for work to be done before Monday. Instructions and expectations are unclear, and then employees are disciplined for not meeting expectations. Methods of communication do not show basic understanding of good leadership skills or even basic constructive communication.\r\n\r\nNo support from administration when there are problems or difficulties. Everything is about ""data"" that is neither accurate nor useful. Though the district and administrators at the school claim to care about putting students first, there is no part of their policies or actions that support students (or staff) above points of superfluous data collection or self protection.",,x,o,x,2,2.0,1.0,1.0,1.0,,Reviews/Manor-ISD-Reviews-E231745.htm,"Jan 5, 2021",6th Grade ELA Teacher,


In [222]:
# List each column's dtype, ordered by sorting on the dtype values
column_dtypes = df.dtypes
column_dtypes.sort_values()

rating                       float64
Diversity & Inclusion        float64
Culture & Values             float64
advice                       float64
index                        float64
pros                          object
Recommend                     object
CEO Approval                  object
Business Outlook              object
job                           object
Compensation and Benefits     object
Senior Management             object
Work/Life Balance             object
status                        object
title                         object
firm_link                     object
date                          object
cons                          object
Career Opportunities          object
dtype: object

#### Data Cleaning

##### Empty columns

In [223]:
# Show every column label as a plain Python list
list(df.columns)

['rating',
 'title',
 'status',
 'pros',
 'cons',
 'advice',
 'Recommend',
 'CEO Approval',
 'Business Outlook',
 'Career Opportunities',
 'Compensation and Benefits',
 'Senior Management',
 'Work/Life Balance',
 'Culture & Values',
 'Diversity & Inclusion',
 'firm_link',
 'date',
 'job',
 'index']

In [224]:
# Count missing values per column, largest first; a count equal to the number
# of rows marks a column that holds no data at all
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

advice                       100000
index                        100000
Diversity & Inclusion         68340
Culture & Values              24303
Senior Management             22361
Work/Life Balance             21357
Compensation and Benefits     21165
Career Opportunities          20800
title                           314
status                            0
rating                            0
CEO Approval                      0
Business Outlook                  0
cons                              0
Recommend                         0
pros                              0
firm_link                         0
date                              0
job                               0
dtype: int64

In [225]:
# Remove the 'advice' column, then preview the first two rows
df = df.drop('advice', axis=1)

df.head(n=2)

Unnamed: 0,rating,title,status,pros,cons,Recommend,CEO Approval,Business Outlook,Career Opportunities,Compensation and Benefits,Senior Management,Work/Life Balance,Culture & Values,Diversity & Inclusion,firm_link,date,job,index
0,3.0,Good,Former Employee,Schedule accommodation\r\nFree food during shift,Fast food and management was rough,o,o,o,3,3.0,3.0,5.0,3.0,4.0,Reviews/Chipotle-Reviews-E1370610.htm,"Jul 18, 2022",Cashier,
1,1.0,It's Only About Data,"Former Employee, less than 1 year",The teachers at the school are great and really support each other during stressful times.,"The school and the district has disappointing leadership who don't listen to the teachers and staff.\r\n\r\nCommunication is terrible. Emails are sent on weekends for work to be done before Monday. Instructions and expectations are unclear, and then employees are disciplined for not meeting expectations. Methods of communication do not show basic understanding of good leadership skills or even basic constructive communication.\r\n\r\nNo support from administration when there are problems or difficulties. Everything is about ""data"" that is neither accurate nor useful. Though the district and administrators at the school claim to care about putting students first, there is no part of their policies or actions that support students (or staff) above points of superfluous data collection or self protection.",x,o,x,2,2.0,1.0,1.0,1.0,,Reviews/Manor-ISD-Reviews-E231745.htm,"Jan 5, 2021",6th Grade ELA Teacher,


In [226]:
# Drop the remaining all-null column, 'index', by name.
# Selecting it by position (df.columns[17]) only hits 'index' when 'advice' has
# already been removed; with the original 19-column layout position 17 is 'job',
# which would be dropped silently instead.
df = df.drop(columns='index')
df.head(2)

Unnamed: 0,rating,title,status,pros,cons,Recommend,CEO Approval,Business Outlook,Career Opportunities,Compensation and Benefits,Senior Management,Work/Life Balance,Culture & Values,Diversity & Inclusion,firm_link,date,job
0,3.0,Good,Former Employee,Schedule accommodation\r\nFree food during shift,Fast food and management was rough,o,o,o,3,3.0,3.0,5.0,3.0,4.0,Reviews/Chipotle-Reviews-E1370610.htm,"Jul 18, 2022",Cashier
1,1.0,It's Only About Data,"Former Employee, less than 1 year",The teachers at the school are great and really support each other during stressful times.,"The school and the district has disappointing leadership who don't listen to the teachers and staff.\r\n\r\nCommunication is terrible. Emails are sent on weekends for work to be done before Monday. Instructions and expectations are unclear, and then employees are disciplined for not meeting expectations. Methods of communication do not show basic understanding of good leadership skills or even basic constructive communication.\r\n\r\nNo support from administration when there are problems or difficulties. Everything is about ""data"" that is neither accurate nor useful. Though the district and administrators at the school claim to care about putting students first, there is no part of their policies or actions that support students (or staff) above points of superfluous data collection or self protection.",x,o,x,2,2.0,1.0,1.0,1.0,,Reviews/Manor-ISD-Reviews-E231745.htm,"Jan 5, 2021",6th Grade ELA Teacher


##### Column names

In [227]:
# Standardise column labels to snake_case before any further profiling

print('Original columns: ', df.columns.to_list())

# Old label -> new label. Score columns receive a `_rating` suffix.
renamed_columns = {
    # score columns
    'rating': 'overall_rating',
    'Career Opportunities': 'career_opportunities_rating',
    'Compensation and Benefits': 'comp_benefits_rating',
    'Culture & Values': 'culture_values_rating',
    'Diversity & Inclusion': 'diversity_inclusion_rating',
    'Senior Management': 'senior_management_rating',
    'Work/Life Balance': 'work_life_rating',
    # opinion flags
    'Recommend': 'recommend',
    'CEO Approval': 'ceo_approval',
    'Business Outlook': 'business_outlook',
    # review metadata
    'job': 'job_title',
    'status': 'tenure_at_employer',
    'date': 'review_date',
    'title': 'review_title',
    'firm_link': 'company',
}
df = df.rename(columns=renamed_columns)


print('New columns: ', df.columns.to_list())

Original columns:  ['rating', 'title', 'status', 'pros', 'cons', 'Recommend', 'CEO Approval', 'Business Outlook', 'Career Opportunities', 'Compensation and Benefits', 'Senior Management', 'Work/Life Balance', 'Culture & Values', 'Diversity & Inclusion', 'firm_link', 'date', 'job']
New columns:  ['overall_rating', 'review_title', 'tenure_at_employer', 'pros', 'cons', 'recommend', 'ceo_approval', 'business_outlook', 'career_opportunities_rating', 'comp_benefits_rating', 'senior_management_rating', 'work_life_rating', 'culture_values_rating', 'diversity_inclusion_rating', 'company', 'review_date', 'job_title']


##### Duplicates

In [228]:
# Count rows that exactly repeat an earlier row across every column
duplicates = df.duplicated(keep='first').sum()
print('Duplicate rows: ', duplicates)

Duplicate rows:  440


In [229]:
# Keep only the first occurrence of each fully duplicated row
df = df.drop_duplicates(keep='first')
df.shape

(99560, 17)

- Inspect ratings and tenure columns

In [230]:
# Ratings columns, split by the dtype pandas assigned on load
float_ratings = [
    'overall_rating',
    'diversity_inclusion_rating',
    'culture_values_rating',
]
obj_ratings = [
    'work_life_rating',
    'senior_management_rating',
    'comp_benefits_rating',
    'career_opportunities_rating',
]

# Print each group for a quick look at its values
for rating_group in (float_ratings, obj_ratings):
    print(df[rating_group])


       overall_rating  diversity_inclusion_rating  culture_values_rating
0                 3.0                         4.0                    3.0
1                 1.0                         NaN                    1.0
2                 3.0                         NaN                    3.0
3                 2.0                         NaN                    NaN
4                 4.0                         NaN                    NaN
...               ...                         ...                    ...
99995             2.0                         NaN                    5.0
99996             4.0                         NaN                    5.0
99997             3.0                         NaN                    NaN
99998             3.0                         3.0                    3.0
99999             5.0                         5.0                    5.0

[99560 rows x 3 columns]
      work_life_rating senior_management_rating comp_benefits_rating  \
0                    5    

In [231]:
# Print the distinct values of every ratings column, float group first
labelled_groups = (
    ('FLOAT RATINGS: ', float_ratings),
    ('OBJECT RATINGS: ', obj_ratings),
)
for label, group_cols in labelled_groups:
    for col in group_cols:
        print(label, f"{col}: {df[col].unique()}")

FLOAT RATINGS:  overall_rating: [3. 1. 2. 4. 5.]
FLOAT RATINGS:  diversity_inclusion_rating: [ 4. nan  5.  3.  1.  2.]
FLOAT RATINGS:  culture_values_rating: [ 3.  1. nan  2.  4.  5.]
OBJECT RATINGS:  work_life_rating: ['5' '1.0' '2' nan '3' '1' '4' '4.0' '5.0' '2.0' '10u0eun' 's4o194' '3.0'
 '156pzk5' 'e0wqkp']
OBJECT RATINGS:  senior_management_rating: ['3' '1.0' '1' nan '5' '4' '5.0' '2.0' '2' '4.0' '3.0' '10u0eun' 'e0wqkp'
 '156pzk5' 's4o194']
OBJECT RATINGS:  comp_benefits_rating: ['3' '2.0' nan '5' '4' '1' '2' '5.0' '3.0' '4.0' '1.0' '10u0eun' 'e0wqkp'
 's4o194' '156pzk5']
OBJECT RATINGS:  career_opportunities_rating: ['3' '2' nan '5' '1' '4' '5.0' '4.0' '2.0' '1.0' '10u0eun' '3.0' 'e0wqkp'
 's4o194' '156pzk5']


In [232]:
# Look at the tenure column: its dtype and how often each label occurs
tenure_raw = df['tenure_at_employer']
print('Dtype: ', tenure_raw.dtypes)

tenure_raw.value_counts().sort_index()

Dtype:  object


tenure_at_employer
Former Employee                        39470
Former Employee, less than 1 year      16116
Former Employee, more than 1 year      20339
Former Employee, more than 10 years     3006
Former Employee, more than 3 years     11902
Former Employee, more than 5 years      6036
Former Employee, more than 8 years      2691
Name: count, dtype: int64

- Convert all text columns to string dtype

In [233]:
# Cast the text and coded columns from object to pandas' string dtype
all_text_cols = [
    'review_title', 'company', 'job_title',
    'pros', 'cons',
    'ceo_approval', 'tenure_at_employer', 'recommend', 'business_outlook',
]

for text_col in all_text_cols:
    df[text_col] = df[text_col].astype('string')

# Confirm the new dtype of each converted column
df.dtypes.loc[all_text_cols]

review_title          string[python]
company               string[python]
job_title             string[python]
pros                  string[python]
cons                  string[python]
ceo_approval          string[python]
tenure_at_employer    string[python]
recommend             string[python]
business_outlook      string[python]
dtype: object

- Ratings columns

In [234]:
# Convert the object-typed ratings columns to floats.
# The raw file mixes numeric strings ('5', '1.0') with junk tokens such as
# '10u0eun'. pd.to_numeric(errors='coerce') turns every unparseable value into
# NaN, so the conversion no longer fails on a junk token that is missing from
# the hand-maintained list below.
obj_ratings = ['work_life_rating', 'senior_management_rating', 'comp_benefits_rating', 'career_opportunities_rating']

# Junk tokens already known from profiling; used only to decide what to report
weird_values = ['10u0eun', 'e0wqkp','s4o194', '156pzk5']

for col in obj_ratings:
    as_number = pd.to_numeric(df[col], errors='coerce')

    # Values that were present in the raw column but could not be parsed
    rejected = df.loc[as_number.isna() & df[col].notna(), col]
    unexpected = sorted(set(map(str, rejected)) - set(weird_values))
    if unexpected:
        # Surface new junk instead of discarding it silently
        print(f"{col}: coerced unexpected values to NaN: {unexpected}")

    df[col] = as_number.astype('float')

# Check columns unique values
for col in obj_ratings:
    print('Unique values:  ', f"{col}: {df[col].unique()}")

df.info()

## Keep ratings NaNs!!  These indicate no response, not actually 0
# 'overall_rating' has no NaNs because mandatory

Unique values:   work_life_rating: [ 5.  1.  2. nan  3.  4.]
Unique values:   senior_management_rating: [ 3.  1. nan  5.  4.  2.]
Unique values:   comp_benefits_rating: [ 3.  2. nan  5.  4.  1.]
Unique values:   career_opportunities_rating: [ 3.  2. nan  5.  1.  4.]
<class 'pandas.core.frame.DataFrame'>
Index: 99560 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   overall_rating               99560 non-null  float64
 1   review_title                 99250 non-null  string 
 2   tenure_at_employer           99560 non-null  string 
 3   pros                         99560 non-null  string 
 4   cons                         99560 non-null  string 
 5   recommend                    99560 non-null  string 
 6   ceo_approval                 99560 non-null  string 
 7   business_outlook             99560 non-null  string 
 8   career_opportunities_rating  78154 non-null 

- review_date column

In [235]:
# Parse the review date text into datetimes; entries that cannot be parsed
# become NaT because of errors='coerce'
parsed_dates = pd.to_datetime(df['review_date'], errors='coerce')
df['review_date'] = parsed_dates

# Show the resulting dtype of the column
df['review_date'].dtype

dtype('<M8[ns]')

- review_title

In [236]:
# Locate missing review titles and report how many there are
missing_title = df['review_title'].isna()
print('Null before: ', missing_title.sum())

# Put the placeholder text into exactly those rows
df.loc[missing_title, 'review_title'] = 'Not provided'

# Re-count to confirm nothing is missing any more
print('Nulls after: ', df['review_title'].isna().sum())

Null before:  310
Nulls after:  0


- business_outlook, recommend, ceo_approval columns

In [237]:
# Translate the single-letter codes in the opinion columns into readable labels
opinion_map_cols = ['business_outlook', 'recommend', 'ceo_approval']
opinion_map = { 'v' : 'Positive', 'r': 'Mild', 'x': 'Negative', 'o' : 'No opinion'}

# replace() swaps only whole values that match a code and leaves every other
# value as it is. Series.map(dict) instead turns each value without a dict
# entry into NaN, so running the cell a second time (when the columns already
# hold labels) blanked all three columns, and any unlisted code was lost
# without notice.
df[opinion_map_cols] = df[opinion_map_cols].replace(opinion_map)


# Verify changed values
df[opinion_map_cols]

Unnamed: 0,business_outlook,recommend,ceo_approval
0,No opinion,No opinion,No opinion
1,Negative,Negative,No opinion
2,Mild,Positive,Mild
3,No opinion,No opinion,No opinion
4,No opinion,No opinion,No opinion
...,...,...,...
99995,Negative,Negative,Mild
99996,Mild,Positive,No opinion
99997,No opinion,No opinion,No opinion
99998,Positive,Negative,Mild


- tenure_at_employer column

In [238]:
# Collapse the raw status labels into broader tenure buckets
tenure_buckets = {
    'Former Employee' : 'Not provided',
    'Former Employee, less than 1 year' : 'Under 1 year',
    'Former Employee, more than 1 year' : '1-5 years',
    'Former Employee, more than 3 years' : '1-5 years',
    'Former Employee, more than 5 years' : '6-10 years',
    'Former Employee, more than 8 years' : '6-10 years',
    'Former Employee, more than 10 years' : 'Over 10 years',
}
df['tenure_at_employer'] = df['tenure_at_employer'].replace(tenure_buckets)

# Order in which the buckets should sort; 'Not provided' goes last
tenure_order = [
    'Under 1 year',
    '1-5 years',
    '6-10 years',
    'Over 10 years',
    'Not provided'
]

# Store the column as an ordered categorical so sorting follows tenure_order
tenure_categorical = pd.Categorical(
    df['tenure_at_employer'],
    categories=tenure_order,
    ordered=True,
)
df['tenure_at_employer'] = tenure_categorical

# Bucket counts, listed in category order
df['tenure_at_employer'].value_counts().sort_index()


tenure_at_employer
Under 1 year     16116
1-5 years        32241
6-10 years        8727
Over 10 years     3006
Not provided     39470
Name: count, dtype: int64

- company column

In [239]:
# Pull the company slug out of links shaped like 'Reviews/<slug>-Reviews-...'
company_slug = df['company'].str.extract(r'Reviews/([^/]+?)-Reviews', expand=False)

# Turn the slug's dashes into spaces, then label whatever is still missing.
# The fill has to come last: the text 'Not provided' does not match the
# pattern above, so filling before the extract just turns it back into a
# missing value. Filling afterwards also covers links that do not match.
df['company'] = (
    company_slug
    .str.replace('-', ' ', regex=False)
    .fillna('Not provided')
)

# Inspect company
df['company']

0                                 Chipotle
1                                Manor ISD
2                          CA Technologies
3        Palm Beach County School District
4                            Goldman Sachs
                       ...                
99995                                   RH
99996                             GameStop
99997                             Chipotle
99998                                H E B
99999                 Texas A&M University
Name: company, Length: 99560, dtype: string

- pros column

In [240]:
# Normalise the free-text 'pros' column: lower-case, no punctuation,
# single spaces, no leading/trailing whitespace.
df['pros'] = (
    df['pros']
    .str.lower()
    # remove punctuation (anything that is not a word character or whitespace);
    # nothing is inserted in its place, so "2-3" becomes "23"
    .str.replace(r'[^\w\s]', '', regex=True)
    # collapse every whitespace run, line breaks included, into one space
    .str.replace(r'\s+', ' ', regex=True)
    # strip last: text that began or ended with punctuation (e.g. a leading
    # "- " bullet) is left with an edge space by the two replacements above,
    # which an earlier strip cannot remove
    .str.strip()
)


In [241]:
# Spot-check three cleaned values; the fixed seed keeps the displayed rows
# the same on every run
df['pros'].sample(3, random_state=42)

62888           learning zone for a fresh graduate
49342    good coworkers good work work environment
25115     the culture was great and very laid back
Name: pros, dtype: string

- cons column

In [242]:
# Normalise the free-text 'cons' column: lower-case, no punctuation,
# single spaces, no leading/trailing whitespace.
df['cons'] = (
    df['cons']
    .str.lower()
    # remove punctuation (anything that is not a word character or whitespace);
    # nothing is inserted in its place, so "2-3" becomes "23"
    .str.replace(r'[^\w\s]', '', regex=True)
    # collapse every whitespace run, line breaks included, into one space
    .str.replace(r'\s+', ' ', regex=True)
    # strip last: text that began or ended with punctuation (e.g. a leading
    # "- " bullet) is left with an edge space by the two replacements above,
    # which an earlier strip cannot remove
    .str.strip()
)

In [243]:
# Spot-check three cleaned values; the fixed seed keeps the displayed rows
# the same on every run
df['cons'].sample(3, random_state=42)

34458    where do i begin first i question the few good recent reviews published on this site as being written by anyone other than management trying to get their score increased i dont know of anyone in any position across the board who is happy there right now mercury used to be a good company to work for in the past 23 years its gone downhill fast people are over worked super stressed unappreciated underpaid and literally everyone hates being there upper management couldnt care less about their employees the raises are laughable and the bonuses are less and less year after year ive seen 3 major layoffs and only stayed as long as i did hoping to get laid off myself so i could get a nice severance and then be on my way hc is a joke implementing huddle cards and high 5s as a way to reward adultsits offensive i send my condolences to george joseph in regards to the way a company he worked hard to build has been run into the ground
93778                                                   

#### Cleaned dataset

In [244]:
# Write the cleaned frame to disk, leaving the row index out of the file
cleaned_csv = 'cleaned_glassdoor_sample_data.csv'
df.to_csv(cleaned_csv, index=False)

# Read the file back, then show its shape and first two rows
cleaned_sample = pd.read_csv(cleaned_csv)
print(cleaned_sample.shape)
cleaned_sample.head(n=2)

(99560, 17)


Unnamed: 0,overall_rating,review_title,tenure_at_employer,pros,cons,recommend,ceo_approval,business_outlook,career_opportunities_rating,comp_benefits_rating,senior_management_rating,work_life_rating,culture_values_rating,diversity_inclusion_rating,company,review_date,job_title
0,3.0,Good,Not provided,schedule accommodation free food during shift,fast food and management was rough,No opinion,No opinion,No opinion,3.0,3.0,3.0,5.0,3.0,4.0,Chipotle,2022-07-18,Cashier
1,1.0,It's Only About Data,Under 1 year,the teachers at the school are great and really support each other during stressful times,the school and the district has disappointing leadership who dont listen to the teachers and staff communication is terrible emails are sent on weekends for work to be done before monday instructions and expectations are unclear and then employees are disciplined for not meeting expectations methods of communication do not show basic understanding of good leadership skills or even basic constructive communication no support from administration when there are problems or difficulties everything is about data that is neither accurate nor useful though the district and administrators at the school claim to care about putting students first there is no part of their policies or actions that support students or staff above points of superfluous data collection or self protection,Negative,No opinion,Negative,2.0,2.0,1.0,1.0,1.0,,Manor ISD,2021-01-05,6th Grade ELA Teacher
