In [2]:
import pandas as pd

# Location of the raw reviews CSV, relative to the notebook (edit if the file lives elsewhere)
raw_csv_path = '../data/glassdoor_sample_former_employees_100k.csv'
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape before any cleaning is applied
print('Before: ', df.shape)

# Show the first few rows as a quick sanity check of the load
df.head(n=5)

Before:  (100000, 19)


Unnamed: 0,rating,title,status,pros,cons,advice,Recommend,CEO Approval,Business Outlook,Career Opportunities,Compensation and Benefits,Senior Management,Work/Life Balance,Culture & Values,Diversity & Inclusion,firm_link,date,job,index
0,3.0,Good,Former Employee,Schedule accommodation\r\nFree food during shift,Fast food and management was rough,,o,o,o,3.0,3.0,3.0,5.0,3.0,4.0,Reviews/Chipotle-Reviews-E1370610.htm,"Jul 18, 2022",Cashier,
1,1.0,It's Only About Data,"Former Employee, less than 1 year",The teachers at the school are great and reall...,The school and the district has disappointing ...,,x,o,x,2.0,2.0,1.0,1.0,1.0,,Reviews/Manor-ISD-Reviews-E231745.htm,"Jan 5, 2021",6th Grade ELA Teacher,
2,3.0,Bad HR experience,Former Employee,Good salaries\r\nGood benefits\r\nGood place r...,"Really bad HR in Latam. People not prepared, a...",,v,r,r,2.0,3.0,1.0,2.0,3.0,,https://www.glassdoor.com/Reviews/CA-Technolog...,"Mar 18, 2018",General Manager,
3,2.0,Do not support older teacher or teachers who d...,"Former Employee, more than 8 years",Excellent location and benefit package,Management tend to be bullies and racists,,o,o,o,,,,,,,Reviews/Palm-Beach-County-School-District-Revi...,"Mar 25, 2017",Teaching,
4,4.0,Good place to develop career,Former Employee,Supportive management for career development,Culture is pushy. Fast-paced just like the ind...,,o,o,o,,,,,,,https://www.glassdoor.com/Reviews/Goldman-Sach...,"Feb 28, 2023",Vice President,


## Data Profiling

In [3]:
# Profile the whole dataset: column labels first, then one sorted summary per heading
print(df.columns)

# Each entry pairs the printed heading with the per-column Series it introduces
profile_sections = (
    ('DATA TYPES: ', df.dtypes),
    ('UNIQUE VALUES: ', df.nunique()),
    ('MISSING VALUES: ', df.isna().sum()),
)
for heading, per_column in profile_sections:
    print(heading)
    # Largest values first so the columns needing the most attention appear at the top
    print(per_column.sort_values(ascending=False))

# Summary statistics for the numeric columns, shown as the cell's output
df.describe()

Index(['rating', 'title', 'status', 'pros', 'cons', 'advice', 'Recommend',
       'CEO Approval', 'Business Outlook', 'Career Opportunities',
       'Compensation and Benefits', 'Senior Management', 'Work/Life Balance',
       'Culture & Values', 'Diversity & Inclusion', 'firm_link', 'date', 'job',
       'index'],
      dtype='object')
DATA TYPES: 
Career Opportunities          object
firm_link                     object
status                        object
pros                          object
cons                          object
job                           object
Recommend                     object
CEO Approval                  object
Business Outlook              object
title                         object
Compensation and Benefits     object
Senior Management             object
Work/Life Balance             object
date                          object
rating                       float64
Diversity & Inclusion        float64
Culture & Values             float64
advice             

Unnamed: 0,rating,advice,Culture & Values,Diversity & Inclusion,index
count,100000.0,0.0,75697.0,31660.0,0.0
mean,3.22606,,3.111312,3.553411,
std,1.300659,,1.44904,1.388412,
min,1.0,,1.0,1.0,
25%,2.0,,2.0,3.0,
50%,3.0,,3.0,4.0,
75%,4.0,,4.0,5.0,
max,5.0,,5.0,5.0,


## Data Cleaning

In [4]:
# Drop the 'advice' and 'index' columns, which hold no data (describe() reports a
# count of 0 for both).
# They are selected by name rather than by position so the cell keeps working if the
# column order of the CSV changes. errors='ignore' makes the cell safe to re-run after
# the columns are already gone; the previous positional lookup (positions 5 and 18)
# raised IndexError on a second run because the 17-column frame has no position 18.
drop_cols = ['advice', 'index']
df = df.drop(columns=drop_cols, errors='ignore')

# Drop rows that are exact duplicates across all remaining columns (first occurrence is kept)
df = df.drop_duplicates()
print('After: ', df.shape)

After:  (99560, 17)


In [5]:
# Standardise the column labels to snake_case
print('PREVIOUS COLUMNS: ', df.columns)

# Original header -> new label. Columns not listed here (pros, cons, firm_link)
# keep their existing names.
column_renames = {
    # rating scores: every one gets a `_rating` suffix
    'rating': 'overall_rating',
    'Career Opportunities': 'career_opportunities_rating',
    'Compensation and Benefits': 'comp_benefits_rating',
    'Senior Management': 'senior_management_rating',
    'Work/Life Balance': 'work_life_rating',
    'Culture & Values': 'culture_values_rating',
    'Diversity & Inclusion': 'diversity_inclusion_rating',
    # reviewer opinion flags
    'Recommend': 'recommend',
    'CEO Approval': 'ceo_approval',
    'Business Outlook': 'business_outlook',
    # review / reviewer metadata
    'title': 'review_title',
    'status': 'tenure_at_employer',
    'date': 'review_date',
    'job': 'job_title',
}
df.rename(columns=column_renames, inplace=True)

print('NEW COLUMNS: ')
df.columns

PREVIOUS COLUMNS:  Index(['rating', 'title', 'status', 'pros', 'cons', 'Recommend',
       'CEO Approval', 'Business Outlook', 'Career Opportunities',
       'Compensation and Benefits', 'Senior Management', 'Work/Life Balance',
       'Culture & Values', 'Diversity & Inclusion', 'firm_link', 'date',
       'job'],
      dtype='object')
NEW COLUMNS: 


Index(['overall_rating', 'review_title', 'tenure_at_employer', 'pros', 'cons',
       'recommend', 'ceo_approval', 'business_outlook',
       'career_opportunities_rating', 'comp_benefits_rating',
       'senior_management_rating', 'work_life_rating', 'culture_values_rating',
       'diversity_inclusion_rating', 'firm_link', 'review_date', 'job_title'],
      dtype='object')

In [6]:
# Fill the gaps: 0 for missing rating scores, 'Untitled' for missing review titles
ratings_cols = [
    'overall_rating',
    'career_opportunities_rating',
    'comp_benefits_rating',
    'senior_management_rating',
    'work_life_rating',
    'culture_values_rating',
    'diversity_inclusion_rating',
]


def _null_counts(frame):
    """Return the number of missing values per column, largest count first."""
    return frame.isna().sum().sort_values(ascending=False)


print('BEFORE: ')
print(_null_counts(df))

# Missing rating scores become 0.
# NOTE(review): the numeric ratings summarised by describe() range from 1 to 5, so 0
# acts as a "not rated" marker and will lower any mean computed over these columns;
# confirm downstream analysis accounts for that.
df[ratings_cols] = df[ratings_cols].fillna(0)

# Reviews without a title get the placeholder 'Untitled'
df['review_title'] = df['review_title'].fillna('Untitled')

print('AFTER: ')
_null_counts(df)

BEFORE: 
diversity_inclusion_rating     68128
culture_values_rating          24220
senior_management_rating       22279
work_life_rating               21282
comp_benefits_rating           21090
career_opportunities_rating    20727
review_title                     310
overall_rating                     0
pros                               0
business_outlook                   0
ceo_approval                       0
recommend                          0
cons                               0
tenure_at_employer                 0
firm_link                          0
review_date                        0
job_title                          0
dtype: int64
AFTER: 


overall_rating                 0
review_title                   0
tenure_at_employer             0
pros                           0
cons                           0
recommend                      0
ceo_approval                   0
business_outlook               0
career_opportunities_rating    0
comp_benefits_rating           0
senior_management_rating       0
work_life_rating               0
culture_values_rating          0
diversity_inclusion_rating     0
firm_link                      0
review_date                    0
job_title                      0
dtype: int64

In [7]:
# Inspect the current dtypes (sorted) to see which columns still need converting
df.dtypes.sort_values(ascending=True)

# Planned conversions, not implemented yet:
# TODO: convert review_date to a date type
# TODO: convert all ratings columns to integer
# TODO: convert comment columns to string

overall_rating                 float64
diversity_inclusion_rating     float64
culture_values_rating          float64
firm_link                       object
work_life_rating                object
senior_management_rating        object
comp_benefits_rating            object
review_date                     object
career_opportunities_rating     object
ceo_approval                    object
recommend                       object
cons                            object
pros                            object
tenure_at_employer              object
review_title                    object
business_outlook                object
job_title                       object
dtype: object