In [None]:
import pandas as pd

# Location of the raw Glassdoor sample (edit this if your file lives elsewhere)
raw_csv_path = '../data/glassdoor_sample_former_employees_100k.csv'
df = pd.read_csv(raw_csv_path)

# Quick sanity check: (rows, columns) of the frame that was just loaded
print('Shape: ', df.shape)


## Data Profiling

In [None]:
# Dataset overview: column names first, then one descending-sorted summary
# per heading (dtypes, distinct-value counts, missing-value counts).
print(df.columns)

profile_sections = (
    ('DATA TYPES: ', df.dtypes),
    ('UNIQUE VALUES: ', df.nunique()),
    ('MISSING VALUES: ', df.isna().sum()),
)
for heading, summary in profile_sections:
    print(heading)
    print(summary.sort_values(ascending=False))

# Last expression of the cell, so the notebook renders it as the cell output
df.describe()

## Data Cleaning

In [None]:
# Drop empty columns 'advice' and 'index'.
# They are selected by NAME rather than by position: the previous
# df.columns[[5, 18]] lookup dropped two *different* columns (or raised) when
# this cell was run a second time, because the positions shift after the first drop.
empty_cols = ['advice', 'index']
drop_cols = [name for name in empty_cols if name in df.columns]

# Make a skipped column visible instead of failing or passing silently
not_found = [name for name in empty_cols if name not in df.columns]
if not_found:
    print('Columns not found (already dropped?): ', not_found)

df = df.drop(columns=drop_cols)
# Shape after the column drop, i.e. before de-duplication
print('Before: ', df.shape)

# Drop duplicates (fully identical rows)
df = df.drop_duplicates()
print('After: ', df.shape)

In [None]:
# Map the raw column headers to descriptive snake_case names.
# Keys that are not present in the frame are simply ignored by rename().
column_renames = {
    # numeric ratings
    'rating': 'overall_rating',
    'Career Opportunities': 'career_opportunities_rating',
    'Compensation and Benefits': 'comp_benefits_rating',
    'Culture & Values': 'culture_values_rating',
    'Diversity & Inclusion': 'diversity_inclusion_rating',
    'Senior Management': 'senior_management_rating',
    'Work/Life Balance': 'work_life_rating',
    # review metadata
    'job': 'job_title',
    'status': 'tenure_at_employer',
    'date': 'review_date',
    'title': 'review_title',
    # opinion flags
    'Recommend': 'recommend',
    'CEO Approval': 'ceo_approval',
    'Business Outlook': 'business_outlook',
}

df = df.rename(columns=column_renames)


In [None]:
# Replace junk string tokens in the rating columns with 0
ratings_cols = [
    'overall_rating',
    'career_opportunities_rating',
    'comp_benefits_rating',
    'senior_management_rating',
    'work_life_rating',
    'culture_values_rating',
    'diversity_inclusion_rating',
]
weird_values = ['10u0eun', 'e0wqkp', 's4o194', '156pzk5']


def _print_unique_values(stage, columns):
    """Print the distinct values of each listed `df` column, prefixed with `stage`."""
    for name in columns:
        print(stage, f"{name}: {df[name].unique()}")


# Distinct values per rating column before the cleanup
_print_unique_values('Before: ', ratings_cols)

# Every listed token becomes 0; all other values are left untouched
df[ratings_cols] = df[ratings_cols].replace(to_replace=weird_values, value=0)

# Distinct values per rating column after the cleanup
_print_unique_values('After: ', ratings_cols)

In [None]:
# Change around data types

# Sub-rating columns whose missing values are stored as 0
cols_to_0 = ['diversity_inclusion_rating', 'culture_values_rating', 'work_life_rating', 'senior_management_rating', 'comp_benefits_rating', 'career_opportunities_rating']

# Parse the (possibly object-typed) ratings with pd.to_numeric instead of
# casting to str and stripping a trailing '.0' with a regex, then fill NaNs with 0.
# Unparseable text still raises ValueError, as the int() cast did before.
sub_ratings = df[cols_to_0].apply(pd.to_numeric).fillna(0)

# The old string -> int cast raised on fractional values such as '3.5';
# keep that strictness instead of letting astype('int') truncate silently.
fractional_cols = [name for name in cols_to_0 if (sub_ratings[name] % 1 != 0).any()]
if fractional_cols:
    raise ValueError(f'Non-integer ratings found in: {fractional_cols}')

df[cols_to_0] = sub_ratings.astype('int')

# Subset of cols_to_0 that was cast to int a second time in the previous
# version. The cast was a no-op (the columns are already int at this point);
# the name is kept in case a later cell refers to it.
obj_cols = ['career_opportunities_rating', 'comp_benefits_rating', 'senior_management_rating', 'work_life_rating']

# float to int (raises if the column still contains NaN)
df['overall_rating'] = df['overall_rating'].astype('int')

# object to string (pandas nullable 'string' dtype; missing values become <NA>)
cols_to_string = ['review_title', 'firm_link', 'job_title', 'pros', 'cons', 'ceo_approval', 'tenure_at_employer']
df[cols_to_string] = df[cols_to_string].astype('string')

# object to date; values that cannot be parsed become NaT rather than raising
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

# map business_outlook, recommend, ceo_approval values to labels
# (whole-value replacement: only cells equal to one of the keys are changed)
opinion_map_cols = ['recommend', 'ceo_approval', 'business_outlook']
new_vals = { 'v' : 'Positive', 'r': 'Mild', 'x': 'Negative', 'o' : 'No opinion'}

df[opinion_map_cols] = df[opinion_map_cols].replace(new_vals)

# Change from object to string
df[opinion_map_cols] = df[opinion_map_cols].astype('string')

In [None]:
# Free-text feedback columns
text_cols = ['pros', 'cons']

# Inspect feedback columns. A bare expression in the middle of a cell is never
# displayed, so print a small preview explicitly.
print(df[text_cols].head())

# Remove line breaks in pros & cons: each run of \r / \n becomes '. '.
# Cast with the nullable 'string' dtype rather than str: astype(str) converts
# missing values into literal text (e.g. '<NA>' or 'nan'), so isna() no longer
# finds them and the placeholder text ends up in the exported file.
for col in text_cols:
    df[col] = df[col].astype('string').str.replace(r'[\r\n]+', '. ', regex=True)


In [None]:
# Inspect tenure_at_employer. Bare expressions in the middle of a cell are not
# displayed, so the label frequencies are printed explicitly.
print(df['tenure_at_employer'].value_counts(dropna=False))

# Re-bucket tenure lengths (raw label -> coarser bucket)
tenure_buckets = {
    'Former Employee': 'Not provided',
    'Former Employee, less than 1 year': 'Under 1 year',
    'Former Employee, more than 1 year': '1-5 years',
    'Former Employee, more than 3 years': '1-5 years',
    'Former Employee, more than 5 years': '6-10 years',
    'Former Employee, more than 8 years': '6-10 years',
    'Former Employee, more than 10 years': 'Over 10 years',
}

# Cast to 'string' first so that re-running the cell (when the column is
# already categorical) does not call replace() on a categorical column.
tenure_labels = df['tenure_at_employer'].astype('string').replace(tenure_buckets)

# Set bucket sorting order
tenure_order = [
    'Under 1 year',
    '1-5 years',
    '6-10 years',
    'Over 10 years',
    'Not provided'
]

# pd.Categorical turns every value outside `categories` into NaN without any
# message, so report labels the mapping above did not cover before converting.
unmapped = tenure_labels[tenure_labels.notna() & ~tenure_labels.isin(tenure_order)].unique()
if len(unmapped):
    print('WARNING: unmapped tenure labels will become NaN: ', list(unmapped))

# Change to categorical to enforce sorting order
df['tenure_at_employer'] = pd.Categorical(
    tenure_labels,
    categories=tenure_order,
    ordered=True
)

In [None]:
# Inspect firm_link (sample size is capped so a frame with fewer than 10 rows does not raise)
print(df['firm_link'].sample(min(10, len(df))))

# Extract company name: the text between 'Reviews/' and the first following '-Reviews'
company_names = df['firm_link'].str.extract(r'Reviews/([^/]+?)-Reviews', expand=False)

# Overwrite the column only when the pattern matched at least one row.
# Once the links have been replaced by names nothing matches any more, so an
# unguarded re-run of this cell would wipe every company name to NaN.
if company_names.notna().any():
    # Removed dashes between words
    df['firm_link'] = company_names.str.replace('-', ' ', regex=False)

In [None]:
# Write the cleaned frame to disk without the row index
cleaned_csv_path = 'cleaned_glassdoor_sample_data.csv'
df.to_csv(cleaned_csv_path, index=False)

# Read the file back to confirm what was actually written
cleaned_sample = pd.read_csv(cleaned_csv_path)
print(cleaned_sample.shape)

# Last expression of the cell, so the first rows are rendered as the cell output
cleaned_sample.head()
