#### Load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Relative path: resolved against the notebook's current working directory.
DATA_FILE = 'cleaned_glassdoor_sample_data.csv'
df = pd.read_csv(DATA_FILE)

In [None]:
# Preview the first two rows of the loaded frame.
df.head(n=2)

In [None]:
# Show the column labels of df (the rating and text columns used below are selected by name).
df.columns

#### Rating columns review

In [None]:
# Rating columns to profile; this list is reused by the cells that follow.
ratings_cols = [
    'overall_rating',
    'career_opportunities_rating',
    'comp_benefits_rating',
    'senior_management_rating',
    'work_life_rating',
    'culture_values_rating',
    'diversity_inclusion_rating',
]

# Select the rating columns once, then inspect them two ways.
ratings_df = df[ratings_cols]
ratings_df.info()      # prints dtypes and non-null counts to stdout
ratings_df.describe()  # last expression: summary statistics rendered as the cell output

In [None]:
# Missing answers per rating column, as an absolute count and as a share of all rows.
unanswered = (
    df[ratings_cols]
    .isna()
    .sum()
    .rename('missing_count')
    .to_frame()
    .assign(missing_proportion=lambda counts: counts['missing_count'] / len(df))
)
unanswered

In [None]:
# Stacked bar chart: one bar per rating column, one segment per distinct rating value.

# Rows = rating columns, columns = distinct rating values (sorted), cells = occurrence counts.
rating_counts = (
    df[ratings_cols]
    .apply(pd.Series.value_counts)
    .sort_index()
    .transpose()
)

ax = rating_counts.plot.bar(stacked=True, figsize=(10, 6))
ax.set(
    xlabel='Rating Category',
    ylabel='Count of Ratings',
    title='Distribution of Ratings per Category',
)

# Tilt the category labels so the long column names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Rating', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the mean of each rating column, sorted ascending by mean.
mean_ratings = df[ratings_cols].mean().sort_values()

avg_ax = mean_ratings.plot.barh()
avg_ax.set_xlabel('Average Rating')
avg_ax.set_title('Average Rating by Category')
avg_ax.figure.tight_layout()
plt.show()


#### `pros` column review

In [None]:
# Peek at the first five entries of the 'pros' column.
df['pros'].head(n=5)

In [None]:
# Number of rows where 'pros' is missing.
pd.isna(df['pros']).sum()

In [None]:
# Summary statistics of the per-entry length of 'pros'.
pros_lengths = df['pros'].str.len()
pros_lengths.describe()

In [None]:
# Display the full text of the longest 'pros' entry.
longest_pros_label = df['pros'].str.len().idxmax()
df.loc[longest_pros_label, 'pros']

In [None]:
# Rows whose 'pros' text is shorter than 16 characters
# (missing values compare as False and are therefore excluded).
is_short_pros = df['pros'].str.len() < 16
df.loc[is_short_pros]