Author: Elaine da Silva
Date: 11/15/2024
Assignment 5: Exploring the Glassdoor Job Data Science 2024 dataset, downloaded from Kaggle.
Course: DBAS3018

Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import math 
import re
import seaborn as sns
#import matplotlib.pyplot as plt
#import plotly.express as px

Reading the CSV file and checking some information about it

In [2]:
# Load the raw Glassdoor export; pandas' default NA markers stay enabled
df = pd.read_csv(
    'glassdoorJobs_Input.csv',
    keep_default_na=True,
)

In [3]:
# Show the dtype pandas inferred for every column
column_types = df.dtypes
print(column_types)

company                         object
job_title                       object
company_rating                 float64
job_description                 object
location                        object
salary_avg_estimate             object
salary_estimate_payperiod       object
company_size                    object
company_founded                 object
employment_type                 object
industry                        object
sector                          object
revenue                         object
career_opportunities_rating    float64
comp_and_benefits_rating       float64
culture_and_values_rating      float64
senior_management_rating       float64
work_life_balance_rating       float64
dtype: object


In [None]:
# Summarise the raw DataFrame: columns, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics of the raw DataFrame (shown as the cell output)
raw_summary = df.describe()
raw_summary

Function to convert Indian-formatted currency strings (e.g. ₹4,50,000) into plain numeric values

In [4]:
def convert_to_american_format(indian_currency):
    """Turn an Indian-formatted rupee amount into a plain integer string.

    Despite the name, no exchange rate and no thousands separators are
    applied: the function only strips the "₹" symbol and the grouping
    commas, then truncates the amount to a whole number.

    Args:
        indian_currency: A number, or text such as "₹4,50,000". None, NaN,
            blank text and the "--" placeholder are treated as missing.

    Returns:
        str: The integer part of the amount as a digit string (for example
        "450000"), or "0" when the value is missing.

    Raises:
        ValueError: If the text cannot be parsed as a number.
        OverflowError: If the amount is infinite.
    """
    # None is handled like NaN: a missing salary is represented as "0"
    if indian_currency is None:
        return "0"

    if isinstance(indian_currency, (float, int)):
        amount = float(indian_currency)
    else:
        # Remove the currency symbol and the grouping commas before parsing
        cleaned = str(indian_currency).replace("₹", "").replace(",", "").strip()
        # Blank text and the '--' placeholder carry no amount
        if cleaned in ("", "--"):
            return "0"
        amount = float(cleaned)

    # Covers float NaN as well as textual "nan", which float() parses to NaN
    if math.isnan(amount):
        return "0"

    # int() truncates toward zero, dropping any fractional part
    return str(int(amount))

Starting the data transformation: replacing NaN and '--' with 'Unknown' for cleanliness

In [5]:
# Run every salary through convert_to_american_format and write the result
# back over the existing salary_avg_estimate column
df['salary_avg_estimate'] = df['salary_avg_estimate'].apply(convert_to_american_format)

In [6]:
# Replace missing values (NaN) with a readable placeholder in the text columns.
# salary_avg_estimate is deliberately not listed: convert_to_american_format
# has already turned its missing values into "0", and the column is cast to
# int in a later cell, which an 'Unknown' string would break.
missing_placeholders = {
    'company': 'Unknown',
    'job_description': 'Unknown',
    'salary_estimate_payperiod': 'Unknown',
    'company_size': 'Unknown',
    'company_founded': 'Unknown',
    'employment_type': 'Unknown',
    'industry': 'Unknown',
    'sector': 'Unknown',
    'revenue': 'Unknown / Non-Applicable',
}
for column_name, placeholder in missing_placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)

In [7]:
# Replace the '--' placeholder with 'Unknown' in the columns below.
# Every column is rebuilt from its own values; assigning the cleaned industry
# column to sector would overwrite the sector data.
for column_name in ('industry', 'sector', 'company_founded'):
    df[column_name] = df[column_name].replace('--', 'Unknown')

In [8]:
# Cast salary_avg_estimate to int so it can be used in calculations
df['salary_avg_estimate'] = df['salary_avg_estimate'].astype(int)

In [9]:
# Keep the salaries as a NumPy array for the NumPy-based statistics below
avgSalary = df['salary_avg_estimate'].to_numpy()

In [None]:
# Summarise the DataFrame again now that the data wrangling is done
df.info()

In [None]:
# Summary statistics after the data wrangling (shown as the cell output)
clean_summary = df.describe()
clean_summary

Grouping information based on the categorical columns:
Sector, Company Size, Company Revenue, Employment Type, Job Title, and Location.

In [None]:
# Count rows per company_size, largest group first
company_size_counts = df.groupby('company_size').size().reset_index(name='count')
company_size_counts.sort_values(by='count', ascending=False)

In [None]:
# Count rows per revenue bracket, largest group first
revenue_counts = df.groupby('revenue').size().reset_index(name='count')
revenue_counts.sort_values(by='count', ascending=False)

In [None]:
# Count rows per employment_type, most common first
empType = (
    df.groupby('employment_type')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
print(empType)

In [None]:
# Number of rows in each sector
sectorCount = df.groupby('sector').size().reset_index(name='count')

# Keep only the sectors with at least 10 rows, most frequent first
sector_has_enough_rows = sectorCount['count'] >= 10
sectorCount_filtered = sectorCount[sector_has_enough_rows].sort_values(by='count', ascending=False)
print(sectorCount_filtered)

In [None]:
# Number of rows for each job_title
roleCount = df.groupby('job_title').size().reset_index(name='count')

# Keep only the job titles with at least 10 rows, most frequent first
role_has_enough_rows = roleCount['count'] >= 10
roleCount_filtered = roleCount[role_has_enough_rows].sort_values(by='count', ascending=False)
print(roleCount_filtered)

In [None]:
# Number of rows for each location
localCount = df.groupby('location').size().reset_index(name='count')

# Keep only the locations with at least 10 rows, most frequent first
location_has_enough_rows = localCount['count'] >= 10
localCount_filtered = localCount[location_has_enough_rows].sort_values(by='count', ascending=False)
print(localCount_filtered)

Evaluating the Company rating by Sector, Company Size, Company Revenue, Employment Type and Location

In [None]:
# Mean company_rating per sector, rounded to 2 decimals.
# The rating column is selected before aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns, which raises
# TypeError on pandas >= 2.0.
sectorRattingAvg = df.groupby(by='sector')['company_rating'].mean().reset_index(name='mean').round(2)

# Keep only the sectors whose mean rating is at least 4, highest first
sectorAvg_filtered = sectorRattingAvg[sectorRattingAvg['mean'] >= 4].sort_values(by='mean', ascending=False)
print(sectorAvg_filtered)

In [None]:
# Mean company_rating per company_size, highest first.
# Only the rating column is aggregated; averaging the whole grouped frame
# raises TypeError on the text columns in pandas >= 2.0.
df.groupby(by='company_size')['company_rating'].mean().reset_index(name='mean').sort_values(by='mean', ascending=False).round(2)

In [None]:
# Mean company_rating per revenue bracket, highest first.
# Only the rating column is aggregated; averaging the whole grouped frame
# raises TypeError on the text columns in pandas >= 2.0.
df.groupby(by='revenue')['company_rating'].mean().reset_index(name='mean').sort_values(by='mean', ascending=False).round(2)

In [None]:
# Mean company_rating per employment_type, highest first.
# Only the rating column is aggregated; averaging the whole grouped frame
# raises TypeError on the text columns in pandas >= 2.0.
df.groupby(by='employment_type')['company_rating'].mean().reset_index(name='mean').sort_values(by='mean', ascending=False).round(2)

In [None]:
# Mean company_rating per location, highest first.
# Only the rating column is aggregated; averaging the whole grouped frame
# raises TypeError on the text columns in pandas >= 2.0.
df.groupby(by='location')['company_rating'].mean().reset_index(name='mean').sort_values(by='mean', ascending=False).round(2)

Descriptive Statistics: correlation, mean, median, mode, variance, and standard deviation

In [None]:
# Pairwise correlation between the numeric columns only. The text columns are
# excluded explicitly because DataFrame.corr raises on them in pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlations between the numeric columns. The text columns
# are excluded explicitly because DataFrame.corr raises on them in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
# Mean salary (rows whose salary was missing were stored as 0 and are included)
salary_mean = np.mean(avgSalary)
print(f'Mean is {salary_mean:.2f}')

In [None]:
# Median salary
salary_median = np.median(avgSalary)
print(f'Median is {salary_median:.2f}')

In [None]:
# Most frequent salary value(s); mode() returns a Series, printed as-is
salary_mode = df.salary_avg_estimate.mode()
print(f'Mode is {salary_mode}')

In [None]:
# Standard deviation; np.std defaults to ddof=0 (population formula)
salary_std = np.std(avgSalary)
print(f'Standard Deviation is {salary_std:.2f}')

In [None]:
# Variance; np.var defaults to ddof=0 (population formula)
salary_variance = np.var(avgSalary)
print(f'Variance is {salary_variance:.2f}')

In [10]:
# Export the cleaned DataFrame: ';' separated, header row kept, no index
# column, UTF-8 with a BOM
df.to_csv(
    'glassdoorJobs_Output.csv',
    sep=';',
    index=False,
    header=True,
    encoding='utf-8-sig',
)