# Import necessary libraries


In [66]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import re

# Let wide frames show up to 200 columns and long frames up to 3000 rows
# before pandas truncates the rendered output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 3000)

# Load the dataset 

In [67]:
def parent_directory():
  """Return the absolute path of the parent of the current working directory."""
  # Appending os.pardir ("..") to the cwd and normalising with abspath
  # yields the directory one level up.
  return os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Show the directory the dataset path below is resolved against.
print(parent_directory())

def str_2_int(value):
  """Convert a salary cell to a float, ignoring thousands separators.

  Despite its name, this function returns a float, not an int.

  Args:
    value: Either a string such as "55,000" or "55000", or a value that
      is already numeric (int/float, including NaN).

  Returns:
    The value as a float.

  Raises:
    ValueError: If a string cannot be parsed as a number after removing commas.
  """
  # Only strings can carry thousands separators; numeric cells are passed
  # straight to float() instead of failing on the `in` membership check.
  if isinstance(value, str):
    value = value.replace(',', '')
  return float(value)

# Build the dataset path component by component so the separators are
# correct on every platform (the notebook output shows it was run on Windows).
salaries_csv_path = os.path.join(
    parent_directory(), 'final_project', '0.Ressources', '05_2_salaries.csv'
)
salaries = pd.read_csv(salaries_csv_path)


c:\Users\33668\Documents\Albert school Msc1\Pandas


# Renaming Columns and Adding Total Remuneration

In [68]:


# The original column labels are too long to work with, so replace them
# with short names (one per column, in the file's column order).
salaries.columns = [
    'Time', 'Age', 'Industry', 'Job', 'Job Context',
    'Salary', 'Bonus', 'Currency', 'Other Currency', 'Income Context',
    'Country', 'US State', 'City', 'Experience', 'Field experience',
    'Education', 'Gender', 'Race',
]

# Salary is parsed into floats with str_2_int.
salaries["Salary"] = salaries["Salary"].apply(str_2_int)

# A missing bonus counts as no bonus.
salaries = salaries.fillna({"Bonus": 0.0})

# Total Remuneration = Salary + Bonus, placed at position 7,
# i.e. directly after the "Bonus" column.
salaries.insert(7, "Total Remuneration", salaries["Salary"] + salaries["Bonus"])

salaries


Unnamed: 0,Time,Age,Industry,Job,Job Context,Salary,Bonus,Total Remuneration,Currency,Other Currency,Income Context,Country,US State,City,Experience,Field experience,Education,Gender,Race
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000.0,0.0,55000.0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600.0,4000.0,58600.0,GBP,,,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000.0,0.0,34000.0,USD,,,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,,62000.0,3000.0,65000.0,USD,,,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000.0,7000.0,67000.0,USD,,,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28080,9/25/2024 19:54:18,45-54,"Accounting, Banking & Finance",Product Manager Lead,,117000.0,8000.0,125000.0,USD,,,USA,Missouri,Remote,11 - 20 years,11 - 20 years,College degree,Woman,White
28081,9/25/2024 21:34:43,45-54,Education (Primary/Secondary),Curriculum Writer,I am a freelance contract curriculum writer fo...,70000.0,0.0,70000.0,USD,,,United States,South Carolina,Bennettsville,21 - 30 years,21 - 30 years,Master's degree,Woman,White
28082,9/30/2024 10:52:30,55-64,Government and Public Administration,Clerical Officer,,28600.0,0.0,28600.0,EUR,,,Ireland,,Dublin,21 - 30 years,1 year or less,"Professional degree (MD, JD, etc.)",Man,White
28083,10/4/2024 13:05:24,18-24,Computing or Tech,Software Engineering Co-Op,I was an Intern,56160.0,0.0,56160.0,USD,,,United States,Rhode Island,Jhonston,1 year or less,1 year or less,College degree,Man,White


# Fixing Types

In [69]:
# Timestamps look like "4/27/2021 11:02:10" (month/day/year, 24h clock).
timestamp_format = '%m/%d/%Y %H:%M:%S'
salaries["Time"] = pd.to_datetime(salaries["Time"], format=timestamp_format)

# Display the resulting column types.
salaries.dtypes

Time                  datetime64[ns]
Age                           object
Industry                      object
Job                           object
Job Context                   object
Salary                       float64
Bonus                        float64
Total Remuneration           float64
Currency                      object
Other Currency                object
Income Context                object
Country                       object
US State                      object
City                          object
Experience                    object
Field experience              object
Education                     object
Gender                        object
Race                          object
dtype: object

# Check Missing Values

In [70]:
# Distinct values of the "Currency" column.
# NOTE(review): pair_list is reassigned (as a plain list) in the exchange-rate
# cell further down, so this assignment looks redundant — confirm before removing.
pair_list = salaries["Currency"].unique()

In [71]:
# Per-column count of missing cells and the share of rows they represent.
missing_values = salaries.isna().sum()
missing_percentage = missing_values.div(len(salaries)).mul(100)

# One row per column, with the count and the percentage side by side.
missing_df = pd.concat(
    [
        missing_values.rename('Missing Values'),
        missing_percentage.rename('Percentage (%)'),
    ],
    axis=1,
)

# Keep only columns that actually have gaps, worst first.
has_missing = missing_df['Missing Values'].gt(0)
missing_df.loc[has_missing].sort_values('Percentage (%)', ascending=False)

Unnamed: 0,Missing Values,Percentage (%)
Other Currency,27878,99.262952
Income Context,25041,89.161474
Job Context,20818,74.124978
US State,5026,17.895674
Education,222,0.790458
Race,177,0.63023
Gender,171,0.608866
City,82,0.291971
Industry,74,0.263486
Job,1,0.003561


We can therefore see that over 99.2% of the respondents are paid in one of the currencies listed in the "Currency" column. For this reason, we think it makes sense to remove the other currencies from our analysis, as they would not have a significant impact on the results. We will therefore analyse salaries paid in the currencies of the US, Great Britain, Canada, the Euro Zone, Australia and New Zealand, Switzerland, South Africa, Sweden, Hong Kong and Japan. This may bias our conclusions, because the analysis is focused on advanced economies.

# Get current exchange rate

In [72]:
#We are going to use Selenium here to get the exchange rate of the currencies
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time  # no longer used in this cell; kept in case other cells rely on it

# Absolute XPath of the page container whose text holds the rate table.
RATES_CONTAINER_XPATH = "/html/body/div[1]/main/div/div/div[3]"
# Upper bound, in seconds, on how long to wait for the rates to render.
RATES_TIMEOUT_SECONDS = 15

# instantiate a Chrome options object
options = webdriver.ChromeOptions()

# set the options to use Chrome in headless mode
options.add_argument("--headless=new")

# initialize an instance of the chrome driver (browser) in headless mode
driver = webdriver.Chrome(
    options=options,
)

# visit your target site
driver.get("https://www.oanda.com/currency-converter/live-exchange-rates/")

# Wait until the container actually shows EUR pairs instead of sleeping a
# fixed time *after* the lookup: a lookup made before the page has rendered
# would return an empty list. Raises TimeoutException if the rates never
# appear, which is easier to diagnose than missing rates later on.
WebDriverWait(driver, RATES_TIMEOUT_SECONDS).until(
    lambda browser: any(
        "/EUR" in element.text
        for element in browser.find_elements(By.XPATH, RATES_CONTAINER_XPATH)
    )
)

# Container element(s) holding the rates. Their `.text` is read later by
# get_rate, so the driver must stay open until the rates have been extracted.
rates = driver.find_elements(By.XPATH, RATES_CONTAINER_XPATH)

In [73]:
def get_rate(rate, rate_elements=None):
  """Return the EUR value of one unit of the currency code `rate`.

  Args:
    rate: Currency code as found in the "Currency" column, e.g. "USD" or
      "AUD/NZD" (the latter is looked up as "AUD").
    rate_elements: Optional iterable of objects exposing a `.text`
      attribute to search. Defaults to the module-level `rates` scraped
      with Selenium.

  Returns:
    1 for "EUR", the hard-coded 0.12 for "HKD", otherwise the float parsed
    from the scraped text, or None when no element mentions the
    "<code>/EUR" pair.
  """
  if rate == "EUR":
    return 1

  #HKD doesnt exist unfortunately and I couldn't find it so i will hard code the rate
  if rate == "HKD":
    return 0.12

  # The survey groups AUD and NZD together; the AUD quote is used for both.
  if rate == "AUD/NZD":
    rate = "AUD"

  if rate_elements is None:
    rate_elements = rates

  rate_pair = rate + "/EUR"
  # A distinct loop variable keeps the `rate` parameter from being overwritten.
  for element in rate_elements:
    element_text = element.text
    if rate_pair in element_text:
      # The two lines following the pair label are concatenated into one
      # number (presumably the page renders the quote in two parts — verify
      # against the live page).
      exchange_rate_path = element_text.split(rate_pair)[1].split("\n")
      return float(exchange_rate_path[1] + exchange_rate_path[2])

  # Pair not present in any element.
  return None



# Every distinct currency code present in the dataset.
pair_list = salaries['Currency'].unique().tolist()

# Map each code to its rate from get_rate; the catch-all "Other" bucket has
# no rate of its own and is left out.
exchange_rate = {
  code: get_rate(code)
  for code in pair_list
  if code != "Other"
}

  
def get_total_eur(row):
    """Convert a row's 'Total Remuneration' to EUR using `exchange_rate`.

    The 'Currency' column is tried first and 'Other Currency' second; the
    first value that has an entry in `exchange_rate` is used.

    Returns None when neither column holds a known currency.
    """
    primary = row['Currency']
    if primary in exchange_rate:
        return row['Total Remuneration'] * exchange_rate[primary]

    fallback = row['Other Currency']
    if fallback in exchange_rate:
        return row['Total Remuneration'] * exchange_rate[fallback]

    return None  # no known currency on this row




# Row-wise conversion (axis=1 passes each row to get_total_eur); rows whose
# currency is unknown get a missing value.
salaries['Total Remuneration EUR'] = salaries.apply(get_total_eur, axis=1)
# Display the frame with the new column.
salaries

Unnamed: 0,Time,Age,Industry,Job,Job Context,Salary,Bonus,Total Remuneration,Currency,Other Currency,Income Context,Country,US State,City,Experience,Field experience,Education,Gender,Race,Total Remuneration EUR
0,2021-04-27 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000.0,0.0,55000.0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White,51243.500
1,2021-04-27 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600.0,4000.0,58600.0,GBP,,,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White,70296.560
2,2021-04-27 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000.0,0.0,34000.0,USD,,,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White,31677.800
3,2021-04-27 11:02:41,25-34,Nonprofits,Program Manager,,62000.0,3000.0,65000.0,USD,,,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White,60560.500
4,2021-04-27 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000.0,7000.0,67000.0,USD,,,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White,62423.900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28080,2024-09-25 19:54:18,45-54,"Accounting, Banking & Finance",Product Manager Lead,,117000.0,8000.0,125000.0,USD,,,USA,Missouri,Remote,11 - 20 years,11 - 20 years,College degree,Woman,White,116462.500
28081,2024-09-25 21:34:43,45-54,Education (Primary/Secondary),Curriculum Writer,I am a freelance contract curriculum writer fo...,70000.0,0.0,70000.0,USD,,,United States,South Carolina,Bennettsville,21 - 30 years,21 - 30 years,Master's degree,Woman,White,65219.000
28082,2024-09-30 10:52:30,55-64,Government and Public Administration,Clerical Officer,,28600.0,0.0,28600.0,EUR,,,Ireland,,Dublin,21 - 30 years,1 year or less,"Professional degree (MD, JD, etc.)",Man,White,28600.000
28083,2024-10-04 13:05:24,18-24,Computing or Tech,Software Engineering Co-Op,I was an Intern,56160.0,0.0,56160.0,USD,,,United States,Rhode Island,Jhonston,1 year or less,1 year or less,College degree,Man,White,52324.272


# Voici ton code initial sur currency rate conversion

In [None]:
# Give missing "Other Currency" cells an explicit placeholder so they can be
# matched with isin() below.
salaries["Other Currency"] = salaries['Other Currency'].fillna('NaN')
allowed_currencies = ['USD', 'GBP', 'CAD', 'EUR', 'AUD/NZD', 'CHF', 'ZAR', 'SEK', 'HKD', 'JPY']
other = ['Other', 'NaN', 'Na', 'N/a']

# Keep a row when exactly one of the two columns names an allowed currency
# and the other column is one of the "no value" markers.
currency_is_allowed = (
    salaries['Currency'].isin(allowed_currencies) & salaries['Other Currency'].isin(other)
)
other_currency_is_allowed = (
    salaries['Currency'].isin(other) & salaries['Other Currency'].isin(allowed_currencies)
)

# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
salaries = salaries[currency_is_allowed | other_currency_is_allowed].copy()
print(salaries['Currency'].unique())
print(salaries['Other Currency'].unique())
len(salaries)


# prompt: make a function that converts the values of the "Total Remuneration" into EUR depending on the currency located in "Currency" or "Other currency" with current exchange rates

# Hard-coded conversion factors: value in EUR of ONE unit of each currency.
# Defined once at module level so the table is not rebuilt for every row.
_STATIC_EUR_RATES = {
    'USD': 0.93,      # EUR per 1 USD
    'GBP': 1.15,      # EUR per 1 GBP
    'CAD': 0.68,      # EUR per 1 CAD
    'EUR': 1.00,      # EUR per 1 EUR
    'AUD/NZD': 0.60,  # EUR per 1 AUD (the same rate is assumed for NZD for simplicity)
    'CHF': 0.95,      # EUR per 1 CHF
    'ZAR': 0.05,      # EUR per 1 ZAR
    'SEK': 0.09,      # EUR per 1 SEK
    'HKD': 0.12,      # EUR per 1 HKD
    'JPY': 0.0071,    # EUR per 1 JPY
}


def convert_to_eur(row):
  """Converts the 'Total Remuneration' to EUR based on the currency in 'Currency' or 'Other Currency'.

  Uses the hard-coded rate table above rather than live rates. 'Currency'
  takes precedence; 'Other Currency' is only consulted when 'Currency' has
  no entry in the table.

  Args:
    row: A pandas Series (or any mapping) with the keys 'Currency',
      'Other Currency' and 'Total Remuneration'.

  Returns:
    The 'Total Remuneration' in EUR, or None if neither column holds a
    currency present in the rate table.
  """
  for column in ('Currency', 'Other Currency'):
    currency = row[column]
    if currency in _STATIC_EUR_RATES:
      return row['Total Remuneration'] * _STATIC_EUR_RATES[currency]

  return None  # Return None if currency is not found

# Apply convert_to_eur to every row (axis=1) and store the result in
# 'Total Remuneration EUR'. If that column already exists from the
# get_total_eur cell, it is overwritten with the hard-coded-rate values.
salaries['Total Remuneration EUR'] = salaries.apply(convert_to_eur, axis=1)
# Display the frame with the converted column.
salaries

# Questions
1. Which industry pays the most?
2. How does salary increase given years of experience?
3. How do salaries compare for the same role in different locations?
4. How much do salaries differ by gender and years of experience?
5. How do factors like race and education level correlate with salary?
6. Is there a “sweet spot” total work experience vs years in the specific field?

In [None]:
# Question 1: which industry pays the most?
# Only the industry label and the EUR-converted remuneration are needed.
salary_average = salaries.loc[:, ['Industry', 'Total Remuneration EUR']]

# Mean EUR remuneration per industry.
list_pay_industry = salary_average.groupby("Industry").mean()

# Highest-paying industries first.
ranked_by_pay = list_pay_industry.sort_values("Total Remuneration EUR", ascending=False)
print(ranked_by_pay)

