# Data Cleaning: Salary

In [2]:
import pandas as pd
import re 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv("final_sorted_job_titles_date.csv")

In [12]:
df.columns

Index(['Job Title', 'Company', 'Location', 'Salary Min', 'Salary Max',
       'Description', 'Extracted Skills', 'Longitude', 'Latitude', 'URL',
       'Created', 'Experience_years', 'Experience_Level', ' Apache Airflow',
       ' Azure', ' Big Data', ' Data Pipelines', ' Deep Learning', ' Docker',
       ' ETL', ' GCP', ' Hadoop', ' Java', ' KPI', ' Kafka', ' Kubernetes',
       ' Machine Learning', ' Pandas', ' Power BI', ' PyTorch', ' Python',
       ' R', ' SQL', ' Scikit-Learn', ' Snowflake', ' Spark', ' Tableau',
       ' TensorFlow', 'AWS', 'Apache Airflow', 'Azure', 'Big Data',
       'Data Pipelines', 'Deep Learning', 'Docker', 'ETL', 'GCP',
       'Google Analytics', 'Hadoop', 'Java', 'KPI', 'Kafka', 'Kubernetes',
       'Machine Learning', 'Pandas', 'Power BI', 'PyTorch', 'Python', 'R',
       'SQL', 'Snowflake', 'Spark', 'Tableau', 'TensorFlow'],
      dtype='object')

In [5]:
# Identifying Anomalous Salary Values
low_salaries = df[(df["Salary Min"] < 1000) | (df["Salary Max"] < 1000)]
low_salaries

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
24,Data Engineer,SRA Information Technology,"Barton, South Canberra",150.0,200.0,Senior Software Developer | Full Stack Develop...,[],149.133481,-35.309391,https://www.adzuna.com.au/details/5082054934?u...,...,0,0,0,0,0,0,0,0,0,0
58,Data Engineer,HCLTech,"The Rocks, Sydney",150.0,200.0,Data Architect Location: Sydney Job type: Perm...,[],151.206616,-33.865715,https://www.adzuna.com.au/details/5056637568?u...,...,0,1,0,0,0,0,0,0,0,0
59,Data Engineer,InfoCentric,"Dunnstown, Moorabool Area",150.0,200.0,We are looking for an experienced AWS Data Arc...,[],143.952548,-37.579665,https://www.adzuna.com.au/details/5056635779?u...,...,0,0,0,0,0,0,0,0,0,0
60,Data Scientist,Cevo trades as Cevo Pty,"Dunnstown, Moorabool Area",200.0,250.0,Who are we? Cevo is a trusted leader in techno...,[],143.952548,-37.579665,https://www.adzuna.com.au/details/5056456712?u...,...,0,0,0,0,0,0,0,0,0,0
61,Data Scientist,Cevo trades as Cevo Pty,"The Rocks, Sydney",200.0,250.0,Who are we? Cevo is a trusted leader in techno...,[],151.206616,-33.865715,https://www.adzuna.com.au/details/5056456424?u...,...,0,0,0,0,0,0,0,0,0,0
62,Data Scientist,Cevo trades as Cevo Pty,"Dunnstown, Moorabool Area",200.0,250.0,Who are we? Cevo is a trusted leader in techno...,[],143.952548,-37.579665,https://www.adzuna.com.au/details/5056456711?u...,...,0,0,0,0,0,0,0,0,0,0
80,Data Scientist,Innablr,"Dunnstown, Moorabool Area",150.0,200.0,Innablr is an engineering-led cloud native con...,[],143.952548,-37.579665,https://www.adzuna.com.au/details/5056636178?u...,...,0,0,0,0,0,0,0,0,0,0
81,Data Scientist,News Corporation,"The Rocks, Sydney",200.0,250.0,News Corp Australia is looking for a highly dr...,[],151.206616,-33.865715,https://www.adzuna.com.au/details/5067936284?u...,...,0,0,0,0,0,0,0,0,0,0
86,Data Engineer,intelia,"Dunnstown, Moorabool Area",150.0,200.0,Get AI-powered advice on this job and more exc...,[],143.952548,-37.579665,https://www.adzuna.com.au/details/5056635361?u...,...,0,0,0,0,0,0,0,0,0,0
88,Data Engineer,VIS Global Pty Ltd,"Dunnstown, Moorabool Area",150.0,200.0,"As a GCP Data Architect , you’ll be responsibl...",[],143.952548,-37.579665,https://www.adzuna.com.au/details/5056636697?u...,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Step 1: Fix incorrect salary scales (multiply by 1,000)
salaries_to_correct = [90, 120, 135, 145, 150, 190, 200, 250]
df.loc[df["Salary Min"].isin(salaries_to_correct), "Salary Min"] *= 1000
df.loc[df["Salary Max"].isin(salaries_to_correct), "Salary Max"] *= 1000

In [7]:
# Check if Step 1 has been adjusted: YES!
low_salaries = df[(df["Salary Min"] < 1000) | (df["Salary Max"] < 1000)]
low_salaries

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
384,Data Scientist,Digimatch,"Ile-de-France, France",400.0,600.0,Nous recherchons un Data Scientist pour renfor...,[],2.351634,48.811332,https://www.adzuna.fr/details/5079956277?utm_m...,...,0,0,0,0,0,0,0,0,0,0
385,Data Scientist,Jinane Consulting,"Montrouge, Antony",100.0,570.0,Objectifs/Descriptif de mission : Les principa...,[],2.31993,48.81846,https://www.adzuna.fr/details/5076501022?utm_m...,...,0,0,0,0,0,0,0,0,0,0
387,Data Scientist,Phaidon London- Glocomms,"Paris, Ile-de-France",400.0,550.0,Proposer des solutions techniques pour répondr...,[],2.344631,48.863839,https://www.adzuna.fr/details/5067546520?utm_m...,...,0,0,0,0,0,0,0,0,0,0
388,Data Scientist,AVALIANCE,"Montrouge, Antony",400.0,550.0,Nous recherchons pour notre client basé à Mont...,[],2.31993,48.81846,https://www.adzuna.fr/details/5054750074?utm_m...,...,0,0,0,0,0,0,0,0,0,0
397,Data Scientist,APRIL,"Caluire-et-Cuire, Lyon",486.0,1802.0,"Venez extraire, analyser et développer des alg...",[],4.85052,45.76853,https://www.adzuna.fr/details/5036975152?utm_m...,...,0,0,0,0,0,0,0,0,0,0
398,Data Scientist,Canal Plus,"Puteaux, Nanterre",486.0,1802.0,"Plus précisément, vous intégrerez la direction...",[],2.23762,48.88328,https://www.adzuna.fr/details/5076170998?utm_m...,...,0,0,0,0,0,0,0,0,0,0
401,Data Scientist,Canal Plus,"Puteaux, Nanterre",486.0,1802.0,"Plus précisément, vous intégrerez la direction...",[],2.23762,48.88328,https://www.adzuna.fr/details/5076171459?utm_m...,...,0,0,0,0,0,0,0,0,0,0
403,Data Scientist,Framatome,"Rhône, Auvergne-Rhône-Alpes",486.0,1802.0,Framatome souhaite étendre sa capacité à trait...,[],4.902705,45.85511,https://www.adzuna.fr/details/5080251351?utm_m...,...,0,0,0,0,0,0,0,0,0,0
404,Data Scientist,Groupe Aptenia,"Ile-de-France, France",600.0,650.0,"Vous rejoindrez l'équipe Global IT (GIT), qui ...",[],2.351634,48.811332,https://www.adzuna.fr/details/5062815536?utm_m...,...,0,0,0,1,0,0,0,0,0,0
800,Data Engineer,Just People Information Security,"Brisbane, Brisbane Region",0.0,286000.0,You will be a key part of delivering high qual...,[],152.928532,-27.416056,https://www.adzuna.com.au/details/5079379477?u...,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Step 2: Convert daily salaries to annual by multiplying by 252
daily_salaries = [100, 400, 480, 520, 550, 570, 600, 650]
df.loc[df["Salary Min"].isin(daily_salaries), "Salary Min"] *= 252
df.loc[df["Salary Max"].isin(daily_salaries), "Salary Max"] *= 252

In [9]:
# Check if Step 2 has been adjusted: YES
low_salaries = df[(df["Salary Min"] < 1000) | (df["Salary Max"] < 1000)]
low_salaries

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
397,Data Scientist,APRIL,"Caluire-et-Cuire, Lyon",486.0,1802.0,"Venez extraire, analyser et développer des alg...",[],4.85052,45.76853,https://www.adzuna.fr/details/5036975152?utm_m...,...,0,0,0,0,0,0,0,0,0,0
398,Data Scientist,Canal Plus,"Puteaux, Nanterre",486.0,1802.0,"Plus précisément, vous intégrerez la direction...",[],2.23762,48.88328,https://www.adzuna.fr/details/5076170998?utm_m...,...,0,0,0,0,0,0,0,0,0,0
401,Data Scientist,Canal Plus,"Puteaux, Nanterre",486.0,1802.0,"Plus précisément, vous intégrerez la direction...",[],2.23762,48.88328,https://www.adzuna.fr/details/5076171459?utm_m...,...,0,0,0,0,0,0,0,0,0,0
403,Data Scientist,Framatome,"Rhône, Auvergne-Rhône-Alpes",486.0,1802.0,Framatome souhaite étendre sa capacité à trait...,[],4.902705,45.85511,https://www.adzuna.fr/details/5080251351?utm_m...,...,0,0,0,0,0,0,0,0,0,0
800,Data Engineer,Just People Information Security,"Brisbane, Brisbane Region",0.0,286000.0,You will be a key part of delivering high qual...,[],152.928532,-27.416056,https://www.adzuna.com.au/details/5079379477?u...,...,0,0,0,0,0,0,0,0,0,0
1350,Data Engineer,,"Johannesburg, Gauteng",0.0,13200.0,Join a leading company pioneering the power of...,[],28.026104,-26.183117,https://www.adzuna.co.za/land/ad/5051267974?se...,...,0,0,0,0,0,0,0,0,0,0
1356,Data Engineer,Jordan Human Resources,"Centurion, Tshwane",36.0,36.0,A client in the automobile industry is looking...,[],28.18586,-25.8425,https://www.adzuna.co.za/details/5078585241?ut...,...,0,0,0,0,0,0,0,0,0,0
1415,Machine Learning Engineer,CES Information Technologies,"Hyderabad, Telangana",0.0,1500000.0,o Critical thinking mind who likes to solve co...,[],78.50806,17.40275,https://www.adzuna.in/details/2658062217?utm_m...,...,0,0,0,0,0,0,0,0,0,0
1417,Machine Learning Engineer,SIlverPeople Consulting,"Mumbai, Maharashtra",0.0,2700000.0,ML Engineer-Analyst/ Senior Analyst Job purpos...,[],72.84415,19.00821,https://www.adzuna.in/details/2383302553?utm_m...,...,0,0,0,0,0,0,0,0,0,0
1629,Machine Learning Engineer,V R Della Infotech Inc,"Sunnyvale, Santa Clara County",2.0,61.0,Description: Description Job Title EA ML Engin...,[],-122.02194,37.376099,https://www.adzuna.com/details/5080727251?utm_...,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Delete these 7 rows!
df = df[df["Salary Min"] >= 10000]
new_count = df.shape[0]
print(f"New total job listings after deletion: {new_count}")

New total job listings after deletion: 2364


In [11]:
df

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
0,Data Engineer,Catch Recruit,"Sandton, North Johannesburg",240000.0,420000.0,Key Responsibilities - Develop and maintain sc...,[],28.048330,-26.068610,https://www.adzuna.co.za/details/5012435460?ut...,...,0,0,0,0,0,0,0,0,0,0
1,Data Engineer,TN Italy,"Lombardia, Italia",50000.0,70000.0,"Data Core Engineer - Milano/Roma/Napoli, Città...",[],9.574901,45.581846,https://www.adzuna.it/land/ad/5083569093?se=xq...,...,0,0,0,0,1,0,0,0,0,0
2,Data Scientist,Amazon,"Provincia di Milano, Lombardia",50000.0,70000.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/5080642147?se=xq...,...,0,0,0,0,0,0,0,0,0,0
3,Data Scientist,ENGINEERINGUK,"Provincia di Milano, Lombardia",50000.0,70000.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4934765924?se=xq...,...,0,0,0,0,0,0,0,0,0,0
4,Data Scientist,Amazon,"Provincia di Milano, Lombardia",50000.0,70000.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4998300620?se=xq...,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2417,Analytics Consultant,Link Group,"Wrocław, dolnośląskie",22680.0,23940.0,Chcesz pracować w dynamicznie rozwijającej się...,[],17.038540,51.107890,https://www.adzuna.pl/land/ad/5081135237?se=cq...,...,0,0,0,0,0,1,0,0,0,0
2418,Analytics Consultant,Link Group,"Gdańsk, Trójmiasto",22680.0,23940.0,Chcesz pracować w dynamicznie rozwijającej się...,[],18.646640,54.352030,https://www.adzuna.pl/land/ad/5081135244?se=cq...,...,0,0,0,0,0,1,0,0,0,0
2419,Analytics Consultant,Link Group,"Warszawa, mazowieckie",22680.0,23940.0,Chcesz pracować w dynamicznie rozwijającej się...,[],21.245300,52.215190,https://www.adzuna.pl/land/ad/5081135234?se=cq...,...,0,0,0,0,0,1,0,0,0,0
2420,Analytics Consultant,Link Group,"Poznań, wielkopolskie",22680.0,23940.0,Chcesz pracować w dynamicznie rozwijającej się...,[],16.938750,52.403210,https://www.adzuna.pl/land/ad/5081135315?se=cq...,...,0,0,0,0,0,1,0,0,0,0


In [73]:
# Step 4: Standardize Currency to USD (assuming all values are in local currencies)
# Conversion rates (Example: You can update based on real-time rates)
currency_conversion = {
    "pl": 0.25,  # Polish Zloty to USD
    "it": 1.09,  # Euro to USD
    "fr": 1.09,  # Euro to USD
    "es": 1.09,  # Euro to USD
    "de": 1.09,  # Euro to USD
    "gb": 1.26,  # British Pound to USD
    "au": 0.66,  # Australian Dollar to USD
    "ca": 0.74,  # Canadian Dollar to USD
    "in": 0.012, # Indian Rupee to USD
    "za": 0.054, # South African Rand to USD
    "mx": 0.056,  # Mexican Peso to USD
    "at": 1.09,  # Austria (Euro)
    "be": 1.09,  # Belgium (Euro)
    "br": 0.20,  # Brazil (Brazilian Real to USD)
    "ch": 1.12,  # Switzerland (Swiss Franc to USD)
    "nl": 1.09,  # Netherlands (Euro)
    "nz": 0.61,  # New Zealand (NZD to USD)
    "sg": 0.74   # Singapore (SGD to USD)
}

In [74]:
# Function to extract country code from URL and apply conversion
def convert_salary_using_url(row):
    url = str(row["URL"])
    salary_min = row["Salary Min"]
    salary_max = row["Salary Max"]
    
    # Extract country code from the domain using regex (handle .co.za and .co.uk separately)
    match = re.search(r'adzuna\.([a-z]{2,3})/', url)
    country_code = match.group(1) if match else "us"  # Default to USD if no match
    
    # Adjust for unique domain structures
    if "adzuna.co.za" in url:
        country_code = "za"
    elif "adzuna.co.uk" in url:
        country_code = "gb"
    elif "adzuna.com.au" in url:
        country_code = "au"
    
    # Get conversion rate (default to 1 if country not found)
    conversion_rate = currency_conversion.get(country_code, 1)
    
    # Convert salaries to USD
    return round(salary_min * conversion_rate, 2), round(salary_max * conversion_rate, 2) if pd.notna(salary_max) else None


In [75]:
# Apply conversion function
df[["Salary Min", "Salary Max"]] = df.apply(convert_salary_using_url, axis=1, result_type="expand")

In [76]:
df

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
0,Data Engineer,Catch Recruit,"Sandton, North Johannesburg",12960.0,22680.0,Key Responsibilities - Develop and maintain sc...,[],28.04833,-26.06861,https://www.adzuna.co.za/details/5012435460?ut...,...,0,0,0,0,0,0,0,0,0,0
1,Data Engineer,TN Italy,"Lombardia, Italia",54500.0,76300.0,"Data Core Engineer - Milano/Roma/Napoli, Città...",[],9.574901,45.581846,https://www.adzuna.it/land/ad/5083569093?se=xq...,...,0,0,0,0,1,0,0,0,0,0
2,Data Scientist,Amazon,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/5080642147?se=xq...,...,0,0,0,0,0,0,0,0,0,0
3,Data Scientist,ENGINEERINGUK,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4934765924?se=xq...,...,0,0,0,0,0,0,0,0,0,0
4,Data Scientist,Amazon,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4998300620?se=xq...,...,0,0,0,0,0,0,0,0,0,0
5,Data Scientist,Amazon,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4986949638?se=xq...,...,0,0,0,0,0,0,0,0,0,0
6,Machine Learning Engineer,TN Italy,"Provincia di Brescia, Lombardia",54500.0,76300.0,Social network you want to login/join with: Ma...,[],10.263698,45.651602,https://www.adzuna.it/land/ad/5083567845?se=xq...,...,0,0,0,0,0,0,0,0,0,0
7,Data Engineer,Unit8,"Warszawa, mazowieckie",66000.0,84000.0,"Who We Are Founded in 2017, Unit8 is a fast-gr...",[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5077274837?se=5N...,...,0,0,0,0,1,0,0,0,0,0
8,Data Engineer,Unit8,"Wrocław, dolnośląskie",66000.0,84000.0,"Who We Are Founded in 2017, Unit8 is a fast-gr...",[],17.03854,51.10789,https://www.adzuna.pl/land/ad/5077279762?se=5N...,...,0,0,0,0,1,0,0,0,0,0
9,Data Engineer,Unit8,"Kraków, małopolskie",66000.0,84000.0,"Who We Are Founded in 2017, Unit8 is a fast-gr...",[],20.17848,50.1024,https://www.adzuna.pl/land/ad/5077279767?se=5N...,...,0,0,0,0,1,0,0,0,0,0


In [77]:
# Count job listings where Salary Min is below 10,000
low_salary_count = df[df["Salary Min"] < 10000].shape[0]
print(f"Number of job listings where Salary Min is below 10,000: {low_salary_count}")


Number of job listings where Salary Min is below 10,000: 47


In [78]:
# Delete these 47 Job listings.
df = df[df["Salary Min"] >= 10000]

In [79]:
df.describe()

Unnamed: 0,Salary Min,Salary Max,Longitude,Latitude,Experience_years,Apache Airflow,Azure,Big Data,Data Pipelines,Deep Learning,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
count,2317.0,2316.0,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0,...,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0,2317.0
mean,79779.911731,93717.194072,-16.614265,40.581493,3.151057,0.001295,0.011653,0.011221,0.023738,0.00259,...,0.001295,0.032369,0.000432,0.055244,0.058265,0.060423,0.007769,0.00259,0.005179,0.000432
std,41980.097861,45786.624766,56.8848,20.359173,3.45756,0.035967,0.107341,0.105358,0.152263,0.050833,...,0.035967,0.177017,0.020775,0.228505,0.234294,0.23832,0.087816,0.050833,0.071795,0.020775
min,10500.0,12000.0,-157.8465,-37.844856,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50458.44,63000.0,-75.566083,38.97,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,70500.0,82948.475,0.131237,45.473082,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,100960.0,114000.0,16.93875,51.523772,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,278460.0,566800.0,174.7633,58.299755,15.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [80]:
df

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
0,Data Engineer,Catch Recruit,"Sandton, North Johannesburg",12960.0,22680.0,Key Responsibilities - Develop and maintain sc...,[],28.04833,-26.06861,https://www.adzuna.co.za/details/5012435460?ut...,...,0,0,0,0,0,0,0,0,0,0
1,Data Engineer,TN Italy,"Lombardia, Italia",54500.0,76300.0,"Data Core Engineer - Milano/Roma/Napoli, Città...",[],9.574901,45.581846,https://www.adzuna.it/land/ad/5083569093?se=xq...,...,0,0,0,0,1,0,0,0,0,0
2,Data Scientist,Amazon,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/5080642147?se=xq...,...,0,0,0,0,0,0,0,0,0,0
3,Data Scientist,ENGINEERINGUK,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4934765924?se=xq...,...,0,0,0,0,0,0,0,0,0,0
4,Data Scientist,Amazon,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4998300620?se=xq...,...,0,0,0,0,0,0,0,0,0,0
5,Data Scientist,Amazon,"Provincia di Milano, Lombardia",54500.0,76300.0,"Regional Safety Engineer, Data Center Health a...",[],9.138379,45.473082,https://www.adzuna.it/land/ad/4986949638?se=xq...,...,0,0,0,0,0,0,0,0,0,0
6,Machine Learning Engineer,TN Italy,"Provincia di Brescia, Lombardia",54500.0,76300.0,Social network you want to login/join with: Ma...,[],10.263698,45.651602,https://www.adzuna.it/land/ad/5083567845?se=xq...,...,0,0,0,0,0,0,0,0,0,0
7,Data Engineer,Unit8,"Warszawa, mazowieckie",66000.0,84000.0,"Who We Are Founded in 2017, Unit8 is a fast-gr...",[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5077274837?se=5N...,...,0,0,0,0,1,0,0,0,0,0
8,Data Engineer,Unit8,"Wrocław, dolnośląskie",66000.0,84000.0,"Who We Are Founded in 2017, Unit8 is a fast-gr...",[],17.03854,51.10789,https://www.adzuna.pl/land/ad/5077279762?se=5N...,...,0,0,0,0,1,0,0,0,0,0
9,Data Engineer,Unit8,"Kraków, małopolskie",66000.0,84000.0,"Who We Are Founded in 2017, Unit8 is a fast-gr...",[],20.17848,50.1024,https://www.adzuna.pl/land/ad/5077279767?se=5N...,...,0,0,0,0,1,0,0,0,0,0


In [81]:
pd.set_option("display.max_rows", None)
filtered_df = df[df["Company"].str.lower().isin(["link group", "provectus", "softeq"])]
filtered_df

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
495,Data Scientist,Link Group,"Warszawa, mazowieckie",50400.0,55440.0,About the Role We are looking for a SAS Data S...,[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5059151584?se=PO...,...,0,0,0,0,0,0,0,0,0,0
496,Data Scientist,Link Group,"Warszawa, mazowieckie",60480.0,65520.0,About the Role We are looking for a SAS Data S...,[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5059151617?se=PO...,...,0,0,0,0,0,0,0,0,0,0
1109,Data Engineer,Link Group,"Kraków, małopolskie",10500.0,12000.0,"Global leader in licensed images, videos, and ...",[],20.17848,50.1024,https://www.adzuna.pl/land/ad/5074474690?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1110,Data Engineer,Link Group,"Gdańsk, Trójmiasto",10500.0,12000.0,"Global leader in licensed images, videos, and ...",[],18.64664,54.35203,https://www.adzuna.pl/land/ad/5074474676?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1111,Data Engineer,Link Group,"Warszawa, mazowieckie",10500.0,12000.0,"Global leader in licensed images, videos, and ...",[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5074474720?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1112,Data Engineer,Link Group,"Wrocław, dolnośląskie",10500.0,12000.0,"Global leader in licensed images, videos, and ...",[],17.03854,51.10789,https://www.adzuna.pl/land/ad/5074474707?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1115,Data Engineer,Link Group,"Poznań, wielkopolskie",10500.0,12000.0,"Global leader in licensed images, videos, and ...",[],16.93875,52.40321,https://www.adzuna.pl/land/ad/5074474620?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1116,Data Engineer,Link Group,"Wrocław, dolnośląskie",66000.0,75000.0,Obowiązki: Projektowanie i implementacja proce...,[],17.03854,51.10789,https://www.adzuna.pl/land/ad/5010497126?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1117,Data Engineer,Link Group,"Katowice, śląskie",66000.0,75000.0,Obowiązki: Projektowanie i implementacja proce...,[],19.02547,50.26008,https://www.adzuna.pl/land/ad/5010497150?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1118,Data Engineer,Link Group,"Warszawa, mazowieckie",66000.0,75000.0,Obowiązki: Projektowanie i implementacja proce...,[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5010497078?se=GE...,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# The LinkGroup, able.Tech, Softeq and Provectus Jobs are monthly, and need to be converted into yearly salaries:
companies = ["link group", "softeq", "provectus"]
df.loc[(df["Company"].str.lower().isin(companies)) & (df["Salary Max"] < 28000), "Salary Max"] *= 12


In [83]:
# Check if this has been done:
pd.set_option("display.max_rows", None)
filtered_df = df[df["Company"].str.lower().isin(["link group", "provectus", "softeq"])]
filtered_df

Unnamed: 0,Job Title,Company,Location,Salary Min,Salary Max,Description,Extracted Skills,Longitude,Latitude,URL,...,Pandas,Power BI,PyTorch,Python,R,SQL,Snowflake,Spark,Tableau,TensorFlow
495,Data Scientist,Link Group,"Warszawa, mazowieckie",50400.0,55440.0,About the Role We are looking for a SAS Data S...,[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5059151584?se=PO...,...,0,0,0,0,0,0,0,0,0,0
496,Data Scientist,Link Group,"Warszawa, mazowieckie",60480.0,65520.0,About the Role We are looking for a SAS Data S...,[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5059151617?se=PO...,...,0,0,0,0,0,0,0,0,0,0
1109,Data Engineer,Link Group,"Kraków, małopolskie",10500.0,144000.0,"Global leader in licensed images, videos, and ...",[],20.17848,50.1024,https://www.adzuna.pl/land/ad/5074474690?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1110,Data Engineer,Link Group,"Gdańsk, Trójmiasto",10500.0,144000.0,"Global leader in licensed images, videos, and ...",[],18.64664,54.35203,https://www.adzuna.pl/land/ad/5074474676?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1111,Data Engineer,Link Group,"Warszawa, mazowieckie",10500.0,144000.0,"Global leader in licensed images, videos, and ...",[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5074474720?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1112,Data Engineer,Link Group,"Wrocław, dolnośląskie",10500.0,144000.0,"Global leader in licensed images, videos, and ...",[],17.03854,51.10789,https://www.adzuna.pl/land/ad/5074474707?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1115,Data Engineer,Link Group,"Poznań, wielkopolskie",10500.0,144000.0,"Global leader in licensed images, videos, and ...",[],16.93875,52.40321,https://www.adzuna.pl/land/ad/5074474620?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1116,Data Engineer,Link Group,"Wrocław, dolnośląskie",66000.0,75000.0,Obowiązki: Projektowanie i implementacja proce...,[],17.03854,51.10789,https://www.adzuna.pl/land/ad/5010497126?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1117,Data Engineer,Link Group,"Katowice, śląskie",66000.0,75000.0,Obowiązki: Projektowanie i implementacja proce...,[],19.02547,50.26008,https://www.adzuna.pl/land/ad/5010497150?se=GE...,...,0,0,0,0,0,0,0,0,0,0
1118,Data Engineer,Link Group,"Warszawa, mazowieckie",66000.0,75000.0,Obowiązki: Projektowanie i implementacja proce...,[],21.2453,52.21519,https://www.adzuna.pl/land/ad/5010497078?se=GE...,...,0,0,0,0,0,0,0,0,0,0


In [84]:
df.to_csv("Cleaned_Salary_Data_and_Title_and_Skills_Final.csv", index=False)
