Load file

In [1]:
import pandas as pd
import numpy as np

In [2]:
job_data = pd.read_csv('ai_job_dataset.csv')

In [4]:
job_data['posting_date'] = pd.to_datetime(job_data['posting_date'], format='%d-%m-%Y')
job_data['application_deadline'] = pd.to_datetime(job_data['application_deadline'], format='%d-%m-%Y')

In [5]:
    def create_dataframe(self): 
        """
        Converts the job data into a pandas DataFrame.
        """
        return pd.read_csv(self.job_data)

In [11]:
   def get_company_locations(df):
        """
        Extracts unique company locations from the job data.
        """
        if 'company_location' in df.columns:
            return df['company_location'].unique().tolist()
        return []

In [12]:
list_of_locations = get_company_locations(job_data)
print("Unique company locations:", list_of_locations)

Unique company locations: ['China', 'Canada', 'Switzerland', 'India', 'France', 'Germany', 'United Kingdom', 'Singapore', 'Austria', 'Sweden', 'South Korea', 'Norway', 'Netherlands', 'United States', 'Israel', 'Australia', 'Ireland', 'Denmark', 'Finland', 'Japan']


In [42]:
    def salary_range_per_emp_type(df, country):
        """
        Calculates the salary range for each employment type.
        """
        if 'company_location' not in df.columns or 'employment_type' not in df.columns or 'salary_usd' not in df.columns:
            #print("Required columns are missing in the DataFrame.")
            print("Available columns:", df.columns)
            return {}
        else:
            filtered_df = df[df['company_location'] == country]
            salary_ranges = filtered_df.groupby('employment_type')['salary_usd'].agg(['min', 'max', 'mean'])
            return salary_ranges
            #return pd.DataFrame(salary_ranges).reset_index()

In [44]:
for country in list_of_locations:
    salary_ranges = salary_range_per_emp_type(job_data, country)
    print(type(salary_ranges))
    print(f"Salary ranges for {country}:")
    print(salary_ranges)

<class 'pandas.core.frame.DataFrame'>
Salary ranges for China:
                   min     max          mean
employment_type                             
CT               35593  201759  85850.790055
FL               33035  203496  83774.810526
FT               35148  201300  88481.834225
PT               33013  204450  81832.341463
<class 'pandas.core.frame.DataFrame'>
Salary ranges for Canada:
                   min     max           mean
employment_type                              
CT               46460  271344  117140.549738
FL               50749  271118  117491.873684
FT               44119  252186  111212.630841
PT               46467  261291  111287.310345
<class 'pandas.core.frame.DataFrame'>
Salary ranges for Switzerland:
                   min     max           mean
employment_type                              
CT               66178  398084  170427.375635
FL               67383  364635  168483.702247
FT               65416  390292  173916.718919
PT               65625  3990

In [45]:
    def get_avg_exp_per_level(df, country):
        """
        Calculates the average experience required for each job level.
        """
        if 'company_location' not in df.columns or 'experience_level' not in df.columns or 'years_experience' not in df.columns:
            print("Required columns are missing in the DataFrame.")
            print("Available columns:", df.columns)
            return {}
        else:
            filtered_df = df[df['company_location'] == country]
            avg_exp_per_level = filtered_df.groupby('experience_level')['years_experience'].mean().to_dict()
            return avg_exp_per_level

In [54]:
for country in list_of_locations:
    exp_level = get_avg_exp_per_level(job_data, country)
    print(type(exp_level))
    print(f"Average years of experience per experience level for {country}:")
    print(exp_level)

<class 'dict'>
Average years of experience per experience level for China:
{'EN': 0.43434343434343436, 'EX': 14.818181818181818, 'MI': 2.962566844919786, 'SE': 6.861386138613861}
<class 'dict'>
Average years of experience per experience level for Canada:
{'EN': 0.478494623655914, 'EX': 14.432291666666666, 'MI': 2.9789473684210526, 'SE': 7.124378109452737}
<class 'dict'>
Average years of experience per experience level for Switzerland:
{'EN': 0.4948453608247423, 'EX': 14.0, 'MI': 2.962962962962963, 'SE': 6.918918918918919}
<class 'dict'>
Average years of experience per experience level for India:
{'EN': 0.4603174603174603, 'EX': 14.776470588235295, 'MI': 3.1105527638190953, 'SE': 7.076530612244898}
<class 'dict'>
Average years of experience per experience level for France:
{'EN': 0.4521276595744681, 'EX': 14.410526315789474, 'MI': 3.01010101010101, 'SE': 6.9119170984455955}
<class 'dict'>
Average years of experience per experience level for Germany:
{'EN': 0.5161290322580645, 'EX': 14.2

In [48]:
    def get_num_industries(df):
        """
        Counts the number of unique industries in the job data.
        """
        if 'company_location' not in df.columns or 'industry' not in df.columns:
            return {}
        else:
            number_of_industries = df.groupby('company_location')['industry' ].size().to_dict()
            return number_of_industries

In [49]:
num_industries_per_country= get_num_industries(job_data)
print(type(num_industries_per_country))
print("Number of unique industries per country:")
print(num_industries_per_country)

<class 'dict'>
Number of unique industries per country:
{'Australia': 732, 'Austria': 765, 'Canada': 769, 'China': 763, 'Denmark': 778, 'Finland': 733, 'France': 769, 'Germany': 814, 'India': 754, 'Ireland': 750, 'Israel': 751, 'Japan': 733, 'Netherlands': 731, 'Norway': 721, 'Singapore': 764, 'South Korea': 722, 'Sweden': 752, 'Switzerland': 746, 'United Kingdom': 729, 'United States': 724}


In [56]:
    def get_benefit_score_range(df):
        """
        Calculates the range of benefit scores across all jobs.
        """
        if 'company_location' not in df.columns or 'benefits_score' not in df.columns:
            print("Required columns are missing in the DataFrame.")
            print("Available columns:", df.columns)
            return {}
        else:
            benefit_scores = df.groupby('company_location')['benefits_score'].agg(['min', 'max', 'mean'])
            return benefit_scores   

In [57]:

benefits = get_benefit_score_range(job_data)
print(type(benefits))
print(benefits)

<class 'pandas.core.frame.DataFrame'>
                  min   max      mean
company_location                     
Australia         5.0  10.0  7.518989
Austria           5.0  10.0  7.518562
Canada            5.0  10.0  7.571261
China             5.0  10.0  7.516252
Denmark           5.0  10.0  7.601542
Finland           5.0  10.0  7.541610
France            5.0  10.0  7.513004
Germany           5.0  10.0  7.486855
India             5.0  10.0  7.568302
Ireland           5.0  10.0  7.389867
Israel            5.0  10.0  7.527031
Japan             5.0  10.0  7.468759
Netherlands       5.0  10.0  7.514227
Norway            5.0  10.0  7.480999
Singapore         5.0  10.0  7.482984
South Korea       5.0  10.0  7.422022
Sweden            5.0  10.0  7.495878
Switzerland       5.0  10.0  7.450804
United Kingdom    5.0  10.0  7.440329
United States     5.0  10.0  7.568508
