In [1]:
import pandas as pd
import altair as alt

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [6]:
df = pd.read_csv('ai_job_market.csv')

In [7]:
print(df.head())

   job_id              company_name    industry                 job_title  \
0       1           Foster and Sons  Healthcare              Data Analyst   
1       2   Boyd, Myers and Ramirez        Tech  Computer Vision Engineer   
2       3                  King Inc        Tech          Quant Researcher   
3       4  Cooper, Archer and Lynch        Tech        AI Product Manager   
4       5                  Hall LLC     Finance            Data Scientist   

                                     skills_required experience_level  \
0  NumPy, Reinforcement Learning, PyTorch, Scikit...              Mid   
1                    Scikit-learn, CUDA, SQL, Pandas           Senior   
2          MLflow, FastAPI, Azure, PyTorch, SQL, GCP            Entry   
3       Scikit-learn, C++, Pandas, LangChain, AWS, R              Mid   
4                    Excel, Keras, SQL, Hugging Face           Senior   

  employment_type               location salary_range_usd posted_date  \
0       Full-time        

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_id            2000 non-null   int64 
 1   company_name      2000 non-null   object
 2   industry          2000 non-null   object
 3   job_title         2000 non-null   object
 4   skills_required   2000 non-null   object
 5   experience_level  2000 non-null   object
 6   employment_type   2000 non-null   object
 7   location          2000 non-null   object
 8   salary_range_usd  2000 non-null   object
 9   posted_date       2000 non-null   object
 10  company_size      2000 non-null   object
 11  tools_preferred   2000 non-null   object
dtypes: int64(1), object(11)
memory usage: 187.6+ KB


In [9]:
#Czyszczenie danych (pensje i daty)

In [10]:
# Rozdzielenie kolumny 'salary_range_usd' na dwie nowe, przez '-'

In [11]:
df[['min_salary_usd', 'max_salary_usd']] = df['salary_range_usd'].str.split('-', expand=True)

In [12]:
df['min_salary_usd'] = pd.to_numeric(df['min_salary_usd'])
df['max_salary_usd'] = pd.to_numeric(df['max_salary_usd'])

In [13]:
df['avg_salary_usd'] = (df['min_salary_usd'] + df['max_salary_usd']) / 2

In [14]:
df['posted_date'] = pd.to_datetime(df['posted_date'])

In [15]:
df['posted_year'] = df['posted_date'].dt.year

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_id            2000 non-null   int64         
 1   company_name      2000 non-null   object        
 2   industry          2000 non-null   object        
 3   job_title         2000 non-null   object        
 4   skills_required   2000 non-null   object        
 5   experience_level  2000 non-null   object        
 6   employment_type   2000 non-null   object        
 7   location          2000 non-null   object        
 8   salary_range_usd  2000 non-null   object        
 9   posted_date       2000 non-null   datetime64[ns]
 10  company_size      2000 non-null   object        
 11  tools_preferred   2000 non-null   object        
 12  min_salary_usd    2000 non-null   int64         
 13  max_salary_usd    2000 non-null   int64         
 14  avg_salary_usd    2000 n

In [17]:
print(df.head())

   job_id              company_name    industry                 job_title  \
0       1           Foster and Sons  Healthcare              Data Analyst   
1       2   Boyd, Myers and Ramirez        Tech  Computer Vision Engineer   
2       3                  King Inc        Tech          Quant Researcher   
3       4  Cooper, Archer and Lynch        Tech        AI Product Manager   
4       5                  Hall LLC     Finance            Data Scientist   

                                     skills_required experience_level  \
0  NumPy, Reinforcement Learning, PyTorch, Scikit...              Mid   
1                    Scikit-learn, CUDA, SQL, Pandas           Senior   
2          MLflow, FastAPI, Azure, PyTorch, SQL, GCP            Entry   
3       Scikit-learn, C++, Pandas, LangChain, AWS, R              Mid   
4                    Excel, Keras, SQL, Hugging Face           Senior   

  employment_type               location salary_range_usd posted_date  \
0       Full-time        

In [18]:
#Analiza wymaganych umiejętności

In [19]:
skills_to_check = [
    'Python', 'SQL', 'R', 'C++',
    'TensorFlow', 'PyTorch', 'Keras', 'Scikit-learn', 'Hugging Face',
    'AWS', 'Azure', 'GCP',
    'Pandas', 'NumPy', 'MLflow', 'FastAPI', 'Flask',
    'Excel', 'Power BI', 'Tableau'
]

In [22]:
print(f" skan opisów w odkryciu {len(skills_to_check)} umiejętności...")

 skan opisów w odkryciu 20 umiejętności...


In [23]:
#słownik
skill_counts = {}

In [24]:
df['skills_required'] = df['skills_required'].astype(str)

In [25]:
for skill in skills_to_check:
    count = df['skills_required'].str.contains(skill, case=False).sum()
    skill_counts[skill] = count

In [26]:
# konwercja słownika na Df, aby łatwo go posortować i zwizualizować
skills_df = pd.DataFrame(list(skill_counts.items()), columns=['Skill', 'Count'])

In [27]:
skills_df = skills_df.sort_values(by='Count', ascending=False) #sort 

In [28]:
print(skills_df)

           Skill  Count
3            C++   1816
2              R   1769
4     TensorFlow    452
17         Excel    432
12        Pandas    427
15       FastAPI    419
13         NumPy    416
10         Azure    413
8   Hugging Face    408
1            SQL    408
6          Keras    406
18      Power BI    404
11           GCP    404
9            AWS    404
0         Python    402
5        PyTorch    401
7   Scikit-learn    400
16         Flask    398
14        MLflow    389
19       Tableau      0


In [30]:
# Wykres popularność umiejętności (słupkowy)

In [33]:
chart_skills = alt.Chart(skills_df).mark_bar().encode(
    x=alt.X('Skill', title='Umiejętność', sort='-y'),
    y=alt.Y('Count', title='Liczba Ofert Pracy'),
    tooltip=[
        alt.Tooltip('Skill', title='Umiejętność'),
        alt.Tooltip('Count', title='Liczba Ofert')
    ]
).properties(
    width=800,
    title='Najbardziej Wymagane Umiejętności w Ofertach Pracy AI'
).interactive()

In [34]:
chart_skills.save('top_ai_skills.html')

In [35]:
#zarobki vs stanowisko

In [36]:
top_10_jobs = df['job_title'].value_counts().head(10).index.tolist()

In [37]:
print(f"10 najpopularniejszych stanowisk: {top_10_jobs}")

10 najpopularniejszych stanowisk: ['Data Analyst', 'NLP Engineer', 'AI Product Manager', 'Quant Researcher', 'ML Engineer', 'Data Scientist', 'AI Researcher', 'Computer Vision Engineer']


In [38]:
#filtr DataFrame, aby miał tylko 10 stanowisk
df_top10_jobs = df[df['job_title'].isin(top_10_jobs)]

In [42]:
# Wykres rozkład pensji dla Top 10 stanowisk boxplot
chart_salary_job_title = alt.Chart(df_top10_jobs).mark_boxplot().encode(
    # Sortujemy słupki od największej mediany pensji
    x=alt.X('job_title', title='Stanowisko', sort=alt.EncodingSortField(field="avg_salary_usd", op="median", order='descending')),
    y=alt.Y('avg_salary_usd', title='Średnia Pensja (USD)'),
    tooltip=[
        alt.Tooltip('job_title', title='Stanowisko'),
        alt.Tooltip('avg_salary_usd', title='Śr. Pensja (USD)', format=',.0f')
    ]
).properties(
    width=800,
    title='Rozkład średnich pensji dla Top 10 stanowisk'
).interactive()

In [43]:
chart_salary_job_title.save('salary_vs_job_title.html')

In [44]:
#analiza zarobków vs doswiadczenia

In [45]:
#wykres boxplot

In [48]:
level_order = ['Entry', 'Mid', 'Senior']

In [50]:
chart_salary_level = alt.Chart(df).mark_boxplot().encode(
    x=alt.X('experience_level', title='Poziom Doświadczenia', sort=level_order),
    y=alt.Y('avg_salary_usd', title='Średnia Pensja (USD)'),
    tooltip=[
        alt.Tooltip('experience_level', title='Poziom Doświadczenia'),
        alt.Tooltip('avg_salary_usd', title='Śr. Pensja (USD)', format=',.0f')
    ]
).properties(
    width=600,
    title='Rozkład średnich pensji wg poziomu doświadczenia'
).interactive()

chart_salary_level.save('salary_vs_experience.html')

In [51]:
#analiza zarobków vs wielkość firmy

In [52]:
size_order = ['Small', 'Medium', 'Large']

In [53]:
median_salary_by_size = df.groupby('company_size')['avg_salary_usd'].median().reset_index()

In [54]:
print(median_salary_by_size.sort_values(by='avg_salary_usd', ascending=False))

  company_size  avg_salary_usd
0        Large        125425.0
2      Startup        123675.0
1          Mid        120550.0


In [62]:
chart_salary_size = alt.Chart(median_salary_by_size).mark_bar().encode(
    x=alt.X('company_size', title='Wielkość Firmy', sort=size_order, scale=alt.Scale(padding=0.5)),
    y=alt.Y('avg_salary_usd', title='Mediana Średnich Pensji (USD)'),
    tooltip=[
        alt.Tooltip('company_size', title='Wielkość Firmy'),
        alt.Tooltip('avg_salary_usd', title='Mediana Pensji', format=',.0f')
    ]
).properties(
    width=500,
    title='Mediana pensji wg wielkości firmy'
).interactive()

chart_salary_size.save('salary_vs_company_size.html')

In [63]:
# analiza branż


In [64]:
industry_counts = df['industry'].value_counts().head(10).reset_index()

In [65]:
industry_counts.columns = ['industry', 'count']

In [67]:
print(industry_counts)

     industry  count
0  Automotive    300
1   Education    294
2      Retail    293
3  E-commerce    291
4     Finance    279
5        Tech    274
6  Healthcare    269


In [68]:
top_10_industry_list = industry_counts['industry'].tolist()

In [71]:
df_top_industries = df[df['industry'].isin(top_10_industry_list)]

In [74]:
# Wykres liczba ofert wg branży (słupkowy)
chart_industry_count = alt.Chart(industry_counts).mark_bar().encode(
    x=alt.X('industry', title='Branża', sort='-y'),
    y=alt.Y('count', title='Liczba Ofert Pracy'),
    tooltip=[
        alt.Tooltip('industry', title='Branża'),
        alt.Tooltip('count', title='Liczba Ofert')
    ]
).properties(
    width=600,
    title='Top 10 Branż (wg liczby ofert AI)'
).interactive()

In [75]:
chart_industry_count.save('top_10_industries_by_count.html')

In [77]:
median_salary_by_industry = df_top_industries.groupby('industry')['avg_salary_usd'].median().reset_index()

In [83]:
# Wykres  mediana pensji wg branży (słupkowy)
chart_industry_salary = alt.Chart(median_salary_by_industry).mark_bar(color='green',size=90).encode(
    # Sortujemy wg mediany pensji
    x=alt.X('industry', title='Branża', sort=alt.SortField(field="avg_salary_usd", order='descending')),
    y=alt.Y('avg_salary_usd', title='Mediana Średnich Pensji (USD)'),
    tooltip=[
        alt.Tooltip('industry', title='Branża'),
        alt.Tooltip('avg_salary_usd', title='Mediana Pensji', format=',.0f')
    ]
).properties(
    width=800,
    title='Mediana pensji w Top 10 branżach'
).interactive()

In [84]:
chart_industry_salary.save('salary_by_top_10_industry.html')