In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [85]:
# Load the job-market CSV (path is relative to the current working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='ai_job_market_insights.csv')
df.head(n=5)

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,High,UX/UI Design,111392.165243,Yes,Growth
1,Marketing Specialist,Technology,Large,Singapore,Medium,High,Marketing,93792.562466,No,Decline
2,AI Researcher,Technology,Large,Singapore,Medium,High,UX/UI Design,107170.263069,Yes,Growth
3,Sales Manager,Retail,Small,Berlin,Low,High,Project Management,93027.953758,No,Growth
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,Low,JavaScript,87752.922171,Yes,Decline


In [87]:
# Preview the last five rows to confirm the file was read to the end.
df.tail(n=5)

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection
495,Data Scientist,Telecommunications,Medium,Berlin,Low,Medium,Machine Learning,105821.394046,Yes,Stable
496,Cybersecurity Analyst,Telecommunications,Small,London,Low,High,UX/UI Design,119794.992146,No,Decline
497,Cybersecurity Analyst,Energy,Large,Dubai,High,Low,UX/UI Design,79644.933099,Yes,Stable
498,Operations Manager,Healthcare,Large,Paris,High,Low,Python,77642.150625,Yes,Stable
499,HR Manager,Entertainment,Medium,Berlin,Medium,High,Project Management,68764.378921,Yes,Decline


In [89]:
# Summary statistics for the numeric columns (Salary_USD is the only one here).
df.describe()

Unnamed: 0,Salary_USD
count,500.0
mean,91222.390974
std,20504.291453
min,31969.526346
25%,78511.514863
50%,91998.195286
75%,103971.282092
max,155209.821614


In [91]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Job_Title                0
Industry                 0
Company_Size             0
Location                 0
AI_Adoption_Level        0
Automation_Risk          0
Required_Skills          0
Salary_USD               0
Remote_Friendly          0
Job_Growth_Projection    0
dtype: int64

In [93]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Job_Title              500 non-null    object 
 1   Industry               500 non-null    object 
 2   Company_Size           500 non-null    object 
 3   Location               500 non-null    object 
 4   AI_Adoption_Level      500 non-null    object 
 5   Automation_Risk        500 non-null    object 
 6   Required_Skills        500 non-null    object 
 7   Salary_USD             500 non-null    float64
 8   Remote_Friendly        500 non-null    object 
 9   Job_Growth_Projection  500 non-null    object 
dtypes: float64(1), object(9)
memory usage: 39.2+ KB


In [95]:
# Separate the regression target (salary) from the remaining feature columns.
y = df['Salary_USD']
X = df.drop(labels=['Salary_USD'], axis='columns')

In [97]:
# Display the feature frame (pandas elides the middle rows in the rendered output).
X

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Remote_Friendly,Job_Growth_Projection
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,High,UX/UI Design,Yes,Growth
1,Marketing Specialist,Technology,Large,Singapore,Medium,High,Marketing,No,Decline
2,AI Researcher,Technology,Large,Singapore,Medium,High,UX/UI Design,Yes,Growth
3,Sales Manager,Retail,Small,Berlin,Low,High,Project Management,No,Growth
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,Low,JavaScript,Yes,Decline
...,...,...,...,...,...,...,...,...,...
495,Data Scientist,Telecommunications,Medium,Berlin,Low,Medium,Machine Learning,Yes,Stable
496,Cybersecurity Analyst,Telecommunications,Small,London,Low,High,UX/UI Design,No,Decline
497,Cybersecurity Analyst,Energy,Large,Dubai,High,Low,UX/UI Design,Yes,Stable
498,Operations Manager,Healthcare,Large,Paris,High,Low,Python,Yes,Stable


In [99]:
# Display the target Series (Salary_USD).
y

0      111392.165243
1       93792.562466
2      107170.263069
3       93027.953758
4       87752.922171
           ...      
495    105821.394046
496    119794.992146
497     79644.933099
498     77642.150625
499     68764.378921
Name: Salary_USD, Length: 500, dtype: float64

In [101]:
# Treat every object-dtype feature column as categorical; these are the columns
# handed to the one-hot encoder later on.
categorical_columns = X.select_dtypes(include='object').columns
categorical_columns

Index(['Job_Title', 'Industry', 'Company_Size', 'Location',
       'AI_Adoption_Level', 'Automation_Risk', 'Required_Skills',
       'Remote_Friendly', 'Job_Growth_Projection'],
      dtype='object')

In [113]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns. handle_unknown='ignore' makes categories
# that were not seen during fit encode as all zeros instead of raising at predict
# time. Columns not listed are passed through unchanged.
preprocessor = ColumnTransformer(
    [('category', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough',
)

In [115]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Chain the encoding step and a linear regressor so that both are fitted
# together through a single fit/predict interface.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])


In [117]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [132]:
# Plain-text dump of the training features followed by the test features.
print(X_train, X_test, sep='\n')

                 Job_Title            Industry Company_Size       Location  \
249         Data Scientist       Manufacturing        Small         Sydney   
433             HR Manager          Technology       Medium         Berlin   
19       Software Engineer       Entertainment        Small          Dubai   
322      Software Engineer       Manufacturing        Small          Tokyo   
332            UX Designer       Manufacturing       Medium          Paris   
..                     ...                 ...          ...            ...   
106             HR Manager  Telecommunications       Medium  San Francisco   
270   Marketing Specialist  Telecommunications        Large         Berlin   
348      Software Engineer              Energy        Small          Dubai   
435         Data Scientist          Technology        Small         London   
102  Cybersecurity Analyst           Education        Small         London   

    AI_Adoption_Level Automation_Risk     Required_Skills Remot

In [134]:
# Plain-text dump of the training targets followed by the test targets.
print(y_train, y_test, sep='\n')

249    73518.823169
433    69869.734146
19     73920.458527
322    93294.875749
332    97763.405153
           ...     
106    67257.172389
270    84511.199458
348    87667.401352
435    86925.248553
102    76583.279265
Name: Salary_USD, Length: 400, dtype: float64
361    140475.995812
73      75497.338130
374    118489.240263
155     70100.002136
104    105366.858990
           ...      
347     68665.577823
86      58952.450768
75      73532.815908
438    106177.858402
15      82799.357707
Name: Salary_USD, Length: 100, dtype: float64


In [123]:
# Sanity-check the split: each feature frame must have as many rows as its target.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(400, 9)
(400,)
(100, 9)
(100,)


In [125]:

# Fit the encoder and the regressor on the training split only, so the test
# rows stay unseen until evaluation.
pipeline.fit(X=X_train, y=y_train)


In [127]:
# Predict salaries for the held-out rows using the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [129]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out predictions. R^2 is measured against a constant model that
# always predicts the mean of y_test: 0 means "no better than the mean", and a
# negative value means the fitted model does worse than that baseline.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')


Mean Squared Error: 526976021.94549596
R^2 Score: -0.018820615596098378
