<a href="https://colab.research.google.com/github/KeertahanaKV/Global-AI-Job-Market-Salary-Trends/blob/main/carrer_path.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile

from google.colab import files

# Ask for the dataset archive (archive.zip) through the Colab upload widget.
uploaded = files.upload()

# The first key of the upload result is used as the archive path, and its
# contents are unpacked into a local working folder.
extract_dir = "extracted"
zip_file = list(uploaded)[0]

with zipfile.ZipFile(zip_file, mode='r') as archive:
    archive.extractall(path=extract_dir)


Saving archive.zip to archive.zip


In [None]:
import pandas as pd

# Path of the dataset CSV inside the folder the archive was extracted to.
csv_path = os.path.join(extract_dir, "ai_job_dataset.csv")

# Read the whole CSV into the working DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer=csv_path)


In [None]:
def group_job_titles(title):
    """Map a raw job title to a coarse career-path label.

    The title is lower-cased and checked against a fixed list of keywords;
    the first keyword found decides the label. Order matters: a title that
    contains several keywords gets the label of the earliest rule, so e.g.
    "Research Scientist" becomes "Data Scientist" and "ML Ops Engineer"
    becomes "AI/ML Engineer" (the "engineer" rule precedes the "ops" rule).

    Args:
        title: The job title. Non-string values (such as the NaN pandas uses
            for a missing cell, or None) are accepted and treated as having
            no recognisable keyword.

    Returns:
        One of "Data Scientist", "Data Analyst", "AI/ML Engineer",
        "Product Manager", "AI Research", "MLOps Engineer" or "Other".
    """
    # A missing title is a float NaN (or None), which has no .lower();
    # classify it as "Other" instead of raising AttributeError.
    if not isinstance(title, str):
        return "Other"

    lowered = title.lower()

    # (keyword, label) pairs, checked in order; first match wins.
    keyword_rules = (
        ("scientist", "Data Scientist"),
        ("analyst", "Data Analyst"),
        ("engineer", "AI/ML Engineer"),
        ("manager", "Product Manager"),
        ("research", "AI Research"),
        # "ops" is a substring of "mlops", so this one check covers both.
        ("ops", "MLOps Engineer"),
    )
    for keyword, career_path in keyword_rules:
        if keyword in lowered:
            return career_path

    return "Other"

# Derive the coarse career-path label for every row from its job title.
df['career_path'] = df['job_title'].map(group_job_titles)


In [None]:
# Career paths kept for modelling; rows with any other label are dropped.
top_roles = [
    'Data Scientist',
    'Data Analyst',
    'AI/ML Engineer',
    'Product Manager',
]

# Boolean mask of rows whose career path is in the list, then an explicit
# copy so the filtered frame is independent of the original.
is_top_role = df['career_path'].isin(top_roles)
df = df.loc[is_top_role].copy()


In [None]:
# Keep only the target column and the four feature columns used by the model.
selected_columns = [
    'career_path',
    'salary_usd',
    'years_experience',
    'company_size',
    'education_required',
]

# Drop incomplete rows with a non-inplace dropna(): calling
# dropna(inplace=True) on a frame produced by column selection is what
# raises pandas' SettingWithCopyWarning.
df = df[selected_columns].dropna()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score

# Turn the career-path strings into integer class ids; the fitted encoder is
# kept so predictions can be translated back to names later.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['career_path'])

# Target is the encoded label; features are every column except the raw and
# encoded target.
y = df['label']
X = df.drop(['career_path', 'label'], axis=1)

# Object-dtype feature columns are the ones that get one-hot encoded.
categorical_cols = list(X.select_dtypes(include='object').columns)

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at predict time) and pass every other column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)

# Preprocessing followed by a gradient-boosting classifier with a fixed seed.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

# Hold out 20% of the rows for evaluation and fit on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Hard predictions feed top-1 accuracy; class probabilities feed top-3 accuracy.
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)

top1_accuracy = accuracy_score(y_test, y_pred)
# Pass the fitted class order explicitly: without `labels`, the scorer infers
# the classes from y_test and raises if the held-out split happens to be
# missing one of the classes that y_proba has a column for.
top3_accuracy = top_k_accuracy_score(
    y_test, y_proba, k=3, labels=pipeline.classes_
)

print("🎯 Top-1 Accuracy:", round(top1_accuracy * 100, 2), "%")
print("✅ Top-3 Career Path Accuracy:", round(top3_accuracy * 100, 2), "%")


🎯 Top-1 Accuracy: 60.27 %
✅ Top-3 Career Path Accuracy: 93.27 %


In [None]:
# A single hypothetical profile, with the same feature columns (in the same
# order) that the pipeline was trained on.
sample_input = pd.DataFrame([{
    'salary_usd': 95000,
    'years_experience': 4,
    'company_size': 'M',
    # NOTE(review): confirm this string matches a category present in the
    # training data. The encoder uses handle_unknown='ignore', so an unseen
    # value is silently encoded as all zeros rather than raising.
    'education_required': "Master's Degree"
}])

# Class probabilities for the one sample row.
probs = pipeline.predict_proba(sample_input)[0]

# Column positions of the three highest probabilities, best first.
top_3_indices = probs.argsort()[-3:][::-1]

# predict_proba columns follow pipeline.classes_, so translate positions to
# encoded labels through it instead of assuming position == label.
top_3_roles = label_encoder.inverse_transform(pipeline.classes_[top_3_indices])

print("\n🔍 Suggested Career Paths for You:")
for rank, (role, confidence) in enumerate(zip(top_3_roles, probs[top_3_indices]), 1):
    print(f"{rank}. {role} (Confidence: {confidence * 100:.2f}%)")



🔍 Suggested Career Paths for You:
1. AI/ML Engineer (Confidence: 58.64%)
2. Data Scientist (Confidence: 25.59%)
3. Data Analyst (Confidence: 8.48%)
