In [1]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the raw AI job-market postings and preview the first rows.
DATA_PATH = "data/ai_job_market.csv"
df = pd.read_csv(DATA_PATH)
print("Rows and Columns:", df.shape)
df.head()

Rows and Columns: (2000, 12)


Unnamed: 0,job_id,company_name,industry,job_title,skills_required,experience_level,employment_type,location,salary_range_usd,posted_date,company_size,tools_preferred
0,1,Foster and Sons,Healthcare,Data Analyst,"NumPy, Reinforcement Learning, PyTorch, Scikit...",Mid,Full-time,"Tracybury, AR",92860-109598,2025-08-20,Large,"KDB+, LangChain"
1,2,"Boyd, Myers and Ramirez",Tech,Computer Vision Engineer,"Scikit-learn, CUDA, SQL, Pandas",Senior,Full-time,"Lake Scott, CU",78523-144875,2024-03-22,Large,"FastAPI, KDB+, TensorFlow"
2,3,King Inc,Tech,Quant Researcher,"MLflow, FastAPI, Azure, PyTorch, SQL, GCP",Entry,Full-time,"East Paige, CM",124496-217204,2025-09-18,Large,"BigQuery, PyTorch, Scikit-learn"
3,4,"Cooper, Archer and Lynch",Tech,AI Product Manager,"Scikit-learn, C++, Pandas, LangChain, AWS, R",Mid,Full-time,"Perezview, FI",50908-123743,2024-05-08,Large,"TensorFlow, BigQuery, MLflow"
4,5,Hall LLC,Finance,Data Scientist,"Excel, Keras, SQL, Hugging Face",Senior,Contract,"North Desireeland, NE",98694-135413,2025-02-24,Large,"PyTorch, LangChain"


In [None]:
# ***************************************************************************************
# *    Title: AI Job Market EDA and Salary Prediction
# *    Author: Hammad Farooq
# *    Date: 2025
# *    Availability: https://www.kaggle.com/code/hammadfarooq470/ai-job-market-eda-and-salary-prediction
# *    Usage: Used data cleaning and processing functions to convert dataframe into usable format for the project
# *
# ***************************************************************************************
# Data Cleaning and Processing
def parse_salary_mean(s):
    """Return the midpoint of a salary range such as ``"92860-109598"``.

    Parameters
    ----------
    s : str or number
        Raw salary text. Currency symbols, whitespace, thousands separators
        (``"92,860"``) and decimal parts (``"92860.50"``) are tolerated. A
        plain number is treated as a single salary value.

    Returns
    -------
    float
        Mean of the first two numbers found, the number itself when only one
        is present, or ``np.nan`` when the value is missing or has no digits.
    """
    if pd.isna(s):
        return np.nan
    # Match a comma-grouped number ("92,860") as ONE token before falling back
    # to a plain digit run: a bare r'\d+' would split it into "92" and "860"
    # and report a midpoint of 476. The (?!\d) guard rejects malformed groups
    # like "92,8605", and the optional fraction keeps "92860.50" from being
    # read as two separate numbers.
    tokens = re.findall(
        r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?',
        str(s),  # str() lets plain int/float inputs through instead of raising
    )
    # Only the first two numbers describe the range; anything after is ignored.
    values = [float(tok.replace(',', '')) for tok in tokens[:2]]
    if not values:
        return np.nan
    return sum(values) / len(values)

# Unparseable dates become NaT instead of raising.
df['posted_date'] = pd.to_datetime(df['posted_date'], errors='coerce')
# Numeric target: midpoint of the textual salary range.
df['salary_mean'] = df['salary_range_usd'].astype(str).map(parse_salary_mean)
# A row without a usable salary cannot serve as a training target, so it is
# dropped and the index renumbered.
df = df.dropna(subset=['salary_mean']).reset_index(drop=True)

# Feature Engineering
# Extract country (last token after comma), city (first token before comma)
def split_location(loc):
    """Split a ``"City, Country"`` style string into ``(city, country)``.

    Parameters
    ----------
    loc : str
        Comma-separated location text. Missing values are accepted.

    Returns
    -------
    tuple
        ``(first token, last token)`` with surrounding whitespace removed.
        The second element is ``np.nan`` when there is no comma. A blank
        token is reported as ``np.nan`` rather than ``""`` so that a later
        ``fillna`` treats it as missing.
    """
    if pd.isna(loc):
        return (np.nan, np.nan)
    # str() keeps a stray non-string value from raising on .split()
    tokens = [tok.strip() for tok in str(loc).split(',')]
    city = tokens[0] if tokens[0] else np.nan
    # With a single token there is nothing after a comma to call the country.
    country = tokens[-1] if len(tokens) > 1 and tokens[-1] else np.nan
    return (city, country)

# Split the location into city / country columns. Building one frame from the
# list of tuples avoids constructing a pd.Series per row, which is what a
# row-wise apply(lambda x: pd.Series(...)) does.
location_parts = pd.DataFrame(
    df['location'].map(split_location).tolist(),
    index=df.index,
    columns=['city', 'country'],
)
df[['city', 'country']] = location_parts
# number of skills listed: count the non-blank entries separated by ',' or ';'
df['num_skills'] = (
    df['skills_required']
    .fillna("")
    .map(lambda s: sum(1 for t in re.split(r',|;', s) if t.strip()))
)
# posted date features; an unparseable date (NaT) gives NaN for the year and 0 for the month
df['posted_year'] = df['posted_date'].dt.year
df['posted_month'] = df['posted_date'].dt.month.fillna(0).astype(int)
# normalise label casing; missing values get the explicit label 'Other'
df['experience_level'] = df['experience_level'].fillna('Other').str.title()
df['employment_type'] = df['employment_type'].fillna('Other').str.title()

# Preparing data for modeling
# Model inputs: experience_level, industry, employment_type, country, num_skills,
# posted_year, posted_month, plus TF-IDF features from the 'skills_required'
# and 'job_title' text fields.

# Categorical fields: a missing value gets the explicit label 'Unknown'.
for cat_field in ('experience_level', 'industry', 'employment_type', 'country'):
    df[cat_field] = df[cat_field].fillna('Unknown')

# Text fields: a missing value becomes the empty string.
for text_field in ('skills_required', 'job_title'):
    df[text_field] = df[text_field].fillna("")

# TF-IDF features for the two free-text columns.
# A token starts with a word character and may continue with word characters,
# '+', '-' or '.'; both unigrams and bigrams are kept.
tfidf_options = dict(token_pattern=r"(?u)\b\w[\w\+\-\.]*\b", ngram_range=(1,2))
skills_tfidf = TfidfVectorizer(max_features=800, **tfidf_options)
title_tfidf = TfidfVectorizer(max_features=400, **tfidf_options)

# NOTE(review): both vectorizers are fitted on every row of df, and the
# train/test split only happens later in the notebook, so the vocabulary and
# IDF weights are learned from rows that end up in the test set — TODO confirm
# this is acceptable for the evaluation.
X_title = title_tfidf.fit_transform(df['job_title'])
X_skills = skills_tfidf.fit_transform(df['skills_required'])

# One-hot encode the categorical fields. Each field keeps only its `topk` most
# frequent values; everything else is folded into 'Other' to limit the number
# of columns.
cat_cols = ['experience_level','industry','employment_type','country']
topk=20
ohe_columns = {}
reduced_cols = []
for col in cat_cols:
    reduced_name = col+'_reduced'
    top = df[col].value_counts().nlargest(topk).index.tolist()
    # keep a value when it is one of the frequent ones, otherwise relabel it
    df[reduced_name] = df[col].where(df[col].isin(top), 'Other')
    ohe_columns[reduced_name] = top + ['Other']
    reduced_cols.append(reduced_name)

ohe = OneHotEncoder(handle_unknown='ignore')
X_cat = ohe.fit_transform(df[reduced_cols])

# Numeric features; any remaining missing value is replaced by 0.
X_num = df[['num_skills','posted_year','posted_month']].fillna(0).values

# Wrap the dense numeric block as CSR so it can be stacked column-wise with the
# sparse blocks. Column layout of X_full: numeric | one-hot | skills TF-IDF | title TF-IDF.
# (`sparse` is imported with the other dependencies in the notebook's import cell.)
X_num_sparse = sparse.csr_matrix(X_num)
X_full = hstack([X_num_sparse, X_cat, X_skills, X_title]).tocsr()
# Regression target: midpoint of the advertised salary range.
y = df['salary_mean'].values

print("Final matrix shape:", X_full.shape)

#End Code being referenced


Final matrix shape: (2000, 551)


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=67,
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (1600, 551) Test shape: (400, 551)


Plan
1. Vanilla Linear Regression 
2. Lasso and Ridge Regression 
3. Pure Decision Tree
4. Random Forest

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 31526 stored elements and shape (1600, 551)>