# Canada Job Market (Python Only): EDA + ML Salary Prediction

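This notebook loads a 2,000-row synthetic dataset of Canadian job postings, performs exploratory data analysis (EDA) with three charts (average salary by province, top job titles by postings, and monthly posting counts), and trains a simple linear regression model that predicts salary from one-hot-encoded province, job title, and month.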

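**Files expected:** `job_postings_canada_large.csv` in the working directory, with the columns `job_id`, `province`, `job_title`, `month`, and `estimated_salary`. Each chart is also saved as a PNG file in the working directory.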


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Apply seaborn's 'whitegrid' style before any chart is drawn.
sns.set_style('whitegrid')
# Load the job-postings CSV; the relative path is resolved against the
# current working directory.
df = pd.read_csv('job_postings_canada_large.csv')
# Preview the first rows (presumably rendered as the cell output when run
# in a notebook -- as a plain script this expression has no visible effect).
df.head()

In [None]:
# Basic info: compute each summary first, then report them together
row_total = len(df)
column_names = df.columns.tolist()
province_counts = df['province'].value_counts()
leading_titles = df['job_title'].value_counts().head()
salary_summary = df['estimated_salary'].describe()

print('Rows:', row_total)
print('Columns:', column_names)
print('Provinces:\n', province_counts)
print('\nJob titles:\n', leading_titles)
# The header and the summary are separated by a newline, as two prints would be
print('\nSalary (CAD) summary:', salary_summary, sep='\n')

In [None]:
# Chart 1: mean estimated salary per province, highest first
avg_salary = df.groupby('province', as_index=False)['estimated_salary'].mean()
ranked_salary = avg_salary.sort_values('estimated_salary', ascending=False)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=ranked_salary, x='estimated_salary', y='province', ax=ax)
ax.set_title('Average Salary by Province (CAD)')
ax.set_xlabel('Average Salary (CAD)')
fig.tight_layout()
# Save before showing so the PNG is written from the fully drawn figure
fig.savefig('avg_salary_by_province.png')
plt.show()

In [None]:
# Chart 2: the ten job titles with the most postings
title_counts = df['job_title'].value_counts()
top_titles = title_counts.nlargest(10)

fig, ax = plt.subplots(figsize=(9, 5))
# Counts go on the x axis and titles on the y axis, giving horizontal bars
sns.barplot(x=top_titles.values, y=top_titles.index, ax=ax)
ax.set_title('Top 10 Job Titles by Postings')
ax.set_xlabel('Postings')
fig.tight_layout()
fig.savefig('top_job_titles.png')
plt.show()

In [None]:
# Monthly trend: number of non-null job_id values per month
monthly = (
    df.groupby('month')['job_id']
    .count()
    .rename('job_count')
    .reset_index()
    .sort_values('month')
)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(monthly['month'], monthly['job_count'], marker='o')
ax.set_title('Monthly Job Posting Trends')
ax.set_xlabel('Month')
ax.set_ylabel('Job Count')
# Tilt the month labels so adjacent ticks do not overlap
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('monthly_trends.png')
plt.show()

In [None]:
# ML: predict estimated_salary from three categorical features
categorical = ['province', 'job_title', 'month']
X = df[categorical]
y = df['estimated_salary']

# One-hot encode every feature; handle_unknown='ignore' lets prediction
# proceed on categories that were not seen during fitting
encoder = OneHotEncoder(handle_unknown='ignore')
preprocess = ColumnTransformer(transformers=[('cat', encoder, categorical)])
pipe = Pipeline(steps=[('prep', preprocess), ('model', LinearRegression())])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipe.fit(X_train, y_train)
r2 = pipe.score(X_test, y_test)
print(f'R^2 score on test set: {r2:.3f}')

# Example prediction: one record per hypothetical posting
sample = pd.DataFrame(
    [
        {'province': 'Ontario', 'job_title': 'Data Analyst', 'month': '2023-08'},
        {'province': 'BC', 'job_title': 'Data Scientist', 'month': '2023-10'},
    ]
)
pred = pipe.predict(sample)
for features, predicted in zip(sample.to_dict(orient='records'), pred):
    print(f'Predicted salary for {features}: ${predicted:,.0f}')
