In [None]:
PREDICTION 

In [14]:
# CELL 1: Imports & Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score, classification_report,
    r2_score, mean_absolute_error
)
import joblib
from packaging import version
import sklearn

In [15]:
df = pd.read_csv("jobs.csv")
print(df.shape)
df.head()


(99636, 25)


Unnamed: 0,Job Id,Qualifications,location,Country,latitude,longitude,Work Type,Company Size,Job Posting Date,Preference,...,Job Description,Benefits,skills,Responsibilities,Company,Company Profile,Min_Salary_K,Max_Salary_K,Min_Experience,Max_Experience
0,1017340000000000.0,BBA,Panama City,Panama,8.5379,-80.7821,Contract,93242,19/05/2022,Male,...,Promote diversity and inclusion in the supply ...,"{'Transportation Benefits, Professional Develo...",Supplier diversity programs Diversity and incl...,Promote supplier diversity initiatives and inc...,RWE AG,"{""Sector"":""Energy"",""Industry"":""Energy - Utilit...",55,84,5,10
1,2421050000000000.0,MBA,Tunis,Tunisia,33.8869,9.5375,Part-Time,18411,08/11/2021,Male,...,Architectural Drafters assist architects and e...,"{'Employee Assistance Programs (EAP), Tuition ...",Architectural drafting AutoCAD 2D and 3D model...,Prepare detailed architectural drawings and pl...,Asian Paints,"{""Sector"":""Consumer Goods"",""Industry"":""Paints ...",61,108,0,12
2,1822640000000000.0,M.Com,Harare,Zimbabwe,-19.0154,29.1549,Full-Time,120621,21/10/2021,Both,...,An Art Education Coordinator plans and manages...,"{'Employee Referral Programs, Financial Counse...",Art education curriculum Program development T...,"Coordinate art education programs, curriculum ...",Laboratory Corp. of America,"{""Sector"":""Healthcare Services"",""Industry"":""He...",57,82,0,11
3,3068000000000000.0,B.Com,Tirana,Albania,41.1533,20.1683,Temporary,128908,16/08/2023,Male,...,Environmental Impact Analysts assess the envir...,"{'Transportation Benefits, Professional Develo...",Environmental impact analysis Data collection ...,Assess the environmental impact of projects an...,Massachusetts Mutual Life Insurance,"{""Sector"":""Insurance"",""Industry"":""Insurance: L...",56,95,5,12
4,1747900000000000.0,BCA,City of Baghdad,Iraq,33.2232,43.6793,Temporary,114717,20/06/2023,Female,...,An Art Education Coordinator plans and manages...,"{'Employee Referral Programs, Financial Counse...",Art education curriculum Program development T...,"Coordinate art education programs, curriculum ...",Sartorius AG,"{""Sector"":""Lab Equipment"",""Industry"":""Life Sci...",58,122,4,13


In [16]:
# ==========================================
# CELL 2: Feature Engineering & Selection
# ==========================================

df["Avg_Salary"] = (df["Min_Salary_K"] + df["Max_Salary_K"]) / 2
df["Experience_Range"] = df["Max_Experience"] - df["Min_Experience"]

df = df.dropna(subset=["Role", "Avg_Salary"])

NUM_COLS = ["Company Size", "Min_Experience", "Max_Experience", "Experience_Range"]
CAT_COLS = ["Qualifications", "Country", "Work Type"]
TEXT_COL = "skills"

MODEL_COLS = NUM_COLS + CAT_COLS + [TEXT_COL, "Role", "Avg_Salary"]
df_model = df[MODEL_COLS].copy()

print("Numeric:", NUM_COLS)
print("Categorical:", CAT_COLS)
print("Text:", TEXT_COL)
print("Rows after filtering:", df_model.shape[0])

Numeric: ['Company Size', 'Min_Experience', 'Max_Experience', 'Experience_Range']
Categorical: ['Qualifications', 'Country', 'Work Type']
Text: skills
Rows after filtering: 99636


In [17]:
# ==========================================
# CELL 3: Preprocessing Pipeline (FIXED)
# ==========================================
def to_1d(x):
    """Convert input to 1D array of strings for TfidfVectorizer."""
    if x is None:
        return np.array([''], dtype=object)

    if isinstance(x, pd.DataFrame):
        x = x.iloc[:, 0]
    
    if isinstance(x, pd.Series):
        x = x.values

    x = np.asarray(x, dtype=object)

    if x.ndim > 1:
        x = x.ravel()

    if x.ndim == 0 or (isinstance(x, np.ndarray) and x.shape == ()):
        x = np.array([str(x.item() if hasattr(x, 'item') else x)], dtype=object)

    x = np.array([str(item) if item is not None else '' for item in x], dtype=object)
    
    return x


numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler(with_mean=False))  
])

if version.parse(sklearn.__version__) >= version.parse("1.2"):
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
else:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

text_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value='')),
    ("to1d", FunctionTransformer(to_1d, validate=False)),
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english'))
])


PREPROCESSOR = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, NUM_COLS),
        ("cat", categorical_pipe, CAT_COLS),
        ("txt", text_pipe, [TEXT_COL]),  
    ],
    remainder="drop",
    sparse_threshold=1.0
)

print("Preprocessor created successfully!")


Preprocessor created successfully!


In [None]:
# ==========================================
# CELL 4: Train Role Classification Model
# ==========================================

X_role = df_model[NUM_COLS + CAT_COLS + [TEXT_COL]]
y_role = df_model["Role"]

try:
    Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(
        X_role, y_role, test_size=0.2, random_state=42, stratify=y_role
    )
except ValueError:
    Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(
        X_role, y_role, test_size=0.2, random_state=42
    )

role_clf = Pipeline([
    ("pre", PREPROCESSOR),
    ("clf", LogisticRegression(max_iter=1000, solver="saga", multi_class="multinomial"))
])

role_clf.fit(Xr_tr, yr_tr)
yr_pred = role_clf.predict(Xr_te)

print("=== ROLE CLASSIFICATION ===")
print("Accuracy:", accuracy_score(yr_te, yr_pred))
print("\nClassification Report:")
print(classification_report(yr_te, yr_pred, zero_division=0))




In [6]:
# ==========================================
# CELL 5: Train Salary Regression Model
# ==========================================
X_sal = df_model[NUM_COLS + CAT_COLS + [TEXT_COL]]
y_sal = df_model["Avg_Salary"]

Xs_tr, Xs_te, ys_tr, ys_te = train_test_split(
    X_sal, y_sal, test_size=0.2, random_state=42
)

salary_reg = Pipeline([
    ("pre", PREPROCESSOR),
    ("reg", Ridge(alpha=1.0, random_state=42))
])

salary_reg.fit(Xs_tr, ys_tr)
ys_pred = salary_reg.predict(Xs_te)

print("\n=== SALARY REGRESSION ===")
print("R^2:", r2_score(ys_te, ys_pred))
print("MAE:", mean_absolute_error(ys_te, ys_pred))



=== SALARY REGRESSION ===
R^2: -0.006244066124583281
MAE: 6.454016013715419


In [None]:
# ==========================================
# CELL 6: Save Models
# ==========================================

joblib.dump(PREPROCESSOR, "preprocessor.pkl")
joblib.dump(salary_reg, "salary_regressor.pkl")
joblib.dump(role_clf, "role_classifier.pkl")

print("\n Models saved successfully!")
print("Files created:")
print("  - preprocessor.pkl")
print("  - salary_regressor.pkl")
print("  - role_classifier.pkl")


 Models saved successfully!
Files created:
  - preprocessor.pkl
  - salary_regressor.pkl
  - role_classifier.pkl


In [10]:
# ==========================================
# CELL 7: Load Models and Make Predictions
# ==========================================
try:
    role_clf_loaded = joblib.load("role_classifier.pkl")
    salary_reg_loaded = joblib.load("salary_regressor.pkl")
    print(" Models loaded successfully!\n")
except FileNotFoundError as e:
    print(f" Error: {e}")
    print("Please ensure the model files exist in the current directory.")
    raise


def predict_job_profile(new_data, role_model, salary_model):
    """
    Predict the ideal job role and corresponding salary for a new profile.

    Returns
    -------
    (predicted_role, role_confidence, predicted_salary)
    """
    # Convert input dictionary to one-row DataFrame
    row = pd.DataFrame([new_data])

    # Feature engineering
    row["Experience_Range"] = row.get("Max_Experience", 0) - row.get("Min_Experience", 0)

    # ---- Role prediction ----
    predicted_role = role_model.predict(row)[0]

    role_confidence = None
    if hasattr(role_model, "predict_proba"):
        try:
            probs = role_model.predict_proba(row)[0]
            role_confidence = float(np.max(probs))   # max probability between all classes
        except Exception:
            role_confidence = None

    # ---- Salary prediction ----
    predicted_salary = float(salary_model.predict(row)[0])

    return predicted_role, role_confidence, predicted_salary


 Models loaded successfully!



In [12]:
# ==========================================
# CELL 8: Example Predictions
# ==========================================

def pretty_print_prediction(title, profile, role, role_conf, salary):
    print("\n")
    print("=" * 60)
    print(title)
    print("=" * 60)
    print(f"Qualifications: {profile['Qualifications']}")
    print(f"Experience: {profile['Min_Experience']}-{profile['Max_Experience']} years")
    print(f"Country: {profile['Country']}")
    print(f"Work Type: {profile['Work Type']}")
    print(f"Company Size: {profile['Company Size']}")
    print(f"Skills: {profile['skills']}")
    print("\n" + "─" * 60)
    print("PREDICTION:")
    if role_conf is not None:
        print(f" Ideal Job Role: {role}  (confidence: {role_conf * 100:.2f}%)")
    else:
        print(f" Ideal Job Role: {role}")
    print(f" Predicted Avg Salary: ${salary:.2f}K")
    print("=" * 60)


# Example 1: MBA candidate with 3-6 years experience
example_profile_1 = {
    "Qualifications": "MBA",
    "Country": "United States",
    "Work Type": "Full-Time",
    "Company Size": 500,
    "Min_Experience": 3,
    "Max_Experience": 6,
    "skills": "python, sql, data analysis, project management, machine learning"
}

role_1, conf_1, salary_1 = predict_job_profile(
    example_profile_1, role_clf_loaded, salary_reg_loaded
)
pretty_print_prediction("EXAMPLE 1: MBA Candidate",
                        example_profile_1, role_1, conf_1, salary_1)


# Example 2: BCA graduate, entry level
example_profile_2 = {
    "Qualifications": "BCA",
    "Country": "India",
    "Work Type": "Full-Time",
    "Company Size": 1000,
    "Min_Experience": 0,
    "Max_Experience": 2,
    "skills": "java, javascript, html, css, react, nodejs"
}

role_2, conf_2, salary_2 = predict_job_profile(
    example_profile_2, role_clf_loaded, salary_reg_loaded
)
pretty_print_prediction("EXAMPLE 2: BCA Graduate (Entry Level)",
                        example_profile_2, role_2, conf_2, salary_2)


# Example 3: PhD in Data Science
example_profile_3 = {
    "Qualifications": "PhD",
    "Country": "United Kingdom",
    "Work Type": "Full-Time",
    "Company Size": 5000,
    "Min_Experience": 5,
    "Max_Experience": 10,
    "skills": "machine learning, deep learning, python, tensorflow, pytorch, nlp, computer vision, research"
}

role_3, conf_3, salary_3 = predict_job_profile(
    example_profile_3, role_clf_loaded, salary_reg_loaded
)
pretty_print_prediction("EXAMPLE 3: PhD in Data Science",
                        example_profile_3, role_3, conf_3, salary_3)




EXAMPLE 1: MBA Candidate
Qualifications: MBA
Experience: 3-6 years
Country: United States
Work Type: Full-Time
Company Size: 500
Skills: python, sql, data analysis, project management, machine learning

────────────────────────────────────────────────────────────
PREDICTION:
 Ideal Job Role: Machine Learning Engineer  (confidence: 3.53%)
 Predicted Avg Salary: $82.77K


EXAMPLE 2: BCA Graduate (Entry Level)
Qualifications: BCA
Experience: 0-2 years
Country: India
Work Type: Full-Time
Company Size: 1000
Skills: java, javascript, html, css, react, nodejs

────────────────────────────────────────────────────────────
PREDICTION:
 Ideal Job Role: Frontend Web Developer  (confidence: 9.51%)
 Predicted Avg Salary: $82.45K


EXAMPLE 3: PhD in Data Science
Qualifications: PhD
Experience: 5-10 years
Country: United Kingdom
Work Type: Full-Time
Company Size: 5000
Skills: machine learning, deep learning, python, tensorflow, pytorch, nlp, computer vision, research

───────────────────────────────

In [13]:
# ==========================================
# CELL 9: Test with Custom Input
# ==========================================

my_profile = {
    "Qualifications": "B.Tech",  
    "Country": "United States",   
    "Work Type": "Full-Time",     
    "Company Size": 2000,        
    "Min_Experience": 2,          
    "Max_Experience": 4,         
    "skills": "python, aws, docker, kubernetes, devops, ci/cd"  
}

my_role, my_conf, my_salary = predict_job_profile(
    my_profile, role_clf_loaded, salary_reg_loaded
)

print("\n")
print("=" * 60)
print("YOUR CUSTOM PROFILE")
print("=" * 60)
print(f"Qualifications: {my_profile['Qualifications']}")
print(f"Experience: {my_profile['Min_Experience']}-{my_profile['Max_Experience']} years")
print(f"Country: {my_profile['Country']}")
print(f"Work Type: {my_profile['Work Type']}")
print(f"Company Size: {my_profile['Company Size']}")
print(f"Skills: {my_profile['skills']}")
print("\n" + "─" * 60)
print("PREDICTION:")
if my_conf is not None:
    print(f" Ideal Job Role: {my_role}  (confidence: {my_conf * 100:.2f}%)")
else:
    print(f" Ideal Job Role: {my_role}")
print(f" Predicted Avg Salary: ${my_salary:.2f}K")
print("=" * 60)

print("\n All predictions completed successfully!")




YOUR CUSTOM PROFILE
Qualifications: B.Tech
Experience: 2-4 years
Country: United States
Work Type: Full-Time
Company Size: 2000
Skills: python, aws, docker, kubernetes, devops, ci/cd

────────────────────────────────────────────────────────────
PREDICTION:
 Ideal Job Role: DevOps Engineer  (confidence: 5.74%)
 Predicted Avg Salary: $82.38K

 All predictions completed successfully!
