In [7]:
import pandas as pd

# Load dataset (replace with your file path or URL)
df = pd.read_csv("postings.csv")

# View first 5 rows
# print("First 5 rows:")
# print(df.head())

# View summary statistics
print("\nDataset Description:")
print(df.describe())

# Optional: Basic info about columns and datatypes
print("\nDataset Info:")
print(df.info())



Dataset Description:
             job_id    max_salary    company_id          views     med_salary  \
count  1.238490e+05  2.979300e+04  1.221320e+05  122160.000000    6280.000000   
mean   3.896402e+09  9.193942e+04  1.220401e+07      14.618247   22015.619876   
std    8.404355e+07  7.011101e+05  2.554143e+07      85.903598   52255.873846   
min    9.217160e+05  1.000000e+00  1.009000e+03       1.000000       0.000000   
25%    3.894587e+09  4.828000e+01  1.435200e+04       3.000000      18.940000   
50%    3.901998e+09  8.000000e+04  2.269650e+05       4.000000      25.500000   
75%    3.904707e+09  1.400000e+05  8.047188e+06       8.000000    2510.500000   
max    3.906267e+09  1.200000e+08  1.034730e+08    9975.000000  750000.000000   

         min_salary       applies  original_listed_time  remote_allowed  \
count  2.979300e+04  23320.000000          1.238490e+05         15246.0   
mean   6.491085e+04     10.591981          1.713152e+12             1.0   
std    4.959738e+05    

In [17]:
# =========================================
# JOB DEMAND + SALARY + REMOTE PREDICTOR
# =========================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import joblib

# =========================
# 1️⃣ LOAD DATA
# =========================
df = pd.read_csv("postings.csv")

# Drop irrelevant columns
drop_cols = ["description", "job_description", "expiry", "closed_time", "listed_time"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

# Keep only what we need
df = df[["title", "views", "med_salary", "remote_allowed"]].dropna()

# =========================
# 2️⃣ FEATURE EXTRACTION
# =========================
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df["title"])
joblib.dump(vectorizer, "vectorizer.pkl")

# Target variables
y_demand = df["views"]               # regression
y_salary = df["med_salary"]          # regression
y_remote = df["remote_allowed"]      # classification

# Train/test split
Xd_train, Xd_test, yd_train, yd_test = train_test_split(X, y_demand, test_size=0.2, random_state=42)
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X, y_salary, test_size=0.2, random_state=42)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X, y_remote, test_size=0.2, random_state=42)

# =========================
# 3️⃣ TRAIN MODELS
# =========================
print("Training job demand model...")
model_demand = RandomForestRegressor(n_estimators=100, random_state=42)
model_demand.fit(Xd_train, yd_train)

print("Training salary model...")
model_salary = RandomForestRegressor(n_estimators=100, random_state=42)
model_salary.fit(Xs_train, ys_train)

print("Training remote prediction model...")
model_remote = RandomForestClassifier(n_estimators=100, random_state=42)
model_remote.fit(Xr_train, yr_train)

# =========================
# 4️⃣ EVALUATION
# =========================
print("\n--- Evaluation ---")
print(f"Demand MSE: {mean_squared_error(yd_test, model_demand.predict(Xd_test)):.2f}")
print(f"Salary MSE: {mean_squared_error(ys_test, model_salary.predict(Xs_test)):.2f}")
print(f"Remote Accuracy: {accuracy_score(yr_test, model_remote.predict(Xr_test)):.2f}")

# =========================
# 5️⃣ SAVE MODELS
# =========================
# joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(model_demand, "model_demand.pkl")
joblib.dump(model_salary, "model_salary.pkl")
joblib.dump(model_remote, "model_remote.pkl")

print("\n✅ Models trained and saved successfully!")

# =========================
# 6️⃣ PREDICTION FUNCTION
# =========================
def predict_job_outlook(job_title):
    v = joblib.load("vectorizer.pkl")
    m_demand = joblib.load("model_demand.pkl")
    m_salary = joblib.load("model_salary.pkl")
    m_remote = joblib.load("model_remote.pkl")

    X_new = v.transform([job_title])

    demand = m_demand.predict(X_new)[0]
    salary = m_salary.predict(X_new)[0]
    remote = m_remote.predict(X_new)[0]

    print("\nJob Outlook for:", job_title)
    print(f"Estimated Demand (views): {demand:.0f}")
    print(f"Expected Median Salary: ₹{salary:,.0f}")
    print(f"Remote Friendly: {'Yes' if remote == 1 else 'No'}")

# Example usage
predict_job_outlook("Full Stack Developer")


Training job demand model...
Training salary model...
Training remote prediction model...

--- Evaluation ---
Demand MSE: 19856.18
Salary MSE: 3642697594.16
Remote Accuracy: 1.00

✅ Models trained and saved successfully!

Job Outlook for: Full Stack Developer
Estimated Demand (views): 35
Expected Median Salary: ₹17,431
Remote Friendly: Yes
