# Projekt


# Bearbeta datan

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the survey export; the path is relative to the notebook's working directory.
data = pd.read_csv("stackoverflow_full.csv")

# Separate predictors from the target. X keeps every non-target column;
# the ColumnTransformer below selects the ones that are actually modelled.
target_col = "Employed"
X = data.drop(columns=[target_col])
y = data[target_col]

# --- Step 1: declare which columns are scaled and which are one-hot encoded ---
numeric_cols = ["YearsCode", "YearsCodePro", "PreviousSalary", "ComputerSkills"]

categorical_cols = [
    "Age",
    "Accessibility",
    "EdLevel",
    "Employment",
    "Gender",
    "MentalHealth",
    "MainBranch",
    "Country",
]

# --- Step 2: one transformer per column group ---
# Columns of X that appear in neither list are not passed on to the model
# (ColumnTransformer's default is remainder="drop").
numeric_step = ("num", StandardScaler(), numeric_cols)
# handle_unknown="ignore": categories first seen at predict time do not raise.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Hold out 30% of the rows for testing. The split is stratified on y and
# seeded so that it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


-------------

# ML-modellering


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# --- Pipeline: preprocessing followed by the classifier ---
# Wrapping both steps in one Pipeline means the preprocessor is fitted
# together with the model on whatever data is passed to fit().
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# Train on the training split.
clf.fit(X_train, y_train)

# Predicted labels, plus the probability taken from column 1 of predict_proba
# (the second entry of clf.classes_ -- presumably label 1; verify if the
# target encoding changes).
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Evaluation on the held-out split.
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

# Example: test-set rows ranked by predicted probability, highest first.
# "Candidate" holds the row label taken from X_test's index.
proba_df = pd.DataFrame(
    {"Candidate": X_test.index, "Hire_Probability": y_proba}
).sort_values(by="Hire_Probability", ascending=False)

print(proba_df.head(10))


ROC-AUC: 0.8740636966201457
              precision    recall  f1-score   support

           0       0.77      0.77      0.77     10221
           1       0.80      0.79      0.80     11818

    accuracy                           0.79     22039
   macro avg       0.78      0.78      0.78     22039
weighted avg       0.79      0.79      0.79     22039

       Candidate  Hire_Probability
7017       31750          1.000000
8975       36178          1.000000
17309      52546          1.000000
6865       48585          1.000000
19365      20446          0.999999
16577      46007          0.999999
19082      14776          0.999999
5434       40398          0.999998
9162       15470          0.999997
2409       42889          0.999997
