# Projekt


# Bearbeta datan

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Läs in data
data = pd.read_csv("stackoverflow_full.csv")

# Målvariabel
y = data["Employed"]

# Features vi använder
X = data.drop(columns=["Employed"])
X = data.drop(columns=["MentalHealth"])  # Ta bort MentalHealth vi inte vet något om den
X = data.drop(columns=["Accessibility"])  # Ta bort accessibility då den har för många unika värden

# --- Steg 1: Hantera kategoriska och numeriska variabler ---
categorical_cols = ["Age", "EdLevel", "Employment", 
                    "Gender", "MainBranch", "HaveWorkedWith", "Country"]

numeric_cols = ["YearsCode", "YearsCodePro", "PreviousSalary", "ComputerSkills"]

# --- Steg 2: Transformer ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)

# Dela upp i train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


-------------

# ML Modellering


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# --- Pipeline: Preprocessing + Modell ---
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Träna modellen
clf.fit(X_train, y_train)

# Prediktioner
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]

# Random Forest som jämförelse
rf_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

print("\nRandom Forest Modell:")
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
y_proba_rf = rf_clf.predict_proba(X_test)[:,1]

# Utvärdering
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

# Exempel: sannolikheter för testkandidater
proba_df = pd.DataFrame({
    "Candidate": X_test.index,
    "Hire_Probability": y_proba
}).sort_values("Hire_Probability", ascending=False)

print(proba_df.head(10))



Random Forest Modell:
ROC-AUC: 0.8766101651285496
              precision    recall  f1-score   support

           0       0.77      0.78      0.77      6814
           1       0.81      0.80      0.80      7879

    accuracy                           0.79     14693
   macro avg       0.79      0.79      0.79     14693
weighted avg       0.79      0.79      0.79     14693

      Candidate  Hire_Probability
4101      36178          1.000000
9180      52546          1.000000
4524      20446          1.000000
929       14776          0.999999
2886      40398          0.999999
9262      42889          0.999998
3147      15470          0.999997
6487       3834          0.999997
1740      41643          0.999997
8925      17512          0.999996
