# Slask

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC                 # <--- om du vill ha SVM
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    root_mean_squared_error,
    mean_squared_error
)

# Läs in data
try:
    data = pd.read_csv("stackoverflow_full.csv")
except FileNotFoundError:
    print("Fel: Filen 'stackoverflow_full.csv' kunde inte hittas. Kontrollera sökvägen.")
    exit()

# --- Steg 1: Hantera 'HaveWorkedWith' med MultiLabelBinarizer ---
print("Förbehandlar kolumnen 'HaveWorkedWith'...")
data['HaveWorkedWith'] = data['HaveWorkedWith'].fillna('').str.split(';')
data['HaveWorkedWith'] = data['HaveWorkedWith'].apply(lambda x: [s.strip() for s in x])

mlb = MultiLabelBinarizer()
worked_with_encoded = mlb.fit_transform(data['HaveWorkedWith'])
worked_with_df = pd.DataFrame(
    worked_with_encoded, columns=mlb.classes_, index=data.index
)

data = pd.concat([data.drop('HaveWorkedWith', axis=1), worked_with_df], axis=1)

# --- Steg 2: Features & target ---
y = data["Employed"]
cols_to_drop = ["Employed", "Employment", "MentalHealth", "Accessibility"]
X = data.drop(columns=cols_to_drop, errors='ignore')

# --- Steg 3: Delning av datasetets variabler --
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

print(f"\nNumeriska kolumner: {numeric_cols}")
print(f"Kategoriska kolumner: {categorical_cols}")

# --- Steg 4: Preprocess‑pipeline ---
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_cols),
                  ("cat", categorical_transformer, categorical_cols)],
    remainder='passthrough'
)

# Dela upp i train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Steg 5: Modeller ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # "SVM (RBF, probability=True)": SVC(kernel='rbf', probability=True, random_state=42)  # kan aktiveras
}

print("\n--- Modeller och utvärdering ---")
for name, model in models.items():
    print(f"\nModell: {name}")

    pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                               ("classifier", model)])

    pipeline.fit(X_train, y_train)

    # ----- K‑fold RMSE -----
    cv_mse = cross_val_score(pipeline, X_train, y_train,
                             cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_mse)
    print(f"Genomsnittlig RMSE vid 5‑fold CV: {np.mean(cv_rmse):.4f}")

    # ----- ROC‑AUC -----
    cv_auc = cross_val_score(pipeline, X_train, y_train,
                             cv=5, scoring='roc_auc')
    print(f"Genomsnittlig ROC‑AUC vid 5‑fold: {np.mean(cv_auc):.4f}")

    # ----- Testdata -----
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    print(f"ROC‑AUC på testdata: {roc_auc_score(y_test, y_proba):.4f}")

    # ----- RMSE på klass‑prediktioner -----
    rmse_pred = root_mean_squared_error(y_test, y_pred)
    print(f"RMSE på klass‑prediktioner: {rmse_pred:.4f}")

    # ----- RMSE på sannolikheter -----
    rmse_proba = root_mean_squared_error(y_test, y_proba)
    print(f"RMSE på sannolikheter: {rmse_proba:.4f}")

    print("Classification Report på testdata:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("-" * 40)

# ----- Exempel: Sannolikheter med Random Forest -----
final_rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])
final_rf_pipeline.fit(X_train, y_train)
y_proba_rf = final_rf_pipeline.predict_proba(X_test)[:, 1]

proba_df = pd.DataFrame({
    "Candidate_ID": X_test.index,
    "Hire_Probability_RF": y_proba_rf
}).sort_values("Hire_Probability_RF", ascending=False)

print("\nHögsta sannolikheterna för anställning (Random Forest):")
print(proba_df.head(10).to_string(index=False))


Förbehandlar kolumnen 'HaveWorkedWith'...

Numeriska kolumner: ['Unnamed: 0', 'YearsCode', 'YearsCodePro', 'PreviousSalary', 'ComputerSkills', '', 'APL', 'ASP.NET', 'ASP.NET Core', 'AWS', 'Angular', 'Angular.js', 'Ansible', 'Assembly', 'Bash/Shell', 'Blazor', 'C', 'C#', 'C++', 'COBOL', 'Cassandra', 'Chef', 'Clojure', 'Cloud Firestore', 'Colocation', 'CouchDB', 'Couchbase', 'Crystal', 'Dart', 'Delphi', 'Deno', 'DigitalOcean', 'Django', 'Docker', 'Drupal', 'DynamoDB', 'Elasticsearch', 'Elixir', 'Erlang', 'Express', 'F#', 'FastAPI', 'Fastify', 'Firebase', 'Firebase Realtime Database', 'Flask', 'Flow', 'Fortran', 'Gatsby', 'Git', 'Go', 'Google Cloud', 'Google Cloud Platform', 'Groovy', 'HTML/CSS', 'Haskell', 'Heroku', 'Homebrew', 'IBM Cloud or Watson', 'IBM DB2', 'Java', 'JavaScript', 'Julia', 'Kotlin', 'Kubernetes', 'LISP', 'Laravel', 'Linode', 'Lua', 'MATLAB', 'Managed Hosting', 'MariaDB', 'Matlab', 'Microsoft Azure', 'Microsoft SQL Server', 'MongoDB', 'MySQL', 'Neo4j', 'Next.js', 'Node.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import root_mean_squared_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC                    # <--- Ny import
from sklearn.metrics import classification_report, roc_auc_score

# Läs in data
try:
    data = pd.read_csv("stackoverflow_full.csv")
except FileNotFoundError:
    print("Fel: Filen 'stackoverflow_full.csv' kunde inte hittas. Kontrollera sökvägen.")
    exit()

# --- Steg 1: Hantera 'HaveWorkedWith'
print("Förbehandlar kolumnen 'HaveWorkedWith'...")
data['HaveWorkedWith'] = data['HaveWorkedWith'].fillna('').str.split(';')
data['HaveWorkedWith'] = data['HaveWorkedWith'].apply(lambda x: [s.strip() for s in x])

mlb = MultiLabelBinarizer()
worked_with_encoded = mlb.fit_transform(data['HaveWorkedWith'])
worked_with_df = pd.DataFrame(worked_with_encoded,
                              columns=mlb.classes_,
                              index=data.index)

data = pd.concat([data.drop('HaveWorkedWith', axis=1), worked_with_df], axis=1)

# --- Steg 2: Features & label
y = data["Employed"]
cols_to_drop = ["Employed", "Employment", "MentalHealth", "Accessibility"]
X = data.drop(columns=cols_to_drop, errors='ignore')

# --- Steg 3: Sortera kolumner
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

print(f"\nNumeriska kolumner: {numeric_cols}")
print(f"Kategoriska kolumner: {categorical_cols}")

# --- Steg 4: Preprocessing‑pipeline
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Dela upp i train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Steg 5: Lägg till SVM i modellordboken
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100,
                                           random_state=42),
    "SVM (RBF, probability=True)": SVC(kernel="rbf",
                                         probability=True,
                                         random_state=42)  # <--- SVM
}

print("\n--- Modeller och utvärdering ---")
for name, model in models.items():
    print(f"\nModell: {name}")

    # Skapa en komplett pipeline (preprocess + klassificerare)
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])

    # Träna på hela träningsuppsättningen
    pipeline.fit(X_train, y_train)

    # 5‑fold k‑val med ROC‑AUC
    cv_scores = cross_val_score(pipeline, X_train, y_train,
                                cv=5, scoring='roc_auc')
    print(f"Genomsnittlig ROC‑AUC vid 5‑fold: {np.mean(cv_scores):.4f}")

    # Testförutsägelser
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    print(f"ROC‑AUC på testdata: {roc_auc_score(y_test, y_proba):.4f}")
    print("Classification Report på testdata:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("-" * 40)

# ---------- Valfri: Högsta sannolikheter med Random Forest ----------
final_rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100,
                                          random_state=42))
])
final_rf_pipeline.fit(X_train, y_train)
y_proba_rf = final_rf_pipeline.predict_proba(X_test)[:, 1]

proba_df = pd.DataFrame({
    "Candidate_ID": X_test.index,
    "Hire_Probability_RF": y_proba_rf
}).sort_values("Hire_Probability_RF", ascending=False)

print("\nHögsta sannolikheterna för anställning (Random Forest):")
print(proba_df.head(10).to_string(index=False))


Förbehandlar kolumnen 'HaveWorkedWith'...

Numeriska kolumner: ['Unnamed: 0', 'YearsCode', 'YearsCodePro', 'PreviousSalary', 'ComputerSkills', '', 'APL', 'ASP.NET', 'ASP.NET Core', 'AWS', 'Angular', 'Angular.js', 'Ansible', 'Assembly', 'Bash/Shell', 'Blazor', 'C', 'C#', 'C++', 'COBOL', 'Cassandra', 'Chef', 'Clojure', 'Cloud Firestore', 'Colocation', 'CouchDB', 'Couchbase', 'Crystal', 'Dart', 'Delphi', 'Deno', 'DigitalOcean', 'Django', 'Docker', 'Drupal', 'DynamoDB', 'Elasticsearch', 'Elixir', 'Erlang', 'Express', 'F#', 'FastAPI', 'Fastify', 'Firebase', 'Firebase Realtime Database', 'Flask', 'Flow', 'Fortran', 'Gatsby', 'Git', 'Go', 'Google Cloud', 'Google Cloud Platform', 'Groovy', 'HTML/CSS', 'Haskell', 'Heroku', 'Homebrew', 'IBM Cloud or Watson', 'IBM DB2', 'Java', 'JavaScript', 'Julia', 'Kotlin', 'Kubernetes', 'LISP', 'Laravel', 'Linode', 'Lua', 'MATLAB', 'Managed Hosting', 'MariaDB', 'Matlab', 'Microsoft Azure', 'Microsoft SQL Server', 'MongoDB', 'MySQL', 'Neo4j', 'Next.js', 'Node.