In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("stackoverflow_full.csv")
df["Country_raw"] = df["Country"]   # endast för stratifiering

counts = df["Country_raw"].value_counts()

rare = counts[counts < 2].index
df["Country_grouped"] = df["Country_raw"].replace(rare, "Other")

valid = counts[counts >= 2].index
df = df[df["Country_raw"].isin(valid)]

df.head()

Unnamed: 0.1,Unnamed: 0,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills,Employed,Country_raw,Country_grouped
0,0,<35,No,Master,1,Man,No,Dev,7,4,Sweden,51552.0,C++;Python;Git;PostgreSQL,4,0,Sweden,Sweden
1,1,<35,No,Undergraduate,1,Man,No,Dev,12,5,Spain,46482.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,12,1,Spain,Spain
2,2,<35,No,Master,1,Man,No,Dev,15,6,Germany,77290.0,C;C++;Java;Perl;Ruby;Git;Ruby on Rails,7,0,Germany,Germany
3,3,<35,No,Undergraduate,1,Man,No,Dev,9,6,Canada,46135.0,Bash/Shell;HTML/CSS;JavaScript;PHP;Ruby;SQL;Gi...,13,0,Canada,Canada
4,4,>35,No,PhD,0,Man,No,NotDev,40,30,Singapore,160932.0,C++;Python,2,0,Singapore,Singapore


In [3]:
cols_drop = [
    "Gender",
    "MentalHealth",
    "Accessibility",
    "Unnamed: 0",
    "HaveWorkedWith",   # tar bort råkolumnen
]
df = df.drop(columns=[c for c in cols_drop if c in df.columns])



In [4]:
df = df[df["YearsCodePro"] <= df["YearsCode"]]
df = df.drop(df[(df["Age"]=="<35") & (df["YearsCode"]>35)].index)

In [5]:
# PreviousSalary_norm är en standardiserad, relativ lön:

# 1.0 ≈ kandidatens lön = landets medianlön.

# >1.0 ≈ tjänar mer än medianen (t.ex. 2.4 = 240 % av median).

# <1.0 ≈ tjänar under medianen.

df['Country_original'] = df['Country'] 

median_salary_by_country = df.groupby("Country")["PreviousSalary"].median()
df["PreviousSalary_norm"] = df.apply(
    lambda r: r["PreviousSalary"] / median_salary_by_country[r["Country"]],
    axis=1
)

df = df.drop(columns="PreviousSalary")

In [6]:
df = pd.get_dummies(
    df,
    columns=["Age", "EdLevel", "MainBranch", "Country", "Employment"],
    drop_first=True
)

In [7]:
df.head()

Unnamed: 0,YearsCode,YearsCodePro,ComputerSkills,Employed,Country_raw,Country_grouped,Country_original,PreviousSalary_norm,Age_>35,EdLevel_NoHigherEd,...,Country_United Republic of Tanzania,Country_United States of America,Country_Uruguay,Country_Uzbekistan,"Country_Venezuela, Bolivarian Republic of...",Country_Viet Nam,Country_Yemen,Country_Zambia,Country_Zimbabwe,Employment_1
0,7,4,4,0,Sweden,Sweden,Sweden,0.88105,False,False,...,False,False,False,False,False,False,False,False,False,True
1,12,5,12,1,Spain,Spain,Spain,1.089669,False,False,...,False,False,False,False,False,False,False,False,False,True
2,15,6,7,0,Germany,Germany,Germany,1.191662,False,False,...,False,False,False,False,False,False,False,False,False,True
3,9,6,13,0,Canada,Canada,Canada,0.590838,False,False,...,False,False,False,False,False,False,False,False,False,True
4,40,30,2,0,Singapore,Singapore,Singapore,2.458366,True,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
print("Antal rader efter rensning:", len(df))
print("Antal kolumner efter encoding:", len(df.columns))
print("Första kolumnerna:", df.columns[:20].tolist())
print(df.head())

Antal rader efter rensning: 72859
Antal kolumner efter encoding: 175
Första kolumnerna: ['YearsCode', 'YearsCodePro', 'ComputerSkills', 'Employed', 'Country_raw', 'Country_grouped', 'Country_original', 'PreviousSalary_norm', 'Age_>35', 'EdLevel_NoHigherEd', 'EdLevel_Other', 'EdLevel_PhD', 'EdLevel_Undergraduate', 'MainBranch_NotDev', 'Country_Albania', 'Country_Algeria', 'Country_Andorra', 'Country_Angola', 'Country_Argentina', 'Country_Armenia']
   YearsCode  YearsCodePro  ComputerSkills  Employed Country_raw  \
0          7             4               4         0      Sweden   
1         12             5              12         1       Spain   
2         15             6               7         0     Germany   
3          9             6              13         0      Canada   
4         40            30               2         0   Singapore   

  Country_grouped Country_original  PreviousSalary_norm  Age_>35  \
0          Sweden           Sweden             0.881050    False   
1   

In [9]:
print(df.columns.tolist())

['YearsCode', 'YearsCodePro', 'ComputerSkills', 'Employed', 'Country_raw', 'Country_grouped', 'Country_original', 'PreviousSalary_norm', 'Age_>35', 'EdLevel_NoHigherEd', 'EdLevel_Other', 'EdLevel_PhD', 'EdLevel_Undergraduate', 'MainBranch_NotDev', 'Country_Albania', 'Country_Algeria', 'Country_Andorra', 'Country_Angola', 'Country_Argentina', 'Country_Armenia', 'Country_Australia', 'Country_Austria', 'Country_Azerbaijan', 'Country_Bahrain', 'Country_Bangladesh', 'Country_Barbados', 'Country_Belarus', 'Country_Belgium', 'Country_Belize', 'Country_Benin', 'Country_Bhutan', 'Country_Bolivia', 'Country_Bosnia and Herzegovina', 'Country_Botswana', 'Country_Brazil', 'Country_Bulgaria', 'Country_Cambodia', 'Country_Cameroon', 'Country_Canada', 'Country_Cape Verde', 'Country_Chile', 'Country_China', 'Country_Colombia', 'Country_Congo, Republic of the...', 'Country_Costa Rica', 'Country_Croatia', 'Country_Cuba', 'Country_Cyprus', 'Country_Czech Republic', "Country_Côte d'Ivoire", 'Country_Democr

In [10]:
# ===================== 1. Importer =====================
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# ===================== 2. Features & target =====================
# Ta bort target och icke-numeriska landkolumner
X = df.drop(columns=["Employed", "Country_raw", "Country_grouped", "Country_original"], errors="ignore")
y = df["Employed"]

# ===================== 3. Train/Val/Test-split =====================
# Först dela av ett 20 % orört test-set, stratifierat på land
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=df["Country_grouped"]
)

# ===================== 4. Modeller att jämföra =====================
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, solver="lbfgs"),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

# ===================== 5. 5-fold Cross-Validation på träningsdatan =====================
print("=== 5-fold cross-validation (på 80 % träningsdata) ===")
for name, model in models.items():
    scores = cross_val_score(model, X_train_full, y_train_full,
                             cv=5, scoring="accuracy", n_jobs=-1)
    print(f"{name}: mean={scores.mean():.3f}, std={scores.std():.3f}")

# ===================== 6. Slutlig träning på hela träningsdatan =====================
print("\n=== Slutlig utvärdering på orört test-set ===")
for name, model in models.items():
    model.fit(X_train_full, y_train_full)
    preds = model.predict(X_test)
    print(f"\n{name}")
    print("Accuracy:", round(accuracy_score(y_test, preds), 3))
    print(classification_report(y_test, preds))


=== 5-fold cross-validation (på 80 % träningsdata) ===
LogisticRegression: mean=0.783, std=0.003
RandomForest: mean=0.776, std=0.003
GradientBoosting: mean=0.786, std=0.003

=== Slutlig utvärdering på orört test-set ===

LogisticRegression
Accuracy: 0.784
              precision    recall  f1-score   support

           0       0.76      0.77      0.76      6658
           1       0.80      0.79      0.80      7914

    accuracy                           0.78     14572
   macro avg       0.78      0.78      0.78     14572
weighted avg       0.78      0.78      0.78     14572


RandomForest
Accuracy: 0.775
              precision    recall  f1-score   support

           0       0.76      0.73      0.75      6658
           1       0.78      0.81      0.80      7914

    accuracy                           0.77     14572
   macro avg       0.77      0.77      0.77     14572
weighted avg       0.77      0.77      0.77     14572


GradientBoosting
Accuracy: 0.791
              precision   

In [11]:
# Logistic Regression exempel

from sklearn.linear_model import LogisticRegression
import pandas as pd

# Skapa och träna en logistisk regressionsmodell
lr = LogisticRegression(max_iter=1000, solver="lbfgs")
lr.fit(X_train, y_train)

# Visa de 20 features med störst absoluta koefficienter
coefs = pd.Series(lr.coef_[0], index=X_train.columns).sort_values(key=abs, ascending=False)
print(coefs.head(40))


lr.fit(X_train, y_train)
coefs = pd.Series(lr.coef_[0], index=X_train.columns).sort_values(key=abs, ascending=False)
print(coefs.head(40))


NameError: name 'X_train' is not defined

In [None]:
import pandas as pd

# Gruppindelning av antal färdigheter för överskådlighet
bins = [0, 5, 10, 15, 20, 30, 50, df["ComputerSkills"].max()]
labels = ["0–5", "6–10", "11–15", "16–20", "21–30", "31–50", "51+"]

skill_groups = pd.cut(df["ComputerSkills"], bins=bins, labels=labels, right=True)

# Beräkna anställningsgrad per grupp
hire_rate_by_skills = df.groupby(skill_groups)["Employed"].mean().round(3)

print(hire_rate_by_skills)

ComputerSkills
0–5      0.017
6–10     0.239
11–15    0.597
16–20    0.834
21–30    0.960
31–50    0.999
51+      1.000
Name: Employed, dtype: float64


  hire_rate_by_skills = df.groupby(skill_groups)["Employed"].mean().round(3)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, solver="lbfgs"),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\n=== {name} ===")
    print("Accuracy:", round(accuracy_score(y_test, preds), 3))
    print(classification_report(y_test, preds))




=== LogisticRegression ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6658
           1       1.00      1.00      1.00      7914

    accuracy                           1.00     14572
   macro avg       1.00      1.00      1.00     14572
weighted avg       1.00      1.00      1.00     14572


=== RandomForest ===
Accuracy: 0.966
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      6658
           1       0.96      0.98      0.97      7914

    accuracy                           0.97     14572
   macro avg       0.97      0.96      0.97     14572
weighted avg       0.97      0.97      0.97     14572


=== GradientBoosting ===
Accuracy: 0.993
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6658
           1       1.00      0.99      0.99      7914

    accuracy                           0.99     14572
   mac

In [None]:
import pandas as pd
import numpy as np

# Läs in datan
df = pd.read_csv("stackoverflow_full.csv")

# Skapa en kopia av den råa landskolumnen för senare användning
df["Country_raw"] = df["Country"]

# Ta bort länder med för få svar (mindre än 2)
counts = df["Country_raw"].value_counts()
valid_countries = counts[counts >= 2].index
df = df[df["Country_raw"].isin(valid_countries)]

# Släpp icke-relevanta kolumner
cols_to_drop = [
    "Gender",
    "MentalHealth",
    "Accessibility",
    "Unnamed: 0",
    "HaveWorkedWith",
]
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

# Rensa data som verkar orimlig
# Se till att 'YearsCodePro' inte är större än 'YearsCode'
df = df[df["YearsCodePro"] <= df["YearsCode"]]

# Ta bort rader där 'Age' är '<35' men 'YearsCode' är >35
df = df.drop(df[(df["Age"] == "<35") & (df["YearsCode"] > 35)].index)

# Skapa en normaliserad lönkolumn
df["Country_original"] = df["Country"]
median_salary_by_country = df.groupby("Country")["PreviousSalary"].median()
df["PreviousSalary_norm"] = df.apply(
    lambda r: r["PreviousSalary"] / median_salary_by_country[r["Country"]],
    axis=1
)
# Släpp den ursprungliga lönkolumnen
df = df.drop(columns="PreviousSalary")

# Utför One-Hot Encoding på kategoriska variabler
categorical_cols = ["Age", "EdLevel", "MainBranch", "Country", "Employment"]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Skapa en grupperad landskolumn för stratifiering, baserat på den rensade listan av länder
counts_grouped = df["Country_raw"].value_counts()
rare_grouped = counts_grouped[counts_grouped < 2].index
df["Country_grouped"] = df["Country_raw"].replace(rare_grouped, "Other")

# Visa de första raderna av den färdiga datan
print("Antal rader efter rensning:", len(df))
print("Antal kolumner efter encoding:", len(df.columns))
print("Första kolumnerna:", df.columns[:20].tolist())
print(df.head())