Laboration: Maskininlärning

In [None]:
import pandas as pd  
import numpy as np   
import matplotlib.pyplot as plt 
import seaborn as sns  

from sklearn.model_selection import train_test_split, GridSearchCV  
from sklearn.preprocessing import StandardScaler, Normalizer 

from sklearn.ensemble import RandomForestClassifier  
from sklearn.svm import LinearSVC  
from sklearn.neighbors import KNeighborsClassifier  

from sklearn.metrics import roc_auc_score  

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the raw data set; the file uses ';' as its field separator.
df = pd.read_csv("cardio_train.csv", delimiter=";")

# Preview the first five rows.
df.head(5)

In [None]:
# Rescale age by 1/365 (presumably days -> years; confirm against the data
# dictionary) and keep two decimals.
# NOTE: this overwrites the column, so re-running the cell rescales it again.
df["age"] = df["age"].div(365).round(2)
df.head()

In [None]:
# Number of rows per target class, pinned to the order [positive (1), negative (0)].
# value_counts() on its own sorts by frequency, so which class ends up first
# would depend on the data; consumers that label the values positionally as
# "Positiv", "Negativ" need a fixed order instead.
cardiovascular = df["cardio"].value_counts().reindex([1, 0], fill_value=0).values
cardiovascular

In [None]:

# Overview of the raw data in a 3x2 grid: class balance, cholesterol,
# age, smoking, weight and height.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(nrows=3, ncols=2, figsize=(16, 16))

# Class balance. The counts are taken in explicit class-code order
# (1 = positive, 0 = negative) so that each bar is guaranteed to carry the
# right label; value_counts() alone orders by frequency, not by class.
cardiovascular_labels = ["Positiv", "Negativ"]
cardio_counts = df["cardio"].value_counts().reindex([1, 0], fill_value=0)
ax1.bar(cardiovascular_labels, cardio_counts.values)
ax1.set_title("Positiv and Negativ Hjärt-Kärlsjukdom")

# Cholesterol levels, ordered by code rather than by frequency.
# Assumes the codes sort as normal < above normal < well above normal
# (presumably 1/2/3) — confirm against the data dictionary.
cholesterol_counts = df["cholesterol"].value_counts().sort_index()
ax2.pie(cholesterol_counts.values, labels=["Normal", "Över Normalt", "Extremt över normalt"], autopct="%1.1f%%")
ax2.set_title("Kolesterolnivåer")

ax3.hist(df["age"], bins=50)
ax3.set_title("Åldersfördelning")

# Smoking status, ordered by code (presumably 0 = non-smoker, 1 = smoker).
smoke_counts = df["smoke"].value_counts().sort_index()
ax4.pie(smoke_counts.values, labels=["icke-rökare", "rökare"], autopct="%1.1f%%")
ax4.set_title("Rökarefördelning")

ax5.hist(df["weight"], bins=50)
ax5.set_title("Viktfördelning")

ax6.hist(df["height"], bins=50)
ax6.set_title("Längdfördelning")

plt.show()

In [None]:
# Number of negative / positive cases, split by gender.
fig, ax7 = plt.subplots(figsize=(8, 4))

# hue_order pins the gender codes (1, 2) to a fixed bar/legend order.
sns.countplot(data=df, x="cardio", hue="gender", hue_order=[1, 2], ax=ax7)

ax7.set_xticks([0, 1])
ax7.set_xticklabels(["Negative", "Positive"])

ax7.set_xlabel("Hjärt-kärlsjukdom")

ax7.set_title("Negativ och Positiv Hjärt- och Kärlsjukdom mellan könen")

# Relabel the legend seaborn already created. Passing only `labels` makes
# matplotlib pair the texts with whatever artists were drawn first, which can
# attach the wrong colour to a label; reuse the existing handles instead.
handles, _ = ax7.get_legend_handles_labels()
ax7.legend(handles=handles, labels=["Kvinnor", "Män"], title="Kön")

plt.show()

In [None]:
# Body-mass index: weight / (height / 100)^2, rounded to one decimal
# (height is presumably in cm and weight in kg — confirm).
df["bmi"] = (df["weight"] / ((df["height"] / 100) ** 2)).round(1)

# Drop implausible BMI values; only 10 <= BMI <= 60 is considered credible.
# .copy() turns the filtered result into an independent frame, so that the
# column assignments in later cells do not trigger SettingWithCopyWarning.
df = df[df["bmi"].between(10, 60)].copy()

df["bmi"].min(), df["bmi"].max()

In [None]:
# BMI categories. pd.cut uses right-closed intervals, so with BMI rounded to
# one decimal an edge of 18.4 means "up to and including 18.4", and so on.
labels = [
    "Undervikt",
    "Normalvikt",
    "Övervikt",
    "Fet Klass 1",
    "Fet Klass 2",
    "Fet Klass 3",
]
bins = [6, 18.4, 24.9, 29.9, 34.9, 39.9, 260]

df["bmi_levels"] = pd.cut(df["bmi"], bins, right=True, labels=labels)

# Rows per category.
df["bmi_levels"].value_counts()

In [None]:
# Remove implausible blood-pressure readings. A row is kept only when
#   ap_hi (systolic)  lies within 70-250,
#   ap_lo (diastolic) lies within 50-150, and
#   the diastolic value is strictly below the systolic value.
# .copy() makes the filtered frame independent, so that columns added in later
# cells do not trigger SettingWithCopyWarning.
df = df[df["ap_hi"].between(70, 250) &
        df["ap_lo"].between(50, 150) &
        (df["ap_lo"] < df["ap_hi"])].copy()

In [None]:
# Blood-pressure categories from systolic (ap_hi) and diastolic (ap_lo) values.
#
# np.select assigns the label of the FIRST condition that matches, so the
# hypertension stages are listed from most to least severe. When the systolic
# and diastolic readings fall into different stages, the more severe stage then
# wins (e.g. 185/85 is a crisis, not stage 1). The low-pressure rule is kept
# first so that it still takes precedence over everything else.
conditions = [
    (df["ap_hi"] < 90) | (df["ap_lo"] < 60),
    (df["ap_hi"] >= 180) | (df["ap_lo"] >= 120),
    ((df["ap_hi"] >= 140) & (df["ap_hi"] < 180)) | ((df["ap_lo"] >= 90) & (df["ap_lo"] < 120)),
    ((df["ap_hi"] >= 130) & (df["ap_hi"] < 140)) | ((df["ap_lo"] >= 80) & (df["ap_lo"] < 90)),
    (df["ap_hi"] >= 120) & (df["ap_hi"] < 130) & (df["ap_lo"] < 80),
    (df["ap_hi"] < 120) & (df["ap_lo"] < 80),
]

# One label per condition, in the same order as `conditions`.
labels_bp = [
    "Under optimal",
    "Hypertoni krisis",
    "Steg 2 hypertoni",
    "Steg 1 hypertoni",
    "Upphöjd",
    "Hälsosam",
]

# Rows matching no condition are labelled "Unknown".
df["blood_pressure_levels"] = np.select(conditions, labels_bp, default="Unknown")

df["blood_pressure_levels"].value_counts()

In [None]:
# Share of positive cases within each BMI level, blood-pressure level and gender.
fig, axs = plt.subplots(1, 3, figsize=(30, 10))

for ax, feature, title in zip(
    axs,
    ["bmi_levels", "blood_pressure_levels", "gender"],
    ["BMI Levels", "Blood Pressure Levels", "Gender (1 = Kvinnor, 2 = Män)"]):

    # cardio is 0/1, so the group mean is the proportion of positive cases.
    # observed=False is spelled out because bmi_levels is categorical and
    # recent pandas versions warn when the default is relied upon; it keeps
    # the current behaviour of listing every category.
    ratio = df.groupby(feature, observed=False)["cardio"].mean()
    sns.barplot(x=ratio.index, y=ratio.values, ax=ax)
    ax.set_title(title)
    ax.set_ylabel("Proportion med hjärt- och kärlsjukdom")
    ax.set_ylim(0, 1)

plt.show()

In [None]:
# Pairwise correlations between all numeric columns.
corr_matrix = df.select_dtypes(include="number").corr()

# Annotated heat map of the correlation matrix.
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(data=corr_matrix, cmap="coolwarm", annot=True, fmt=".2f")
heatmap_ax.set_title("Korrelationer mellan features")
plt.show()

In [None]:
# Data set 1: categorical BMI / blood-pressure levels instead of the raw measurements.
df_model1 = df.copy()

# NOTE(review): the raw CSV presumably carries a row identifier column `id`
# that is never removed elsewhere in the notebook; drop it if present so that
# it is not fed to the models as a feature — confirm against the file.
df_model1 = df_model1.drop(columns=["id"], errors="ignore")

# The raw measurements are represented by the derived level columns here.
df_model1 = df_model1.drop(["ap_hi", "ap_lo", "height", "weight", "bmi"], axis=1)

# One-hot encode the categorical columns. The gender column is called "gender"
# (a non-existent column name raises KeyError). dtype=int yields 0/1 dummies
# without casting the whole frame, which would truncate the fractional age.
df_model1 = pd.get_dummies(
    df_model1,
    columns=["gender", "bmi_levels", "blood_pressure_levels"],
    dtype=int,
)

df_model1 = df_model1.rename(columns={"gender_1":"gender_female","gender_2":"gender_male"})

df_model1.head()

In [None]:
# Target and feature matrix for data set 1.
y = df_model1["cardio"]
X = df_model1.drop(columns="cardio")

# No random_state is passed below, so the splits draw from NumPy's global RNG.
np.random.seed(42)

# Step 1: hold out 15 % of all rows as the test set, stratified on the target.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y
)

# Step 2: the validation set should also be 15 % of the full data, which is
# 0.15 / 0.85 of the rows that remain after the test split.
val_size = 0.15 / 0.85

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=val_size, stratify=y_train_val
)

# Standardization: fitted on the training rows only, then applied to all parts.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Normalization, same fit-on-train / transform-the-rest pattern.
normalize = Normalizer()
X_train_normalize = normalize.fit_transform(X_train)
X_val_normalize = normalize.transform(X_val)
X_test_normalize = normalize.transform(X_test)

print("Träningsdata:", X_train.shape, "| Valideringsdata:", X_val.shape, "| Testdata:", X_test.shape)

In [None]:
# Data set 2: numeric BMI and raw blood-pressure values instead of the level columns.
df_model2 = df.copy()

# NOTE(review): the raw CSV presumably carries a row identifier column `id`
# that is never removed elsewhere in the notebook; drop it if present so that
# it is not fed to the models as a feature — confirm against the file.
df_model2 = df_model2.drop(columns=["id"], errors="ignore")

# Height and weight are represented by bmi; the level columns are not used here.
df_model2 = df_model2.drop(["bmi_levels", "blood_pressure_levels", "height", "weight"], axis=1)

# dtype=int yields 0/1 dummies without casting the whole frame to int, which
# would truncate the fractional parts of age (2 decimals) and bmi (1 decimal).
df_model2 = pd.get_dummies(df_model2, columns=["gender"], dtype=int)

df_model2 = df_model2.rename(columns={"gender_1":"gender_female","gender_2":"gender_male"})

df_model2.head()

In [None]:
# Target and feature matrix for data set 2.
y2 = df_model2["cardio"]
X2 = df_model2.drop(columns="cardio")

# No random_state is passed below, so the splits draw from NumPy's global RNG.
np.random.seed(42)

# Step 1: hold out 15 % of all rows as the test set, stratified on the target.
X_train_val2, X_test2, y_train_val2, y_test2 = train_test_split(
    X2, y2, test_size=0.15, stratify=y2
)

# Step 2: the validation set should also be 15 % of the full data, which is
# 0.15 / 0.85 of the rows that remain after the test split.
val_size2 = 0.15 / 0.85

X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_train_val2, y_train_val2, test_size=val_size2, stratify=y_train_val2
)

# Standardization: fitted on the training rows only, then applied to all parts.
# Note that `scaler` / `normalize` are rebound here to objects fitted on data set 2.
scaler = StandardScaler()
X_train_scaled2 = scaler.fit_transform(X_train2)
X_val_scaled2 = scaler.transform(X_val2)
X_test_scaled2 = scaler.transform(X_test2)

# Normalization, same fit-on-train / transform-the-rest pattern.
normalize = Normalizer()
X_train_normalize2 = normalize.fit_transform(X_train2)
X_val_normalize2 = normalize.transform(X_val2)
X_test_normalize2 = normalize.transform(X_test2)

In [None]:
np.random.seed(42)

# Candidate estimators and their hyperparameter grids.
# The grids are kept small because of the limits of ordinary computers.
#
# The stochastic estimators get an explicit random_state: the grid searches
# run with n_jobs=-1, and the global NumPy seed set above is not shared with
# the worker processes, so without it their results would not be reproducible.
models = {
    "LinearSVC": {
        "model": LinearSVC(max_iter=10000, random_state=42),
        "params": {"C": [0.01, 0.1, 1, 10, 100]}
                 },

    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {"n_neighbors": [3, 5, 7, 9],
                   "p": [1, 2]}
           },

    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {"n_estimators": [50, 100, 200],
                   "max_depth": [None, 5, 10],
                   "min_samples_split": [2, 5, 10]}
                     }
         }

In [18]:
# Grid search on data set 1 with standardized features.

np.random.seed(42)

# Best parameters and validation ROC-AUC per model name.
results = {}

for model_name, mp in models.items():
    print(f"\n--- {model_name} (df_model1) ---")

    # 5-fold cross-validated search over the model's grid, ranked by ROC-AUC.
    grid = GridSearchCV(mp["model"], mp["params"], scoring="roc_auc", cv=5, n_jobs=-1, verbose=2)
    grid.fit(X_train_scaled, y_train)

    # Use the positive-class probability when the model provides one;
    # otherwise fall back to the decision-function score.
    try:
        y_val_pred = grid.predict_proba(X_val_scaled)[:, 1]
    except AttributeError:
        y_val_pred = grid.decision_function(X_val_scaled)

    auc = roc_auc_score(y_val, y_val_pred)
    results[model_name] = dict(best_params=grid.best_params_, validation_roc_auc=auc)

    print("Bästa parametrar:", grid.best_params_)
    print("Validation ROC-AUC:", auc)

# Summary of all models.
print("\nSammanfattning av modellresultat")
for model_name, res in results.items():
    print(f"{model_name}: ROC-AUC = {res['validation_roc_auc']:.3f}, Bästa parametrar = {res['best_params']}")

KeyboardInterrupt: 

In [None]:
# Grid search on data set 1 with normalized features.

np.random.seed(42)

# Best parameters and validation ROC-AUC per model name.
# NOTE: this rebinds `results`, replacing the values from the standardized run.
results = {}

for model_name, mp in models.items():
    print(f"\n--- {model_name} (df_model1) ---")

    # 5-fold cross-validated search over the model's grid, ranked by ROC-AUC.
    grid = GridSearchCV(mp["model"], mp["params"], scoring="roc_auc", cv=5, n_jobs=-1, verbose=2)
    grid.fit(X_train_normalize, y_train)

    # Use the positive-class probability when the model provides one;
    # otherwise fall back to the decision-function score.
    try:
        y_val_pred = grid.predict_proba(X_val_normalize)[:, 1]
    except AttributeError:
        y_val_pred = grid.decision_function(X_val_normalize)

    auc = roc_auc_score(y_val, y_val_pred)
    results[model_name] = dict(best_params=grid.best_params_, validation_roc_auc=auc)

    print("Bästa parametrar:", grid.best_params_)
    print("Validation ROC-AUC:", auc)

# Summary of all models.
print("\nSammanfattning av modellresultat")
for model_name, res in results.items():
    print(f"{model_name}: ROC-AUC = {res['validation_roc_auc']:.3f}, Bästa parametrar = {res['best_params']}")

In [None]:
# Grid search on data set 2 with standardized features.

np.random.seed(42)

# Best parameters and validation ROC-AUC per model name.
results2 = {}

for model_name, mp in models.items():
    print(f"\n--- {model_name} (df_model2) ---")

    # 5-fold cross-validated search over the model's grid, ranked by ROC-AUC.
    grid2 = GridSearchCV(mp["model"], mp["params"], scoring="roc_auc", cv=5, n_jobs=-1, verbose=2)
    grid2.fit(X_train_scaled2, y_train2)

    # Use the positive-class probability when the model provides one;
    # otherwise fall back to the decision-function score.
    try:
        y_val_pred2 = grid2.predict_proba(X_val_scaled2)[:, 1]
    except AttributeError:
        y_val_pred2 = grid2.decision_function(X_val_scaled2)

    auc2 = roc_auc_score(y_val2, y_val_pred2)
    results2[model_name] = dict(best_params=grid2.best_params_, validation_roc_auc=auc2)

    print("Bästa parametrar:", grid2.best_params_)
    print("Validation ROC-AUC:", auc2)

# Summary of all models.
print("\nSammanfattning av modellresultat")
for model_name, res in results2.items():
    print(f"{model_name}: ROC-AUC = {res['validation_roc_auc']:.3f}, Bästa parametrar = {res['best_params']}")

In [None]:
# Grid search on data set 2 with normalized features.

np.random.seed(42)

# Best parameters and validation ROC-AUC per model name.
# NOTE: this rebinds `results2`, replacing the values from the standardized run.
results2 = {}

for model_name, mp in models.items():
    print(f"\n--- {model_name} (df_model2) ---")

    # 5-fold cross-validated search over the model's grid, ranked by ROC-AUC.
    grid2 = GridSearchCV(mp["model"], mp["params"], scoring="roc_auc", cv=5, n_jobs=-1, verbose=2)
    grid2.fit(X_train_normalize2, y_train2)

    # Use the positive-class probability when the model provides one;
    # otherwise fall back to the decision-function score.
    try:
        y_val_pred2 = grid2.predict_proba(X_val_normalize2)[:, 1]
    except AttributeError:
        y_val_pred2 = grid2.decision_function(X_val_normalize2)

    auc2 = roc_auc_score(y_val2, y_val_pred2)
    results2[model_name] = dict(best_params=grid2.best_params_, validation_roc_auc=auc2)

    print("Bästa parametrar:", grid2.best_params_)
    print("Validation ROC-AUC:", auc2)

# Summary of all models.
print("\nSammanfattning av modellresultat")
for model_name, res in results2.items():
    print(f"{model_name}: ROC-AUC = {res['validation_roc_auc']:.3f}, Bästa parametrar = {res['best_params']}")

In [None]:
# Final model. The standardized version of data set 2 is used because it gave
# the better ROC-AUC.

np.random.seed(42)

# Train on training + validation rows; the test rows stay untouched until evaluation.
X_train_fult = np.concatenate((X_train_scaled2, X_val_scaled2), axis=0)
y_train_fult = np.concatenate((y_train2, y_val2), axis=0)

# Hyperparameters are hard-coded (presumably the best ones reported by the
# grid search on data set 2 — confirm against that cell's output).
linear_svc_best = LinearSVC(max_iter=10000, C=100)
knn_best = KNeighborsClassifier(n_neighbors=9, p=1)
rf_best = RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=200)

# Majority vote over the three classifiers' predicted labels.
ensemble_model = VotingClassifier(
    estimators=[
        ("LinearSVC", linear_svc_best),
        ("KNN", knn_best),
        ("RandomForest", rf_best)],
    voting="hard")

ensemble_model.fit(X_train_fult, y_train_fult)

y_test_pred = ensemble_model.predict(X_test_scaled2)

cm = confusion_matrix(y_test2, y_test_pred)

# scikit-learn lays the matrix out as [[tn, fp], [fn, tp]] with rows = true
# class and columns = predicted class. Put the positive class first while
# keeping that orientation, so the matrix agrees with the axis labels below
# (y = truth, x = prediction): row "Positiv" holds tp and fn, row "Negativ"
# holds fp and tn.
tn, fp, fn, tp = cm.ravel()
ny_cm = np.array([[tp, fn],
                  [fp, tn]])

fig, ax = plt.subplots(figsize=(8, 6))

sns.heatmap(
    ny_cm,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=["Positiv", "Negativ"],
    yticklabels=["Positiv", "Negativ"],
    ax=ax)

plt.title("Confusion Matrix")
plt.xlabel("Förutspått tabell")
plt.ylabel("Sanningstabell")
plt.show()

# Per-class precision / recall / F1; target_names follow the sorted labels (0, 1).
report = classification_report(y_test2, y_test_pred, target_names=["Negativ", "Positiv"])
print("Klassifikations Rapport:\n", report)