In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier, XGBRegressor
from sklearn.calibration import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [16]:
conditions = {("A", "B", "C", "D"): "hyperthyroid condition", ("E", "F", "G", "H"): "hypothyroid condition", ("I", "J"): "binding protein", ("K"): "general health", ("L", "M", "N"): "replacement therapy",("O", "P", "Q", "S", "T"): "other",("R"): "discordant results", ("-"): "healthy"}
def create_target(verdict):
    if len(verdict) == 1:
        for key in conditions:
            if verdict in key:
                return conditions[key]
        return "invalid verdict"
    else:
        return "other"
def dataCreator(targetColumun):
    dt = pd.read_csv('proj-data.csv')
    dt = dt.replace('?',np.nan)
    dt = dt[dt['age:'] <= 120]
    dt.loc[(dt['sex:'].isna()) & (dt['pregnant:'] == True), 'sex:'] = 0
    mColumns = ['TSH:', 'T3:', 'TT4:', "T4U:", "FTI:", "TBG:"]
    dt["diagnoses"] = dt["diagnoses"].apply(create_target)
    dt["target"] = dt[targetColumun]
    dt = dt.dropna(subset=['sex:'])
    dt = dt.drop([targetColumun, "referral source:", "[record identification]"], axis=1)
    dt = dt.replace({ "f": 0, "t": 1, "F":0, "M":1})
    for column in mColumns:
        dt[column] = pd.to_numeric(dt[column], errors='coerce')
    numeric_columns = dt.select_dtypes(include=[np.number])
    dt[numeric_columns.columns] = numeric_columns.fillna(-1)
    return dt
def modelCreator(dt, problem_type):
    le = LabelEncoder()
    X, y = dt.drop("target", axis=1), le.fit_transform(dt["target"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
    classification_models = {"Logistic Regression": LogisticRegression(max_iter=10000),"Decision Tree Classifier": DecisionTreeClassifier(),"Random Forest Classifier": RandomForestClassifier(max_depth=15, min_samples_split=2, n_estimators=150, random_state=4),"SVM": SVC(),"KNN Classifier": KNeighborsClassifier(), "MLP": MLPClassifier(max_iter=1000),"NaiveBayes": GaussianNB(),"XGB Classifier": XGBClassifier(eval_metric='mlogloss')}
    regression_models = {"Linear Regression": LinearRegression(),"Decision Tree Regressor": DecisionTreeRegressor(),"Random Forest Regressor": RandomForestRegressor(),"SVR": SVR(),"KNN Regressor": KNeighborsRegressor(),"Ridge Regression": Ridge(),"Lasso Regression": Lasso(),"XGB Regressor": XGBRegressor(eval_metric='mlogloss')}
    models = classification_models if problem_type == 'classification' else regression_models
    scaler = StandardScaler()
    X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)
    results = []
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        predictions = model.predict(X_test_scaled)
        if problem_type == 'classification':
            f1 = f1_score(y_test, predictions, average='macro')
            results.append((name, model, f1))
            print(f"Model {name}: " +f"Accuracy: {format(accuracy_score(y_test, predictions), '.3f')}| " +f"F1 Score avg: {format(f1, '.3f')}| "+f"Precision: {format(precision_score(y_test, predictions, average='macro'), '.3f')}| "+f"Recall: {format(recall_score(y_test, predictions, average='macro'), '.3f')}")
        else:
            mse = mean_squared_error(y_test, predictions)
            results.append((name, model, mse))
            print(f"Model {name}: " +f"MSE: {format(mse, '.3f')}")
    top_3_models = sorted(results, key=lambda x: x[2], reverse=True)[:3] if problem_type == 'classification' else sorted(results, key=lambda x: x[2])[:3]
    return [(name, model) for name, model, _ in top_3_models]

In [17]:
def pearson_spearman(dt):
    dt['target_code'] = dt['target'].astype('category').cat.codes
    corr_matrixSpear = dt.drop('target', axis=1).corr(method='spearman')
    spearman_corr = corr_matrixSpear['target_code'].abs().sort_values(ascending=False)
    spearman_corr = spearman_corr.drop('target_code')
    dt.drop('target_code', axis=1, inplace=True)
    # plt.figure(figsize=(5, 5))
    # spearman_corr.plot(kind='bar')
    # plt.title('Feature Correlation with Target')
    # plt.ylabel('Correlation')
    # plt.show()
    return spearman_corr.index.tolist()
def feature_importance(models, X, y):
    le = LabelEncoder()
    y = le.fit_transform(y)
    feature_names = X.columns
    d = dict()
    for name, model in models:
        print(f"Model: {name}")
        model.fit(X, y)
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            indices = np.argsort(importances)[::-1]
            names = [feature_names[i] for i in indices]
            d.update({name: names})
            plt.figure()
            plt.title(f"Feature Importance for {name}")
            plt.barh(range(X.shape[1]), importances[indices])
            plt.yticks(range(X.shape[1]), names)
            plt.show()
        else:
            print("Model does not support feature importance")
    return d
def sfs_selector(dt, models, X, y, direction):
    le = LabelEncoder()
    y = le.fit_transform(y)
    d = dict()
    print(f"{direction.capitalize()} Selection")
    for name, model in models:
        sfs = SequentialFeatureSelector(model, n_features_to_select=10, direction=direction)
        train, test = train_test_split(dt, test_size=0.2, random_state=4)
        N, M = train.shape
        M = M - 1
        sfs.fit(X, y)
        features = sfs.get_support()
        seleFeatures = np.arange(M)[features]
        seleFeatureNames = train.columns[seleFeatures].append(pd.Index(['target']))
        d.update({name: seleFeatureNames})
    return d

In [18]:
def model_tuning_with_scalers(models, param_grid, problem_type):
    pipelines = {}
    for model_name, (model, X, y) in models.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
        pipelines[f"{model_name} Standard"] = (Pipeline([("Scaler", StandardScaler()), (model_name, model)]), X_train, y_train, X_test, y_test)
        pipelines[f"{model_name} MinMax"] = (Pipeline([("Scaler", MinMaxScaler()), (model_name, model)]), X_train, y_train, X_test, y_test)
    tuned_models = {}
    scoring_type = "f1_macro" if problem_type == 'classification' else "neg_root_mean_squared_error"
    for pipeline_name, (pipeline, X, y, X_test1, y_test1) in pipelines.items():
        search = GridSearchCV(pipeline, param_grid[pipeline_name.split()[0]], n_jobs=-1, scoring=scoring_type)
        search.fit(X, y)
        best_model = search.best_estimator_
        tuned_models[pipeline_name] = {'model': best_model, 'X': X, 'y': y, 'X_test': X_test1, 'y_test': y_test1}
        changed_params = {param: value for param, value in best_model.named_steps[pipeline_name.split()[0]].get_params().items() if value != best_model.named_steps[pipeline_name.split()[0]].__class__().get_params()[param]}
        print(f"Pipeline: {pipeline_name}," f"Changed Params: {changed_params}, "f"Best cv-score: {search.best_score_}")
    return tuned_models

In [19]:
def evaluate_models(tuned_models, model_name, problem_type):
    X_train,y_train = tuned_models[model_name]["X"], tuned_models[model_name]["y"]
    X_test, y_test = tuned_models[model_name]["X_test"],tuned_models[model_name]["y_test"]
    model = tuned_models[model_name]["model"]
    scaler = model.named_steps['Scaler']
    X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.fit_transform(X_test)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    if problem_type == 'classification':
        train_accuracy, test_accuracy = accuracy_score(y_train, model.predict(X_train_scaled)), accuracy_score(y_test, y_pred)
        print(f"Model: {model_name}, Train Accuracy: {train_accuracy:.3f}, Test Accuracy: {test_accuracy:.3f}, F1 macro: {f1_score(y_test, y_pred, average='macro'):.3f}, Recall: {recall_score(y_test, y_pred, average='macro'):.3f}")
    else:
        print(f"Model: {model_name}, MSE: {mean_squared_error(y_test, y_pred):.3f}, R2: {model.score(X_test_scaled, y_test):.3f}")

# O1

In [20]:
dt = dataCreator("diagnoses")
best_models = modelCreator(dt, "classification")
# spear_corr = pearson_spearman(dt)
# most_important_features = feature_importance(best_models, dt.drop("target", axis=1), dt["target"])
sfs_forward = sfs_selector(dt, best_models, dt.drop("target", axis=1), dt["target"], 'forward')
# print(' '.join(name for name, _ in best_models))
print(' '.join(str(sfs_forward[key].tolist()) for key in sfs_forward))
le = LabelEncoder()
dt_tree, dt_forest, dt_xgb = dt.copy(), dt.copy(), dt.copy()
tree_best_features = ['sex:','on thyroxine:','thyroid surgery:','tumor:','TSH:','T3:','TT4:','T4U:','FTI:','TBG:',"target"]
forest_best_features = ['on thyroxine:','pregnant:','thyroid surgery:','TSH:','T3 measured:','T3:','TT4:','T4U:','FTI:','TBG:',"target"]
xgb_best_features = ['age:','sex:','on thyroxine:','query on thyroxine:','on antithyroid medication:','thyroid surgery:','I131 treatment:','tumor:','psych:','TSH:','T3:','TT4:', 'T4U:','FTI:', 'TBG:',"target"]
cols_to_drop_tree, cols_to_drop_forest, cols_to_drop_xgb = dt_tree.columns.difference(tree_best_features), dt_forest.columns.difference(forest_best_features), dt_xgb.columns.difference(xgb_best_features)
dt_tree, dt_forest, dt_xgb = dt_tree.drop(cols_to_drop_tree, axis=1), dt_forest.drop(cols_to_drop_forest, axis=1), dt_xgb.drop(cols_to_drop_xgb, axis=1)
X_tree, X_forest, X_xgb = dt_tree.drop('target', axis=1), dt_forest.drop('target', axis=1), dt_xgb.drop('target', axis=1)
y_tree, y_forest, y_xgb = le.fit_transform(dt_tree['target']), le.fit_transform(dt_forest['target']), le.fit_transform(dt_xgb['target'])
models = {"DecisionTree": (DecisionTreeClassifier(), X_tree, y_tree),"RandomForest": (RandomForestClassifier(n_estimators=150,random_state=4), X_forest, y_forest), "XGB": (XGBClassifier(eval_metric='mlogloss'), X_xgb, y_xgb)}
for model_name, (model, X, y) in models.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    train_preds,test_preds= model.predict(X_train), model.predict(X_test)
    print(f"Score do {model_name}: Train acc: {format(accuracy_score(y_train, train_preds), '.3f')}| Test acc {format(accuracy_score(y_test, test_preds), '.3f')}| F1 macro: {format(f1_score(y_test, test_preds, average='macro'), '.3f')}| Recall: {format(recall_score(y_test, test_preds, average='macro'), '.3f')}")
param_grid = {"XGB": {"XGB__max_depth": [10, 15, 25, 30],"XGB__n_estimators": [200, 300, 400],"XGB__learning_rate": [0.01, 0.1, 0.2],"XGB__subsample": [0.7, 0.8, 0.9],"XGB__colsample_bytree": [0.5, 0.7, 1.0]},"RandomForest": {"RandomForest__max_depth": [15, 20, 25],"RandomForest__n_estimators": [150, 200, 250],"RandomForest__min_samples_split": [2, 5, 10],"RandomForest__min_samples_leaf": [1, 2, 5]},"DecisionTree": {"DecisionTree__max_depth": [15, 20, 25, 30],"DecisionTree__min_samples_split": [2, 5, 10],"DecisionTree__min_samples_leaf": [1, 2, 5]}}
tuned_models = model_tuning_with_scalers(models, param_grid, "classification")
for name in tuned_models:
    evaluate_models(tuned_models, name, "classification")

Model Logistic Regression: Accuracy: 0.845| F1 Score avg: 0.586| Precision: 0.731| Recall: 0.509
Model Decision Tree Classifier: Accuracy: 0.929| F1 Score avg: 0.820| Precision: 0.824| Recall: 0.820
Model Random Forest Classifier: Accuracy: 0.938| F1 Score avg: 0.841| Precision: 0.882| Recall: 0.814
Model SVM: Accuracy: 0.804| F1 Score avg: 0.457| Precision: 0.753| Recall: 0.376
Model KNN Classifier: Accuracy: 0.831| F1 Score avg: 0.588| Precision: 0.757| Recall: 0.506
Model MLP: Accuracy: 0.897| F1 Score avg: 0.757| Precision: 0.779| Recall: 0.743
Model NaiveBayes: Accuracy: 0.075| F1 Score avg: 0.140| Precision: 0.190| Recall: 0.296
Model XGB Classifier: Accuracy: 0.948| F1 Score avg: 0.878| Precision: 0.882| Recall: 0.877
Forward Selection
['age:', 'sex:', 'on thyroxine:', 'thyroid surgery:', 'TSH:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'TBG:', 'target'] ['on thyroxine:', 'thyroid surgery:', 'TSH:', 'T3 measured:', 'T3:', 'TT4:', 'T4U measured:', 'T4U:', 'FTI:', 'TBG:', 'target'] ['age:',

## O2

### Sex

In [100]:
dt = dataCreator("sex:")
le = LabelEncoder()
best_models = modelCreator(dt, "classification")
print(' '.join(name for name, _ in best_models))
# spear_corr = pearson_spearman(dt)
# most_important_features = feature_importance(best_models, dt.drop("target", axis=1), dt["target"])
sfs_forward = sfs_selector(dt, best_models, dt.drop("target", axis=1), dt["target"], 'forward')
print(' '.join(str(sfs_forward[key].tolist()) for key in sfs_forward))
xgb_best_features = ["T4U:","pregnant:","on thyroxine:","T4U measured:","TT4 measured:","on thyroxine:","query hyperthyroid:","tumor:","psych:","TBG measured:", "target"]
tree_best_features = ["on thyroxine:","on antithyroid medication:","query hyperthyroid:","query hypothyroid:","tumor","psych","TSH measured:","T4U measured:","FTI measured:", "target"]
mlp_best_features = ['on thyroxine:','query on thyroxine:','thyroid surgery:','tumor:','hypopituitary:','psych:','TSH measured:','T3 measured:','T3:','T4U:','TBG measured:','TBG:','diagnoses', "target"]
dt_tree, dt_mlp, dt_xgb = dt.copy(), dt.copy(), dt.copy()
cols_to_drop_tree, cols_to_drop_mlp, cols_to_drop_xgb = dt_tree.columns.difference(tree_best_features), dt_mlp.columns.difference(mlp_best_features), dt_xgb.columns.difference(xgb_best_features)
dt_tree, dt_mlp, dt_xgb = dt_tree.drop(cols_to_drop_tree, axis=1), dt_mlp.drop(cols_to_drop_mlp, axis=1), dt_xgb.drop(cols_to_drop_xgb, axis=1)
X_tree, X_mlp, X_xgb = dt_tree.drop('target', axis=1), dt_mlp.drop('target', axis=1), dt_xgb.drop('target', axis=1)
y_tree, y_mlp, y_xgb = le.fit_transform(dt_tree['target']), le.fit_transform(dt_mlp['target']), le.fit_transform(dt_xgb['target'])
models = {"DecisionTree": (DecisionTreeClassifier(), X_tree, y_tree),"MLP": (MLPClassifier(max_iter=1000), X_mlp, y_mlp),"XGB": (XGBClassifier(eval_metric='mlogloss'), X_xgb, y_xgb),}
for model_name, (model, X, y) in models.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    train_preds,test_preds= model.predict(X_train), model.predict(X_test)
    print(f"Score do {model_name}: Train acc: {format(accuracy_score(y_train, train_preds), '.3f')} Test acc {format(accuracy_score(y_test, test_preds), '.3f')} F1 macro: {format(f1_score(y_test, test_preds, average='macro'), '.3f')}Recall: {format(recall_score(y_test, test_preds, average='macro'), '.3f')}")
param_grid = {"DecisionTree": {"DecisionTree__max_depth": [5, 10, 15, 20, 25, 30],"DecisionTree__min_samples_split": [2, 5, 10],"DecisionTree__min_samples_leaf": [1, 2, 4]},"MLP": {"MLP__hidden_layer_sizes": [(50,50,50), (50,100,50), (100,)],"MLP__activation": ['tanh', 'relu'],"MLP__solver": ['sgd', 'adam'],"MLP__alpha": [0.0001, 0.05],"MLP__learning_rate": ['constant','adaptive'],},"XGB": {"XGB__max_depth": [5, 10, 15, 20, 25, 30],"XGB__n_estimators": [100, 150, 200],"XGB__learning_rate": [0.01, 0.1, 0.2, 0.3],"XGB__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],"XGB__colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}}
tuned_models = model_tuning_with_scalers(models, param_grid, "classification")
for name in tuned_models:
    evaluate_models(tuned_models, name, "classification")

Model Logistic Regression: Accuracy: 0.678| F1 Score avg: 0.543| Precision: 0.640| Recall: 0.562
Model Decision Tree Classifier: Accuracy: 0.620| F1 Score avg: 0.566| Precision: 0.569| Recall: 0.565
Model Random Forest Classifier: Accuracy: 0.675| F1 Score avg: 0.540| Precision: 0.630| Recall: 0.560
Model SVM: Accuracy: 0.670| F1 Score avg: 0.468| Precision: 0.663| Recall: 0.528
Model KNN Classifier: Accuracy: 0.646| F1 Score avg: 0.565| Precision: 0.585| Recall: 0.566
Model MLP: Accuracy: 0.682| F1 Score avg: 0.602| Precision: 0.636| Recall: 0.601
Model NaiveBayes: Accuracy: 0.544| F1 Score avg: 0.544| Precision: 0.604| Recall: 0.604
Model XGB Classifier: Accuracy: 0.673| F1 Score avg: 0.596| Precision: 0.623| Recall: 0.595
MLP XGB Classifier Decision Tree Classifier
Forward Selection
['query on thyroxine:', 'on antithyroid medication:', 'pregnant:', 'tumor:', 'psych:', 'T3:', 'T4U measured:', 'T4U:', 'TBG measured:', 'TBG:', 'target'] ['query hyperthyroid:', 'lithium:', 'tumor:', 'hy

### Age

In [62]:
dt = dataCreator("age:")
le = LabelEncoder()
best_models = modelCreator(dt, "regression")
print(' '.join(name for name, _ in best_models))
# spear_corr = pearson_spearman(dt)
# most_important_features = feature_importance(best_models, dt.drop("target", axis=1), dt["target"])
sfs_forward = sfs_selector(dt, best_models, dt.drop("target", axis=1), dt["target"], 'forward')
print(' '.join(str(sfs_forward[key].tolist()) for key in sfs_forward))
svr_best_features = ['query on thyroxine:', 'on antithyroid medication:', 'sick:','pregnant:', 'thyroid surgery:', 'I131 treatment:', 'lithium:','goitre:', 'tumor:', 'hypopituitary:', 'psych:', 'TSH measured:', 'T3:','T4U:', 'TBG measured:', 'target']
linear_best_features = ['on antithyroid medication:', 'sick:', 'pregnant:', 'thyroid surgery:','I131 treatment:', 'lithium:', 'goitre:', 'psych:', 'TSH measured:','T3 measured:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'TBG:', 'target']
ridge_best_features = ['on antithyroid medication:', 'sick:', 'pregnant:', 'thyroid surgery:','I131 treatment:', 'lithium:', 'goitre:', 'psych:', 'TSH measured:','T3 measured:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'TBG:', 'target']
dt_svr, dt_linear, dt_ridge = dt.copy(), dt.copy(), dt.copy()
cols_to_drop_svr, cols_to_drop_linear, cols_to_drop_ridge = dt_svr.columns.difference(svr_best_features), dt_linear.columns.difference(linear_best_features), dt_ridge.columns.difference(ridge_best_features)
dt_svr, dt_linear, dt_ridge = dt_svr.drop(cols_to_drop_svr, axis=1), dt_linear.drop(cols_to_drop_linear, axis=1), dt_ridge.drop(cols_to_drop_ridge, axis=1)
X_svr, X_linear, X_ridge = dt_svr.drop('target', axis=1), dt_linear.drop('target', axis=1), dt_ridge.drop('target', axis=1)
y_svr, y_linear, y_ridge = le.fit_transform(dt_svr['target']), le.fit_transform(dt_linear['target']), le.fit_transform(dt_ridge['target'])
models = {"SVR": (SVR(), X_svr, y_svr),"Linear": (LinearRegression(), X_linear, y_linear),"Ridge": (Ridge(), X_ridge, y_ridge),}
for model_name, (model, X, y) in models.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f'R^2: {model.score(X_test, y_test)} | MSE: {mean_squared_error(y_test, y_pred)}')
param_grid = {"SVR": {'SVR__kernel': ['linear', 'rbf'],'SVR__C': [0.1, 1, 10, 100, 1000],'SVR__gamma': ['scale', 'auto', 0.1, 1, 10]},"Linear": {"Linear__fit_intercept": [True, False],"Linear__copy_X": [True, False],"Linear__n_jobs": [None, 1, 2, 3, 4, 5],"Linear__positive": [True, False],},"Ridge": {'Ridge__alpha': [0.01, 0.1, 1, 10, 100],'Ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],'Ridge__fit_intercept': [True, False],'Ridge__copy_X': [True, False],'Ridge__max_iter': [None, 100, 1000, 5000],'Ridge__positive': [True, False],},}
tuned_models = model_tuning_with_scalers(models, param_grid, "regression")
for name in tuned_models:
    evaluate_models(tuned_models, name, "regression")

Model Linear Regression: MSE: 316.904
Model Decision Tree Regressor: MSE: 643.062
Model Random Forest Regressor: MSE: 343.446
Model SVR: MSE: 315.242
Model KNN Regressor: MSE: 364.113
Model Ridge Regression: MSE: 316.860
Model Lasso Regression: MSE: 331.007
Model XGB Regressor: MSE: 346.126
SVR Ridge Regression Linear Regression
Forward Selection
['on antithyroid medication:', 'sick:', 'pregnant:', 'thyroid surgery:', 'I131 treatment:', 'goitre:', 'psych:', 'TSH measured:', 'T3:', 'T4U:', 'target'] ['sick:', 'pregnant:', 'I131 treatment:', 'psych:', 'TSH measured:', 'T3 measured:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'target'] ['sick:', 'pregnant:', 'I131 treatment:', 'psych:', 'TSH measured:', 'T3 measured:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'target']
R^2: 0.08525374104362526 | MSE: 321.76644057439194
R^2: 0.08587947202577839 | MSE: 321.5463366615165
R^2: 0.08594919872774098 | MSE: 321.5218099553574
Pipeline: SVR Standard, Scaler: StandardScaler, Model: SVR, Changed Params: {'gamma': 1}, Bes

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
conditions = {("A", "B", "C", "D"): "hyperthyroid condition", ("E", "F", "G", "H"): "hypothyroid condition", ("I", "J"): "binding protein", ("K"): "general health", ("L", "M", "N"): "replacement therapy",("O", "P", "Q", "S", "T"): "other",("R"): "discordant results", ("-"): "healthy"}
def create_target(verdict):
    if len(verdict) == 1:
        for key in conditions:
            if verdict in key:
                return conditions[key]
        return "invalid verdict"
    else:
        return "other"
def dataCreator(targetColumun, dt1, dt2):
    dt1 = pd.read_csv(dt1)
    dt2 = pd.read_csv(dt2)
    dt = pd.concat([dt1, dt2], axis=1)
    dt = dt.replace('?',np.nan)
    mColumns = ['TSH:', 'T3:', 'TT4:', "T4U:", "FTI:", "TBG:"]
    dt["diagnoses"] = dt["diagnoses"].apply(create_target)
    le = LabelEncoder()
    if targetColumun != "diagnoses":
        dt['diagnoses'] = le.fit_transform(dt['diagnoses'])
    dt["target"] = dt[targetColumun]
    dt = dt.drop([targetColumun, "referral source:", "[record identification]"], axis=1)
    dt = dt.replace({ "f": 0, "t": 1, "F":0, "M":1})
    for column in mColumns:
        dt[column] = pd.to_numeric(dt[column], errors='coerce')
    numeric_columns = dt.select_dtypes(include=[np.number])
    dt[numeric_columns.columns] = numeric_columns.fillna(-1)
    return dt
def evaluate_models(tuned_models, problem_type):
    for model_name, (model, X, y, scaler) in tuned_models.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
        X_scaled_train = scaler.fit_transform(X_train)
        X_scaled_test = scaler.transform(X_test)
        model.fit(X_scaled_train, y_train)
        y_pred = model.predict(X_scaled_test)
        if problem_type == 'regression':
            print(f"Model: {model_name}, MSE: {mean_squared_error(y_test, y_pred):.3f}, RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}, R2: {model.score(X_scaled_test, y_test):.3f}")
        else:
            print(f"Model: {model_name}, Accuracy: {accuracy_score(y_test, y_pred):.3f}, F1 Score avg: {f1_score(y_test, y_pred, average='macro'):.3f}, Precision: {precision_score(y_test, y_pred, average='macro'):.3f}, Recall: {recall_score(y_test, y_pred, average='macro'):.3f}")
o1 = dataCreator("diagnoses", "proj-test-data.csv", "proj-test-class.csv")
o2 = dataCreator("sex:","proj-test-data.csv", "proj-test-class.csv")
o2age = dataCreator("age:","proj-test-data.csv", "proj-test-class.csv")
def testO1(dt):
    le = LabelEncoder()
    dt_tree, dt_forest = dt.copy(), dt.copy()
    tree_best_features = ['sex:','on thyroxine:','thyroid surgery:','tumor:','TSH:','T3:','TT4:','T4U:','FTI:','TBG:',"target"]
    forest_best_features = ['on thyroxine:','pregnant:','thyroid surgery:','TSH:','T3 measured:','T3:','TT4:','T4U:','FTI:','TBG:',"target"]
    cols_to_drop_tree, cols_to_drop_forest = dt_tree.columns.difference(tree_best_features), dt_forest.columns.difference(forest_best_features)
    dt_tree, dt_forest = dt_tree.drop(cols_to_drop_tree, axis=1), dt_forest.drop(cols_to_drop_forest, axis=1)
    X_tree, X_forest  = dt_tree.drop('target', axis=1), dt_forest.drop('target', axis=1)
    y_tree, y_forest = le.fit_transform(dt_tree['target']), le.fit_transform(dt_forest['target'])
    tuned_models = {"RandomForest": (RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=250, random_state=4), X_forest, y_forest, StandardScaler()), "DecisionTree": (DecisionTreeClassifier(max_depth=30, min_samples_leaf=2, min_samples_split=10), X_tree, y_tree, StandardScaler())}
    evaluate_models(tuned_models, "classification")
def testO2sex(dt):
    le = LabelEncoder()
    tree_best_features = ["on thyroxine:","on antithyroid medication:","query hyperthyroid:","query hypothyroid:","tumor","psych","TSH measured:","T4U measured:","FTI measured:", "target"]
    mlp_best_features = ['on thyroxine:','query on thyroxine:','thyroid surgery:','tumor:','hypopituitary:','psych:','TSH measured:','T3 measured:','T3:','T4U:','TBG measured:','TBG:','diagnoses', "target"]
    dt_tree, dt_mlp = dt.copy(), dt.copy()
    cols_to_drop_tree, cols_to_drop_mlp = dt_tree.columns.difference(tree_best_features), dt_mlp.columns.difference(mlp_best_features)
    dt_tree, dt_mlp = dt_tree.drop(cols_to_drop_tree, axis=1), dt_mlp.drop(cols_to_drop_mlp, axis=1)
    X_tree, X_mlp = dt_tree.drop('target', axis=1), dt_mlp.drop('target', axis=1)
    y_tree, y_mlp = le.fit_transform(dt_tree['target']), le.fit_transform(dt_mlp['target'])
    tuned_models = {"DecisionTree": (DecisionTreeClassifier(max_depth=5, min_samples_split=10), X_tree, y_tree, StandardScaler()),"MLP":(MLPClassifier(alpha=0.05, hidden_layer_sizes=(50, 50, 50), learning_rate='adaptive', max_iter=1000), X_mlp, y_mlp, MinMaxScaler())}
    evaluate_models(tuned_models, "classification")
def testO3age(dt):
    le = LabelEncoder()
    linear_best_features = ['on antithyroid medication:', 'sick:', 'pregnant:', 'thyroid surgery:','I131 treatment:', 'lithium:', 'goitre:', 'psych:', 'TSH measured:','T3 measured:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'TBG:', 'target']
    ridge_best_features = ['on antithyroid medication:', 'sick:', 'pregnant:', 'thyroid surgery:','I131 treatment:', 'lithium:', 'goitre:', 'psych:', 'TSH measured:','T3 measured:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'TBG:', 'target']
    dt_linear, dt_ridge = dt.copy(), dt.copy()
    cols_to_drop_linear, cols_to_drop_ridge = dt_linear.columns.difference(linear_best_features), dt_ridge.columns.difference(ridge_best_features)
    dt_linear, dt_ridge = dt_linear.drop(cols_to_drop_linear, axis=1), dt_ridge.drop(cols_to_drop_ridge, axis=1)
    X_linear, X_ridge = dt_linear.drop('target', axis=1), dt_ridge.drop('target', axis=1)
    y_linear, y_ridge = le.fit_transform(dt_linear['target']), le.fit_transform(dt_ridge['target'])
    tuned_models = {"Ridge" : (Ridge(alpha=10, solver='lsqr'),X_ridge, y_ridge, StandardScaler()), "Linear":(LinearRegression(), X_linear, y_linear, StandardScaler())}
    evaluate_models(tuned_models, 'regression')
testO1(o1)
testO2sex(o2)
testO3age(o2age)

Model: RandomForest, Accuracy: 0.750, F1 Score avg: 0.429, Precision: 0.375, Recall: 0.500
Model: DecisionTree, Accuracy: 0.500, F1 Score avg: 0.222, Precision: 0.222, Recall: 0.222
Model: DecisionTree, Accuracy: 0.500, F1 Score avg: 0.222, Precision: 0.167, Recall: 0.333
Model: MLP, Accuracy: 0.500, F1 Score avg: 0.222, Precision: 0.167, Recall: 0.333
Model: Ridge, MSE: 14.814, RMSE: 3.8488956563790184, R2: -0.185
Model: Linear, MSE: 13.566, RMSE: 3.683245772561509, R2: -0.085
