# Imports

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, SequentialFeatureSelector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, matthews_corrcoef, mean_squared_error, mean_absolute_error, precision_score, r2_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

## All unnecessary prints have been commented out instead of removed, as the teacher may want to see them.

# Pre-processing

In [17]:
## Data loading and transformation of ? and diagnoses classes

def _dense_one_hot_encoder():
    """Return a OneHotEncoder that yields dense arrays on old and new scikit-learn."""
    try:
        # Newer scikit-learn releases spell the option `sparse_output`.
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False)


def preprocess_data(*args):
    """Load the thyroid data set and convert it into an all-numeric DataFrame.

    Call forms:
        preprocess_data(data_csv)
            Returns the processed DataFrame (it still contains 'diagnoses').
        preprocess_data(data_csv, extra_csv)
            The columns of ``extra_csv`` are appended side by side to the data.
            Returns ``(features, target)`` where ``target`` is the column named
            after the first column of ``extra_csv`` (trailing ':' stripped) and
            ``features`` is everything else.

    Steps: map diagnosis letters to the 8 target classes, turn '?' into NaN,
    drop rows with impossible ages, impute and binarise 'sex' and the t/f
    columns, one-hot encode 'referral source', ordinal-encode the numeric
    columns and finally fill the remaining NaN with 999.

    Raises:
        ValueError: if called with anything other than one or two arguments.
    """
    if len(args) == 1:
        data = pd.read_csv(args[0])
        additional_column = None
    elif len(args) == 2:
        data = pd.read_csv(args[0])
        additional_data = pd.read_csv(args[1])
        additional_column = additional_data.columns[0].rstrip(':')
        data = pd.concat([data, additional_data], axis=1)
    else:
        raise ValueError("The function accepts either one or two arguments.")

    # Diagnosis letter -> desired class. Anything not listed becomes 'other class'.
    class_mapping = {
        'A': 'hyperthyroid conditions',
        'B': 'hyperthyroid conditions',
        'C': 'hyperthyroid conditions',
        'D': 'hyperthyroid conditions',
        'E': 'hypothyroid conditions',
        'F': 'hypothyroid conditions',
        'G': 'hypothyroid conditions',
        'H': 'hypothyroid conditions',
        'I': 'binding protein',
        'J': 'binding protein',
        'K': 'general health',
        'L': 'replacement therapy',
        'M': 'replacement therapy',
        'N': 'replacement therapy',
        'R': 'discordant results',
        '-': 'healthy class',
    }
    data['diagnoses'] = data['diagnoses'].apply(
        lambda label: class_mapping.get(label, 'other class')
    )
    #print("Unique values in 'diagnoses' column:", data['diagnoses'].unique())

    # '?' marks a missing value in the raw files; make it a real NaN.
    df = data.replace('?', np.nan)
    #print(df.head())

    df.columns = df.columns.str.rstrip(':')

    # Remove impossible ages (< 0 or > 130) with a single mask. Dropping in two
    # passes with a reset_index in between would apply stale row labels on the
    # second pass and delete the wrong rows. Rows with a missing age are kept.
    #print(df['age'].nlargest(10)); print(df['age'].nsmallest(10))
    impossible_age = (df['age'] > 130) | (df['age'] < 0)
    df = df.loc[~impossible_age].reset_index(drop=True)
    #print(df['age'].nlargest(10)); print(df['age'].nsmallest(10))

    # Missing 'sex' values take the most frequent value of the column.
    #print("Number of missing values in 'sex:' column:", df['sex'].isnull().sum())
    imputer = SimpleImputer(strategy='most_frequent')
    df['sex'] = imputer.fit_transform(df[['sex']]).ravel()
    #print("Number of missing :", df['sex'].isnull().sum())

    # The record id only identifies a row; it is not used for analysis.
    df = df.drop(columns=['[record identification]'])

    num_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']

    true_false_columns = ['on thyroxine', 'query on thyroxine', 'on antithyroid medication',
                          'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid',
                          'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
                          'TSH measured', 'T3 measured', 'TT4 measured', 'T4U measured', 'FTI measured',
                          'TBG measured']

    # Impute each t/f column with its most frequent value, then encode 't' as 1
    # and everything else as 0.
    for column in true_false_columns:
        df[column] = imputer.fit_transform(df[[column]]).ravel()
        df[column] = (df[column] == 't').astype(int)

    # 'F' -> 1, anything else -> 0.
    df['sex'] = (df['sex'] == 'F').astype(int)

    # One-hot encode the referral source into 'referral source_<value>' columns.
    referral_encoder = _dense_one_hot_encoder()
    referral_encoded = referral_encoder.fit_transform(df[['referral source']])
    referral_encoded_df = pd.DataFrame(
        referral_encoded,
        columns=referral_encoder.get_feature_names_out(['referral source']),
        index=df.index,
    )
    #print(referral_encoder.categories_)
    df = pd.concat([df.drop(columns=['referral source']), referral_encoded_df], axis=1)

    # NOTE(review): OrdinalEncoder replaces every distinct value by an integer
    # code, so these columns hold codes rather than the raw measurements.
    df[num_columns] = OrdinalEncoder().fit_transform(df[num_columns])

    # As said and shown in the report, several fill values were tested; there was
    # not much difference, but 999 was slightly better.
    df = df.fillna(999)
    #print(df.isnull().sum())

    if additional_column is not None:
        return df.drop(columns=[additional_column]), df[additional_column]
    return df

# Preprocess the project data set once; the experiment cells below work from this `df`.
df = preprocess_data('proj-data.csv')



# O1

## DecisionTreeClassifier

In [18]:
def dtcO1(df_data, df_class=None):
    """Evaluate a tuned decision tree on the diagnosis classes and print the metrics.

    Args:
        df_data: feature DataFrame. When ``df_class`` is None it must also contain
            the 'diagnoses' column, which is then used as the target.
        df_class: optional target Series; when given, ``df_data`` is used as the
            feature matrix unchanged.

    The function keeps 30 features chosen by forward sequential selection, puts
    20% of the rows aside, and runs a shuffled 5-fold cross validation on the
    remaining 80%. Inside every fold a grid search picks the tree
    hyper-parameters. Only the pooled cross-validation predictions are scored;
    the 20% hold-out part is never evaluated. Nothing is returned.
    """
    if df_class is None:
        features = df_data.drop('diagnoses', axis=1)
        target = df_data['diagnoses']
    else:
        features = df_data
        target = df_class

    # Forward sequential feature selection driven by a decision tree.
    sfs = SequentialFeatureSelector(DecisionTreeClassifier(), n_features_to_select=30, direction='forward')
    sfs.fit(features, target)
    selected_features_train = sfs.transform(features)

    # Only the training part of the split is used below.
    X_train, _, y_train, _ = train_test_split(
        selected_features_train, target, test_size=0.2, random_state=42
    )

    # Hyperparameter tuning with GridSearchCV
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2'],
        'random_state': [42]
    }
    clf_grid = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)

    # N-fold cross validation: every training row is used for validation exactly
    # once, so the metrics below cover the whole training part.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_truth = []
    fold_preds = []
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # GridSearchCV refits the best parameter set on the full fold by
        # default, so its predict() already uses the tuned tree.
        clf_grid.fit(X_train_fold, y_train_fold)
        fold_preds.append(clf_grid.predict(X_val_fold))
        fold_truth.append(np.asarray(y_val_fold))

    TRUTH_nfold = np.concatenate(fold_truth)
    PREDS_nfold = np.concatenate(fold_preds)

    # Evaluation metrics
    precision = precision_score(TRUTH_nfold, PREDS_nfold, average='weighted')
    recall = recall_score(TRUTH_nfold, PREDS_nfold, average='weighted')
    f1 = f1_score(TRUTH_nfold, PREDS_nfold, average='weighted')
    mcc = matthews_corrcoef(TRUTH_nfold, PREDS_nfold)
    balanced_accuracy = balanced_accuracy_score(TRUTH_nfold, PREDS_nfold)

    # Print evaluation metrics
    print("N-Fold Cross Validation Results:")
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Matthews Correlation Coefficient:", mcc)
    print("Balanced Accuracy:", balanced_accuracy)

    # Macro F1 weighs every class equally regardless of its support.
    f1_macro = f1_score(TRUTH_nfold, PREDS_nfold, average='macro')
    print("Macro F1 Score:", f1_macro)

    # Per-class report over the pooled cross-validation predictions.
    print("Classification Report for N-Fold Cross Validation:")
    print(classification_report(TRUTH_nfold, PREDS_nfold,zero_division=1))
    
# Run the O1 decision-tree experiment. dtcO1 only prints its metrics and has no
# return statement, so df_copy is None after this call.
df_copy = dtcO1(df)

N-Fold Cross Validation Results:
Precision: 0.8882428496198685
Recall: 0.8921267893660532
F1 Score: 0.8896319157706672
Matthews Correlation Coefficient: 0.7535025292895594
Balanced Accuracy: 0.7273518318466736
Macro F1 Score: 0.7387937020573921
Classification Report for N-Fold Cross Validation:
                         precision    recall  f1-score   support

        binding protein       0.63      0.51      0.57       230
     discordant results       0.66      0.69      0.68       135
         general health       0.81      0.87      0.84       274
          healthy class       0.94      0.96      0.95      4324
hyperthyroid conditions       0.72      0.69      0.71       111
 hypothyroid conditions       0.78      0.77      0.78       384
            other class       0.61      0.50      0.55       189
    replacement therapy       0.88      0.82      0.85       221

               accuracy                           0.89      5868
              macro avg       0.75      0.73      0.

## Naive Bayes

In [19]:
# O1 with Naive Bayes: predict the diagnosis class from all other columns.
# Work on a copy of the preprocessed data.
df_copy = df.copy()

# Features are every column except the target. X is not referenced again in this
# cell; the model is trained on X_df below.
X = df_copy.drop(columns=['diagnoses'])
y = df_copy['diagnoses']


# Same feature matrix, passed through get_dummies and then given back the
# original column names.
# NOTE(review): the rename only works if get_dummies leaves the number of
# columns unchanged (i.e. there are no object/categorical columns) - confirm.
X_df0 = df_copy.drop(columns=['diagnoses'])
X_df= pd.get_dummies(X_df0, drop_first=True)
X_df.columns=X_df0.columns

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Categorical Naive Bayes with a small smoothing value
mdl=CategoricalNB(alpha=0.001)

mdl.fit(X_train,y_train);


# Predict on test data
preds = mdl.predict(X_test)

def printClassResults(truth, preds):
    """Print accuracy, weighted precision/recall/F1, MCC and macro F1 for the predictions."""
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, average='weighted'))  # weighted by class support
    print("The Recall is: %7.4f" % recall_score(truth, preds, average='weighted'))  # weighted by class support
    print("The F1 score is: %7.4f" % f1_score(truth, preds, average='weighted'))  # weighted by class support
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print("The Macro F1 Score is:", f1_score(truth, preds, average='macro'))



# Print the classification results
printClassResults(y_test, preds)

# Print the classification report
print(classification_report(y_test, preds))

The Accuracy is:  0.8208
The Precision is:  0.8300
The Recall is:  0.8208
The F1 score is:  0.8226
The Matthews correlation coefficient is:  0.5792
The Macro F1 Score is: 0.5607644608422453
                         precision    recall  f1-score   support

        binding protein       0.60      0.46      0.52        54
     discordant results       0.50      0.25      0.33        28
         general health       0.61      0.53      0.57        72
          healthy class       0.91      0.91      0.91      1102
hyperthyroid conditions       0.67      0.45      0.54        31
 hypothyroid conditions       0.72      0.83      0.77        92
            other class       0.18      0.38      0.25        34
    replacement therapy       0.65      0.56      0.60        55

               accuracy                           0.82      1468
              macro avg       0.60      0.55      0.56      1468
           weighted avg       0.83      0.82      0.82      1468



## Logistic Regression

In [20]:
# O1 with logistic regression: predict the diagnosis class.
# Create a copy
df_copy = df.copy()

# Columns with numeric data types. This list is taken BEFORE 'diagnoses' is
# label-encoded below, and later cells reuse `numeric_columns`.
# NOTE(review): whether 'diagnoses' is part of this list depends on its dtype in
# `df` at the moment this cell runs - confirm it is still non-numeric here.
numeric_columns = df_copy.select_dtypes(include=np.number).columns.tolist()

# Columns with non-numeric data types (only used by the commented-out prints)
non_numeric_columns = df_copy.select_dtypes(exclude=np.number).columns.tolist()

# Print the list of numeric and non-numeric columns
#print("Numeric Columns:")
#print(numeric_columns)
#print("\nNon-Numeric Columns:")
#print(non_numeric_columns)

# Encode the 'diagnoses' column as integer class ids
label_encoder = LabelEncoder()
df_copy['diagnoses'] = label_encoder.fit_transform(df_copy['diagnoses'])

# Define your features (X) and target variable (y)
X = df_copy[numeric_columns]  # Features: the columns collected in numeric_columns above
y = df_copy['diagnoses']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training set only
scaler = StandardScaler()
scaler.fit(X_train)  # Fit scaler on training set
X_train = scaler.transform(X_train)  # Apply scaler on training set
X_test = scaler.transform(X_test)  # Apply scaler on test set

# Initialize logistic regression model with increased iterations and class weights
mdl = LogisticRegression(random_state=0, max_iter=1000, class_weight='balanced').fit(X_train, y_train)

# Only the first entry of intercept_ is printed
print("The bias is: ",  mdl.intercept_[0])
#print("The other parameters are: ")
#for i, beta in enumerate(mdl.coef_[0]):
#    print("\t B%02d -> %9.3f" % (i+1, beta))

# (coefficient, feature position) pairs of the first coefficient row, sorted by
# absolute value in descending order; only the commented-out print uses them
coefs = [(beta, i) for i, beta in enumerate(mdl.coef_[0])]
coefss = sorted(coefs, key=lambda row: np.abs(row[0]))
coefss.reverse()
#for beta, i in coefss[:5]:
#    print("\t B%02d -> %9.3f" % (i+1, beta))

def printClassResults(truth, preds):
    """Print accuracy, weighted precision/recall/F1, MCC and macro F1 for the predictions."""
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, average='weighted', zero_division=0))
    print("The Recall is: %7.4f" % recall_score(truth, preds, average='weighted', zero_division=0))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, average='weighted', zero_division=0))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print("The Macro F1 Score is: %7.4f" % f1_score(truth, preds, average='macro', zero_division=0))

# Predict on test data
preds = mdl.predict(X_test)

printClassResults(y_test, preds)

# Per-class report; the classes appear as the integer ids from label_encoder
print(classification_report(y_test, preds))

The bias is:  0.5785134159001989
The Accuracy is:  0.2732
The Precision is:  0.7523
The Recall is:  0.2732
The F1 score is:  0.2672
The Matthews correlation coefficient is:  0.2529
The Macro F1 Score is:  0.2826
              precision    recall  f1-score   support

           0       0.29      0.57      0.39        54
           1       0.03      0.29      0.06        28
           2       0.21      0.74      0.33        72
           3       0.93      0.14      0.25      1102
           4       0.16      0.87      0.26        31
           5       0.26      0.65      0.38        92
           6       0.11      0.32      0.17        34
           7       0.28      0.96      0.43        55

    accuracy                           0.27      1468
   macro avg       0.28      0.57      0.28      1468
weighted avg       0.75      0.27      0.27      1468



# O2 - SEX

## Decision Tree 

In [21]:
def sexO2(df_data, df_class=None):
    """Evaluate a tuned decision tree that predicts 'sex' and print the metrics.

    Args:
        df_data: feature DataFrame containing a 'diagnoses' column. When
            ``df_class`` is None it must also contain 'sex', which is then used
            as the target.
        df_class: optional target Series; when given, all columns of ``df_data``
            are used as features.

    'diagnoses' is label-encoded on a private copy, so the caller's DataFrame is
    left untouched. 20% of the rows are put aside and a shuffled 5-fold cross
    validation is run on the remaining 80%, with a grid search inside every
    fold. Only the pooled cross-validation predictions are scored; the hold-out
    part is never evaluated. Nothing is returned.
    """
    # assign() returns a new frame, so the caller's data is not overwritten.
    encoded = df_data.assign(
        diagnoses=LabelEncoder().fit_transform(df_data['diagnoses'])
    )

    if df_class is None:
        X = encoded.drop(columns=['sex'])
        y = encoded['sex']
    else:
        X = encoded
        y = df_class

    # Only the training part of the split is used below.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Hyperparameter tuning with GridSearchCV
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2'],
        'random_state': [42]
    }
    clf_grid = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)

    # N-Fold Cross Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    all_PREDS = []
    all_TRUTH = []
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # GridSearchCV refits the best parameter set on the full fold by
        # default, so its predict() already uses the tuned tree.
        clf_grid.fit(X_train_fold, y_train_fold)
        all_PREDS.append(clf_grid.predict(X_val_fold))
        all_TRUTH.append(y_val_fold)

    # Pool the per-fold predictions and true labels
    PREDS_nfold = np.concatenate(all_PREDS)
    TRUTH_nfold = np.concatenate(all_TRUTH)

    # Evaluation metrics
    precision = precision_score(TRUTH_nfold, PREDS_nfold, average='weighted')
    recall = recall_score(TRUTH_nfold, PREDS_nfold, average='weighted')
    f1 = f1_score(TRUTH_nfold, PREDS_nfold, average='weighted')
    mcc = matthews_corrcoef(TRUTH_nfold, PREDS_nfold)
    balanced_accuracy = balanced_accuracy_score(TRUTH_nfold, PREDS_nfold)

    # Print evaluation metrics
    print("N-Fold Cross Validation Results:")
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Matthews Correlation Coefficient:", mcc)
    print("Balanced Accuracy:", balanced_accuracy)

    # Calculate and print macro F1 score
    f1_macro = f1_score(TRUTH_nfold, PREDS_nfold, average='macro')
    print("Macro F1 Score:", f1_macro)

    # Per-class report over the pooled cross-validation predictions.
    print("Classification Report for N-Fold Cross Validation:")
    print(classification_report(TRUTH_nfold, PREDS_nfold, zero_division=1))

# Run the O2 (sex) decision-tree experiment. sexO2 only prints its metrics and
# has no return statement, so df_O2SEX is None after this call.
df_O2SEX = sexO2(df)

N-Fold Cross Validation Results:
Precision: 0.6516835434366557
Recall: 0.6872869802317655
F1 Score: 0.6454349036248204
Matthews Correlation Coefficient: 0.17125061486140866
Balanced Accuracy: 0.5642392010332995
Macro F1 Score: 0.5571487048782046
Classification Report for N-Fold Cross Validation:
              precision    recall  f1-score   support

           0       0.51      0.23      0.32      1855
           1       0.72      0.90      0.80      4013

    accuracy                           0.69      5868
   macro avg       0.61      0.56      0.56      5868
weighted avg       0.65      0.69      0.65      5868



## Naive Bayes

In [22]:
# O2 (sex) with Naive Bayes: predict 'sex' from all other columns.
df_copySex = df.copy()

# Encode the 'diagnoses' column as integer class ids so it can be used as a feature
label_encoder = LabelEncoder()
df_copySex['diagnoses'] = label_encoder.fit_transform(df_copySex['diagnoses'])

# X is not referenced again in this cell; the model is trained on X_df below
X = df_copySex.drop(columns=['sex'])
y = df_copySex['sex']

# Same feature matrix, passed through get_dummies and then given back the
# original column names.
# NOTE(review): the rename only works if get_dummies leaves the number of
# columns unchanged - confirm.
X_df0 = df_copySex.drop(columns=['sex'])
X_df= pd.get_dummies(X_df0, drop_first=True)
X_df.columns=X_df0.columns


# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Categorical Naive Bayes with a small smoothing value
mdl=CategoricalNB(alpha=0.001)

mdl.fit(X_train,y_train);

# Predict on test data
preds = mdl.predict(X_test)

def printClassResults(truth, preds):
    """Print accuracy, weighted precision/recall/F1, MCC and macro F1 for the predictions."""
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, average='weighted'))
    print("The Recall is: %7.4f" % recall_score(truth, preds, average='weighted'))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, average='weighted'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print("The Macro F1 Score is:", f1_score(truth, preds, average='macro'))


# Print the classification results
printClassResults(y_test, preds)

# Print the classification report
print(classification_report(y_test, preds))

The Accuracy is:  0.6655
The Precision is:  0.6986
The Recall is:  0.6655
The F1 score is:  0.6779
The Matthews correlation coefficient is:  0.2270
The Macro F1 Score is: 0.6083318888248295
              precision    recall  f1-score   support

           0       0.40      0.53      0.46       393
           1       0.81      0.72      0.76      1075

    accuracy                           0.67      1468
   macro avg       0.61      0.62      0.61      1468
weighted avg       0.70      0.67      0.68      1468



## Logistic Regression

In [23]:
# O2 (sex) with logistic regression: predict 'sex'.
df_copySex = df.copy()

# Encode the 'diagnoses' column as integer class ids
label_encoder = LabelEncoder()
df_copySex['diagnoses'] = label_encoder.fit_transform(df_copySex['diagnoses'])

# Define your features (X) and target variable (y).
# `numeric_columns` is not defined in this cell; it comes from an earlier cell
# of the notebook, so that cell has to be run first.
X = df_copySex[numeric_columns].drop('sex', axis=1)  # Features excluding the target column
y = df_copySex['sex']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training set only
scaler = StandardScaler()
scaler.fit(X_train)  # Fit scaler on training set
X_train = scaler.transform(X_train)  # Apply scaler on training set
X_test = scaler.transform(X_test)  # Apply scaler on test set

# Initialize logistic regression model with increased iterations and class weights
mdl = LogisticRegression(random_state=0, max_iter=1000, class_weight='balanced').fit(X_train, y_train)

print("The bias is: ",  mdl.intercept_[0])
# The header below is still printed although the loop that listed the
# parameters is commented out.
print("The other parameters are: ")
#for i, beta in enumerate(mdl.coef_[0]):
    #print("\t B%02d -> %9.3f" % (i+1, beta))

# (coefficient, feature position) pairs sorted by absolute value in descending
# order; only the commented-out print uses them
coefs = [(beta, i) for i, beta in enumerate(mdl.coef_[0])]
coefss = sorted(coefs, key=lambda row: np.abs(row[0]))
coefss.reverse()
#for beta, i in coefss[:5]:
    #print("\t B%02d -> %9.3f" % (i+1, beta))


# Predict on test data
preds = mdl.predict(X_test)

def printClassResults(truth, preds):
    """Print accuracy, weighted precision/recall/F1, MCC and macro F1 for the predictions."""
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, average='weighted'))
    print("The Recall is: %7.4f" % recall_score(truth, preds, average='weighted'))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, average='weighted'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print("The Macro F1 Score is:", f1_score(truth, preds, average='macro'))


# Print the classification results
printClassResults(y_test, preds)

# Print the classification report
print(classification_report(y_test, preds))

The bias is:  0.20796506873354198
The other parameters are: 
The Accuracy is:  0.6233
The Precision is:  0.7034
The Recall is:  0.6233
The F1 score is:  0.6443
The Matthews correlation coefficient is:  0.2254
The Macro F1 Score is: 0.5902328715609991
              precision    recall  f1-score   support

           0       0.38      0.63      0.47       393
           1       0.82      0.62      0.71      1075

    accuracy                           0.62      1468
   macro avg       0.60      0.63      0.59      1468
weighted avg       0.70      0.62      0.64      1468



# O2 - AGE

## Decision Tree Regressor

In [24]:
def ageO2(df_data, df_class=None):
    """Evaluate a tuned decision-tree regressor that predicts 'age' and print the metrics.

    Args:
        df_data: feature DataFrame containing a 'diagnoses' column. When
            ``df_class`` is None it must also contain 'age', which is then used
            as the target.
        df_class: optional target Series; when given, all columns of ``df_data``
            are used as candidate features.

    'diagnoses' is label-encoded on a private copy, so the caller's DataFrame is
    left untouched. 30 features are kept by forward sequential selection, then a
    shuffled 5-fold cross validation is run with a 3-fold grid search inside
    every fold. The MSE, MAE and R-squared values are averaged over the folds
    and printed. Nothing is returned.
    """
    # assign() returns a new frame, so the caller's data is not overwritten.
    encoded = df_data.assign(
        diagnoses=LabelEncoder().fit_transform(df_data['diagnoses'])
    )

    if df_class is None:
        X = encoded.drop(columns=['age'])
        y = encoded['age']
    else:
        X = encoded
        y = df_class

    # Initialize the Decision Tree Regressor
    dt_regressor = DecisionTreeRegressor(random_state=42)

    # Forward sequential feature selection, then keep only the chosen columns
    sfs = SequentialFeatureSelector(dt_regressor, n_features_to_select=30, direction='forward')
    sfs.fit(X, y)
    X_selected = X.iloc[:, sfs.get_support(indices=True)]

    # Define hyperparameters grid for grid search
    param_grid = {
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    grid_search = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')

    # Perform N-Fold Cross Validation with grid search
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    mse_scores = []
    mae_scores = []
    r2_scores = []

    for train_index, test_index in kf.split(X_selected):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # GridSearchCV refits the best parameter set (with the regressor's
        # random_state=42) on the full fold by default, so predict() already
        # uses the tuned regressor.
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        y_pred = grid_search.predict(X_test)

        mse_scores.append(mean_squared_error(y_test, y_pred))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    # Calculate average scores
    avg_mse = sum(mse_scores) / len(mse_scores)
    avg_mae = sum(mae_scores) / len(mae_scores)
    avg_r2 = sum(r2_scores) / len(r2_scores)

    # Print average evaluation metrics. The hyper-parameters shown are the ones
    # selected in the LAST fold only.
    print("Best Hyperparameters:", best_params)
    print("Average Evaluation Metrics across Folds:")
    print("Average Mean Squared Error:", avg_mse)
    print("Average Mean Absolute Error:", avg_mae)
    print("Average R-squared Score:", avg_r2)
    
# Run the O2 (age) regression experiment. ageO2 only prints its metrics and has
# no return statement, so df_O2AGE is None after this call.
df_O2AGE = ageO2(df)

Best Hyperparameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Average Evaluation Metrics across Folds:
Average Mean Squared Error: 312.65632449027817
Average Mean Absolute Error: 14.633386196185375
Average R-squared Score: 0.12002259639729909


## Naive Bayes

In [25]:
# O2 (age) with Naive Bayes.
# NOTE(review): CategoricalNB is a classifier, so every distinct value of 'age'
# is treated as its own class here and the scores below are classification
# metrics, not regression errors.
df_copyAge = df.copy()

# Encode the 'diagnoses' column as integer class ids so it can be used as a feature
label_encoder = LabelEncoder()
df_copyAge['diagnoses'] = label_encoder.fit_transform(df_copyAge['diagnoses'])

# X is not referenced again in this cell; the model is trained on X_df below
X = df_copyAge.drop(columns=['age'])

y = df_copyAge['age']

# Same feature matrix, passed through get_dummies and then given back the
# original column names.
# NOTE(review): the rename only works if get_dummies leaves the number of
# columns unchanged - confirm.
X_df0 = df_copyAge.drop(columns=['age'])
X_df= pd.get_dummies(X_df0, drop_first=True)
X_df.columns=X_df0.columns

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Categorical Naive Bayes with a small smoothing value
mdl=CategoricalNB(alpha=0.001)

mdl.fit(X_train,y_train);

# Predict on test data
preds = mdl.predict(X_test)

# Define a function to print classification results
def printClassResults(truth, preds):
    """Print accuracy, weighted precision/recall/F1, MCC and macro F1 for the predictions."""
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, average='weighted', zero_division=0))
    print("The Recall is: %7.4f" % recall_score(truth, preds, average='weighted', zero_division=0))
    print("The F1 Score is: %7.4f" % f1_score(truth, preds, average='weighted', zero_division=0))
    print("The Matthews Correlation Coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print("The Macro F1 Score is: %7.4f" % f1_score(truth, preds, average='macro', zero_division=0))


# Print the classification results
printClassResults(y_test, preds)

# Print the classification report
#print(classification_report(y_test, preds))

The Accuracy is:  0.0218
The Precision is:  0.0243
The Recall is:  0.0218
The F1 Score is:  0.0225
The Matthews Correlation Coefficient is:  0.0067
The Macro F1 Score is:  0.0146


## Logistic Regression

In [26]:
# O2 (age) with logistic regression.
# NOTE(review): LogisticRegression is a classifier, so every distinct value of
# 'age' is treated as its own class here and the scores below are
# classification metrics, not regression errors.
df_copyAge = df.copy()

# Encode the 'diagnoses' column as integer class ids
label_encoder = LabelEncoder()
df_copyAge['diagnoses'] = label_encoder.fit_transform(df_copyAge['diagnoses'])

# Define your features (X) and target variable (y).
# `numeric_columns` is not defined in this cell; it comes from an earlier cell
# of the notebook, so that cell has to be run first.
X = df_copyAge[numeric_columns].drop('age', axis=1)  # Features excluding the target column
y = df_copyAge['age']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training set only
scaler = StandardScaler()
scaler.fit(X_train)  # Fit scaler on training set
X_train = scaler.transform(X_train)  # Apply scaler on training set
X_test = scaler.transform(X_test)  # Apply scaler on test set

# Initialize logistic regression model with increased iterations and class weights
mdl = LogisticRegression(random_state=0, max_iter=1000, class_weight='balanced').fit(X_train, y_train)

print("The bias is: ",  mdl.intercept_[0])
# The header below is still printed although the loop that listed the
# parameters is commented out.
print("The other parameters are: ")
#for i, beta in enumerate(mdl.coef_[0]):
    #print("\t B%02d -> %9.3f" % (i+1, beta))

# (coefficient, feature position) pairs sorted by absolute value in descending
# order; only the commented-out print uses them
coefs = [(beta, i) for i, beta in enumerate(mdl.coef_[0])]
coefss = sorted(coefs, key=lambda row: np.abs(row[0]))
coefss.reverse()
#for beta, i in coefss[:5]:
    #print("\t B%02d -> %9.3f" % (i+1, beta))


# Predict on test data
preds = mdl.predict(X_test)

# Define a function to print classification results
def printClassResults(truth, preds):
    """Print accuracy, weighted precision/recall/F1, MCC and macro F1 for the predictions."""
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, average='weighted', zero_division=0))
    print("The Recall is: %7.4f" % recall_score(truth, preds, average='weighted', zero_division=0))
    print("The F1 Score is: %7.4f" % f1_score(truth, preds, average='weighted', zero_division=0))
    print("The Matthews Correlation Coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print("The Macro F1 Score is: %7.4f" % f1_score(truth, preds, average='macro', zero_division=0))


# Print the classification results
printClassResults(y_test, preds)

The bias is:  -0.6768203866153133
The other parameters are: 
The Accuracy is:  0.0089
The Precision is:  0.0140
The Recall is:  0.0089
The F1 Score is:  0.0073
The Matthews Correlation Coefficient is:  0.0046
The Macro F1 Score is:  0.0083


# O3

In [27]:
# ---------------------------------- O3: feature analysis for the diagnoses ----------------------------------
# Analysis only: find the 10 columns a decision tree picks first for predicting the diagnosis.
df_copy = df.copy()

# Forward sequential feature selection wrapped around a decision-tree classifier
dtc = DecisionTreeClassifier()
sfs = SequentialFeatureSelector(
    dtc,
    n_features_to_select=10,
    direction='forward',
)
sfs.fit(df_copy.drop(columns=['diagnoses']), df_copy['diagnoses'])

# Positions of the kept features, then their column names
selected_features_indices = sfs.get_support(indices=True)
selected_columns = df_copy.columns.drop('diagnoses')[selected_features_indices]

# Show which columns were selected
print("Selected Columns:", selected_columns.tolist())

# Feature matrix reduced to the selected columns, plus the matching target
selected_features_train = sfs.transform(df_copy.drop(columns=['diagnoses']))
target = df_copy['diagnoses']

Selected Columns: ['on antithyroid medication', 'pregnant', 'thyroid surgery', 'hypopituitary', 'TSH measured', 'TSH', 'TT4 measured', 'TBG measured', 'TBG', 'referral source_WEST']


In [28]:
# ---------------------------------- O3: feature analysis for sex ----------------------------------
df_copySex = df.copy()

# 'diagnoses' is a candidate feature here, so turn it into integer class ids
label_encoder = LabelEncoder()
df_copySex['diagnoses'] = label_encoder.fit_transform(df_copySex['diagnoses'])

# Forward sequential feature selection wrapped around a decision-tree classifier
dtc = DecisionTreeClassifier()
sfs = SequentialFeatureSelector(
    dtc,
    n_features_to_select=10,
    direction='forward',
)
sfs.fit(df_copySex.drop(columns=['sex']), df_copySex['sex'])

# Positions of the kept features, then their column names
selected_features_indices = sfs.get_support(indices=True)
selected_columns = df_copySex.columns.drop('sex')[selected_features_indices]

# Show which columns were selected
print("Selected Columns:", selected_columns.tolist())

Selected Columns: ['query on thyroxine', 'pregnant', 'thyroid surgery', 'lithium', 'goitre', 'hypopituitary', 'TT4 measured', 'diagnoses', 'referral source_STMW', 'referral source_SVHC']


In [29]:
# ---------------------------------- O3: feature analysis for age ----------------------------------

# Work on a copy of the preprocessed data
df_copyAge = df.copy()

# 'diagnoses' is a candidate feature here, so turn it into integer class ids
label_encoder = LabelEncoder()
df_copyAge['diagnoses'] = label_encoder.fit_transform(df_copyAge['diagnoses'])

# Forward sequential feature selection wrapped around a decision-tree regressor
dtr = DecisionTreeRegressor()
sfs = SequentialFeatureSelector(
    dtr,
    n_features_to_select=10,
    direction='forward',
)
sfs.fit(df_copyAge.drop(columns=['age']), df_copyAge['age'])

# Positions of the kept features, then their column names
selected_features_indices = sfs.get_support(indices=True)
selected_columns = df_copyAge.columns.drop('age')[selected_features_indices]

# Show which columns were selected
print("Selected Columns by SFS:", selected_columns.tolist())

Selected Columns by SFS: ['sick', 'I131 treatment', 'goitre', 'hypopituitary', 'TSH measured', 'T3 measured', 'T3', 'referral source_STMW', 'referral source_SVI', 'referral source_WEST']


# CELL FOR THE TEACHER TO RUN (TAKES ABOUT 10 MIN):

In [None]:
# NOTE from the authors: on the last day we realized that this cell was supposed
# to train the models with proj-data.csv. Instead, the models below are trained
# with the given test files. By the time we noticed, there was no time left to
# change it, so we are submitting it as is even though we know it is incorrect.


import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# O1: diagnoses - features and target come from two separate files
print("O1:\n")
df_data, df_class = preprocess_data('proj-test-data.csv', 'proj-test-class.csv')
df_dtc = dtcO1(df_data, df_class)

# O2 - Age
print("\n\n\nO2 - Age:\n")
df_dataAge, df_classAge = preprocess_data('test2-data.csv', 'test2-age.csv')
df_dtcAge = ageO2(df_dataAge, df_classAge)

# O2 - Sex
print("\n\n\nO2 - Sex:\n")
df_dataSex, df_classSex = preprocess_data('test3-data.csv', 'test3-sex.csv')
df_dtcSex = sexO2(df_dataSex, df_classSex)