Download [this dataset](https://drive.google.com/file/d/1Irwl_iBBarUuZNiSesy7KK6v8ImCRPfx/view?usp=sharing) and do the following:

---------------------------------------------------------------------------------------------------------------------


Part 1: Classify the 'Results' column using three models of your choice.
At least one must get 90% accuracy with at least 50% precision with ten fold cross validation. Print your confusion matrix.

Read in the dataset:

In [1]:
import pandas as pd

df = pd.read_csv('homework4.csv')
data = df.drop(columns=['Unnamed: 0'])

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,Results
0,-2.952927,1.184525,1.226833,-2.931165,-2.952927,-2.175042,-2.931165,-0.96947,-2.175042,-2.703467,-1.730694,0.67475,0
1,-2.391714,0.327538,0.876101,-2.734864,-2.391714,-1.747243,-2.734864,-1.153921,-1.747243,-1.688038,-1.551229,1.293129,0
2,-3.405944,0.540775,1.229768,-2.216754,-3.405944,-2.807417,-2.216754,-2.616736,-2.807417,-0.80327,0.872962,-0.728645,0
3,-0.699303,1.50891,-0.457514,0.043826,-0.699303,-0.892224,0.043826,-1.525033,-0.892224,0.406936,2.409556,-2.804476,0
4,-0.336467,1.690297,-1.05894,-0.340338,-0.336467,-0.036228,-0.340338,-1.671984,-0.036228,0.056554,1.708848,-1.761093,0


In [2]:
# Separate features (X) from the target (y)
X = data.drop(columns=['Results'])
y = data['Results']

In [3]:
import pandas as pd
import numpy as np
from IPython.display import display

def view_results(accuracies, precisions):
    stats = {
        'Min': [np.min(accuracies), np.min(precisions)],
        'Average': [np.mean(accuracies), np.mean(precisions)],
        'Max': [np.max(accuracies), np.max(precisions)]
    }
    
    # Create DataFrame
    df = pd.DataFrame(stats, index=['Accuracy', 'Precision'])
    
    # Format DataFrame
    df_formatted = df.style.format("{:.3f}") \
        .set_caption("Model Performance Metrics") \
        .set_table_styles([
            {'selector': 'caption',
             'props': [('font-size', '16pt'),
                       ('text-align', 'center'),
                       ('color', '#4F81BD'),
                       ('font-weight', 'bold')]},
            {'selector': 'th',
             'props': [('font-size', '12pt'),
                       ('background-color', '#F2F2F2'),
                       ('text-align', 'center')]},
            {'selector': 'td',
             'props': [('font-size', '12pt'),
                       ('text-align', 'center')]}
        ]) \
        .background_gradient(cmap='Blues', axis=None)
    
    display(df_formatted)


def view_confusion_matrix(conf_matrix):
    """
    Displays a formatted confusion matrix.

    Parameters:
    - conf_matrix: 2D array or DataFrame representing the confusion matrix
    - labels: list of label names for the classes (optional)
    """

    df_cm = pd.DataFrame(conf_matrix)
    df_cm.columns = [f'Predicted {i}' for i in df_cm.columns]
    df_cm.index = [f'Expected {i}' for i in df_cm.index]
    
    # Format DataFrame
    df_formatted = df_cm.style \
        .set_caption("Confusion Matrix") \
        .set_table_styles([
            {'selector': 'caption',
             'props': [('font-size', '16pt'),
                       ('text-align', 'center'),
                       ('color', '#4F81BD'),
                       ('font-weight', 'bold')]},
            {'selector': 'th',
             'props': [('font-size', '12pt'),
                       ('background-color', '#F2F2F2'),
                       ('text-align', 'center')]},
            {'selector': 'td',
             'props': [('font-size', '12pt'),
                       ('text-align', 'center')]}
        ]) \
        .background_gradient(cmap='Blues')
    
    display(df_formatted)

Classify 'Results' column with Model #1:

In [4]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Initialize Model #1: Random Forest
random_forest_model = RandomForestClassifier(random_state=0)

# Perform 10-fold cross-validation and calculate accuracy and precision
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
random_forest_accuracies = cross_val_score(random_forest_model, X, y, cv=kf, scoring='accuracy')
random_forest_precisions = cross_val_score(random_forest_model, X, y, cv=kf, scoring='precision')

# Train and evaluate using the entire dataset to obtain confusion matrix
random_forest_model.fit(X, y)
random_forest_y_pred = random_forest_model.predict(X)
random_forest_conf_matrix = confusion_matrix(y, random_forest_y_pred)


In [16]:
import joblib

joblib.dump(random_forest_model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [5]:
view_results(random_forest_accuracies, random_forest_precisions)

Unnamed: 0,Min,Average,Max
Accuracy,0.905,0.913,0.922
Precision,0.808,0.873,0.921


In [6]:
view_confusion_matrix(random_forest_conf_matrix)

Unnamed: 0,Predicted 0,Predicted 1
Expected 0,8591,0
Expected 1,0,1409


Classify 'Results' column with Model #2:

In [7]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Initialize Model #2: Naive Bayes
knn_model = KNeighborsClassifier()

# Perform 10-fold cross-validation and calculate accuracy and precision
kf = StratifiedKFold (n_splits=10, shuffle=True, random_state=0)
knn_accuracies = cross_val_score(knn_model, X, y, cv=kf, scoring='accuracy')
knn_precisions = cross_val_score(knn_model, X, y, cv=kf, scoring='precision')

# Train and evaluate using the entire dataset to obtain confusion matrix
knn_model.fit(X, y)
knn_y_pred = knn_model.predict(X)
knn_conf_matrix = confusion_matrix(y, knn_y_pred)

In [17]:
joblib.dump(knn_model, 'knn_model.pkl')

['knn_model.pkl']

In [8]:
view_results(knn_accuracies, knn_precisions)

Unnamed: 0,Min,Average,Max
Accuracy,0.905,0.911,0.926
Precision,0.829,0.863,0.924


In [9]:
view_confusion_matrix(knn_conf_matrix)

Unnamed: 0,Predicted 0,Predicted 1
Expected 0,8524,67
Expected 1,719,690


Classify 'Results' column with Model #3:

In [10]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier

# Initialize Model #3: Neural Network
neural_network_model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=200, random_state=0)

# Perform 10-fold cross-validation and calculate accuracy and precision
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
neural_network_accuracies = cross_val_score(neural_network_model, X, y, cv=kf, scoring='accuracy')
neural_network_precisions = cross_val_score(neural_network_model, X, y, cv=kf, scoring='precision')

# Train and evaluate using the entire dataset to obtain confusion matrix
neural_network_model.fit(X, y)
neural_network_y_pred = neural_network_model.predict(X)
neural_network_conf_matrix = confusion_matrix(y, neural_network_y_pred)

In [18]:
joblib.dump(neural_network_model, 'neural_network_model.pkl')

['neural_network_model.pkl']

In [11]:
view_results(neural_network_accuracies, neural_network_precisions)

Unnamed: 0,Min,Average,Max
Accuracy,0.907,0.914,0.926
Precision,0.843,0.888,0.932


In [12]:
view_confusion_matrix(neural_network_conf_matrix)

Unnamed: 0,Predicted 0,Predicted 1
Expected 0,8514,77
Expected 1,761,648


---------------------------------------------------------------------------------------------------------------------


Part 2: Run PCA and discover how many dimensions you can reduce the problem to before you start seeing significant decreases in accuracy.

Run PCA:

In [13]:
from sklearn.decomposition import PCA

# Initialize variables to store results
dim_accuracies = {}
rf_model_pca = RandomForestClassifier(random_state=0)


# Loop through various numbers of dimensions
for n_components in range(1, X.shape[1] + 1):
    # Apply PCA with n components
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    # Evaluate model accuracy with cross-validation on reduced data
    accuracy = cross_val_score(rf_model_pca, X_pca, y, cv=kf, scoring='accuracy').mean()
    
    # Store accuracy for the current number of components
    dim_accuracies[n_components] = accuracy

# Display results
dim_accuracies

{1: 0.7664000000000001,
 2: 0.8907999999999999,
 3: 0.8994,
 4: 0.9051,
 5: 0.9095000000000001,
 6: 0.9099,
 7: 0.9097,
 8: 0.9103,
 9: 0.9103999999999999,
 10: 0.9096,
 11: 0.9102,
 12: 0.9101000000000001}

In [14]:
# Initialize variables to store results
dim_accuracies = {}
knn_model_pca = KNeighborsClassifier()


# Loop through various numbers of dimensions
for n_components in range(1, X.shape[1] + 1):
    # Apply PCA with n components
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    # Evaluate model accuracy with cross-validation on reduced data
    accuracy = cross_val_score(knn_model_pca, X_pca, y, cv=kf, scoring='accuracy').mean()
    
    # Store accuracy for the current number of components
    dim_accuracies[n_components] = accuracy

# Display results
dim_accuracies

{1: 0.8386999999999999,
 2: 0.8906000000000001,
 3: 0.8946000000000002,
 4: 0.9046000000000001,
 5: 0.9108000000000003,
 6: 0.9108000000000003,
 7: 0.9108000000000003,
 8: 0.9108000000000003,
 9: 0.9108000000000003,
 10: 0.9108000000000003,
 11: 0.9108000000000003,
 12: 0.9108000000000003}

In [15]:
# Initialize variables to store results
dim_accuracies = {}
nn_model_pca = MLPClassifier(hidden_layer_sizes=(50,), max_iter=200, random_state=0)


# Loop through various numbers of dimensions
for n_components in range(1, X.shape[1] + 1):
    # Apply PCA with n components
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    # Evaluate model accuracy with cross-validation on reduced data
    accuracy = cross_val_score(nn_model_pca, X_pca, y, cv=kf, scoring='accuracy').mean()
    
    # Store accuracy for the current number of components
    dim_accuracies[n_components] = accuracy

# Display results
dim_accuracies



{1: 0.8591,
 2: 0.8991,
 3: 0.9018,
 4: 0.9082000000000001,
 5: 0.9144,
 6: 0.9145000000000001,
 7: 0.9150000000000003,
 8: 0.9145999999999999,
 9: 0.9141000000000001,
 10: 0.9137000000000001,
 11: 0.9144,
 12: 0.9138}

How many dimensions can you reduce? (Explain your answer)

After running PCA on the Ten-Fold Cross Validation over the RandomForest, KNN, and MLP classifiers, we see a 1-2% drop in accuracy when reducing 1-11 dimensions. When reducing the 12th dimension (i.e. we only have 1 dimension/feature X when predicting our target y), we see a 5-10% drop in accuracy. This implies that we can see consistently highly accurate classifications with either all 12 dimensions or even just 2 dimensions, but with 1 dimension of the feature we may have reduced the dimensions by too much via PCA. Therefore, we claim that we can reduce 11 dimensions at most through PCA analysis when making predictions for this dataset.