# Machine Learning: Letter Recognition

In [54]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
# Read the training and test splits from CSV files (paths are relative to the working directory)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("Letters_train_2024.csv", "Letters_test_2024.csv")
)

# Question 1 

In [34]:
# Add a binary target column to both splits: 'Yes' when the letter is B, 'No' otherwise
for letters_df in (train_data, test_data):
    letters_df['isB'] = np.where(letters_df['letter'] == 'B', 'Yes', 'No')

In [35]:
# Separate the predictors from the binary 'isB' target for both splits.
# Columns that are not predictors: 'Unnamed: 0' (presumably a saved row index -- confirm
# against the CSV), the raw 'letter' label, and the derived 'isB' target.
non_feature_cols = ['Unnamed: 0', 'letter', 'isB']

X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['isB']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['isB']

# Quick look at the first few training predictors and labels
print(X_train.head())
print(y_train.head())

   xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  x2ybar  \
0     6     9      6       4      4     5     8      3      6      7       4   
1     5    10      6       6      3     6    13      9      3      7       2   
2     2     2      3       4      3     7     7      5      6      7       6   
3     4     4      5       7      4     7     6      9      7      6       6   
4     4    10      3       8      7     7     6      9      6      7       7   

   xy2bar  xedge  xedgeycor  yedge  yedgexcor  
0      10      5          7      5          7  
1       9      3          7      5         10  
2       7      3          7      5          8  
3       7      2          7      9         10  
4       7      2          9      8         10  
0     No
1     No
2     No
3    Yes
4    Yes
Name: isB, dtype: object


### Part A: Baseline Model 

In [36]:
# Q1A code
# Baseline: always predict whichever label is most frequent in the training set
most_common_label = y_train.value_counts().idxmax()
baseline_predictions = [most_common_label for _ in range(len(y_test))]

# Score the constant prediction against the test labels
baseline_1_acc = accuracy_score(y_true=y_test, y_pred=baseline_predictions)
print(f'Baseline Test Accuracy: {baseline_1_acc:.4f}')

Baseline Test Accuracy: 0.7540


### Part B: Logistic Regression 

In [37]:
# Q1B: fit a logistic regression on the training split, then classify the test rows
# with a 0.5 probability threshold and report test accuracy.
log_reg_model = LogisticRegression(random_state=2024, max_iter=1000)
log_reg_model.fit(X_train, y_train)

# predict_proba columns are ordered like log_reg_model.classes_, so look up the
# 'Yes' column explicitly instead of assuming it is column 1.
yes_col = list(log_reg_model.classes_).index('Yes')
y_pred_prob = log_reg_model.predict_proba(X_test)[:, yes_col]  # P(isB == 'Yes')

# Make predictions (using a threshold of 0.5)
y_pred = np.where(y_pred_prob >= 0.5, 'Yes', 'No')

# Calculate accuracy
model_1b_acc = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Test Accuracy: {model_1b_acc:.4f}')

Logistic Regression Test Accuracy: 0.9369


### Part C: AUC 

In [38]:
# Q1C code
# Test-set AUC of the logistic regression, computed from its predicted 'Yes'
# probabilities (y_pred_prob) rather than from the thresholded labels
model_1b_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f'Logistic Regression Test AUC: {model_1b_auc:.4f}')

Logistic Regression Test AUC: 0.9791


### Part D: Cross-validated CART 

For each candidate ccp_alpha, 5-fold cross-validation is performed on the training data. The ccp_alpha with the highest mean cross-validated accuracy is selected and used to train the final model on the full training set.

In [39]:
# Q1D Code
# Candidate pruning strengths come from the cost-complexity pruning path of an
# unpruned tree. No separate fit() call is needed beforehand:
# cost_complexity_pruning_path fits the tree internally.
cart_model = DecisionTreeClassifier(random_state=2024)
path = cart_model.cost_complexity_pruning_path(X_train, y_train)

# Floating-point error can leave tiny negative alphas in the path, which
# DecisionTreeClassifier rejects as invalid; clamp them to zero.
ccp_alphas = np.clip(path.ccp_alphas, 0.0, None)

# Mean 5-fold cross-validated accuracy for each candidate ccp_alpha
cv_scores = []
for alpha in ccp_alphas:
    temp_model = DecisionTreeClassifier(random_state=2024, ccp_alpha=alpha)
    scores = cross_val_score(temp_model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

# Pick the ccp_alpha with the highest mean CV accuracy
# (np.argmax returns the first maximum, i.e. the smallest such alpha)
best_ccp_alpha_index = np.argmax(cv_scores)
best_ccp_alpha = ccp_alphas[best_ccp_alpha_index]

# Refit on the full training set with the selected ccp_alpha
final_cart_model = DecisionTreeClassifier(random_state=2024, ccp_alpha=best_ccp_alpha)
final_cart_model.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = final_cart_model.predict(X_test)

model_1d_acc = accuracy_score(y_test, y_pred)
model_1d_best_ccp_alpha = best_ccp_alpha
print(f'CV CART Test Accuracy: {model_1d_acc:.4f}')
print(f'Best ccp_alpha: {model_1d_best_ccp_alpha:.4f}')

CV CART Test Accuracy: 0.9455
Best ccp_alpha: 0.0013


### Part E: Random Forest 


In [40]:
# Q1E Code
# Fit a random forest with default hyperparameters (seeded so results are repeatable);
# fit() returns the estimator itself, so construction and training are chained
rf_model = RandomForestClassifier(random_state=2024).fit(X_train, y_train)

# Score the forest's predictions on the test set
y_pred = rf_model.predict(X_test)
model_1e_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Random Forest Test Accuracy: {model_1e_acc:.4f}')

Random Forest Test Accuracy: 0.9754


### Part F: Performance Comparison 

Based on the test accuracy, the Random Forest model achieves the highest accuracy (0.975), so it performs best on the test set. Since the goal in letter recognition is to correctly recognize the letter, accuracy is arguably the most important metric for comparing these models.

In [41]:
# Q1F Code
# Create df to compare performance
performance_data = {
    'Model': ['Logistic Regression', 'CART', 'Random Forest'],
    'Test Accuracy': [model_1b_acc, model_1d_acc, model_1e_acc]
}

# Create DataFrame to compare performance
performance_df = pd.DataFrame(performance_data)
# Print the performance comparison DataFrame
print(performance_df)

                 Model  Test Accuracy
0  Logistic Regression       0.936898
1                 CART       0.945455
2        Random Forest       0.975401


***
# Question 2 

In [47]:
# Question 2 is multi-class: the target becomes the letter itself.
# X_train / X_test are reused unchanged from Question 1.
y_train, y_test = train_data['letter'], test_data['letter']
print(y_train.head(), y_test.head(), sep='\n')

0    R
1    R
2    R
3    B
4    B
Name: letter, dtype: object
0    B
1    P
2    A
3    B
4    P
Name: letter, dtype: object


### Part A: Baseline Model 

In [48]:
# Q2A
# Baseline: predict the most frequent training letter for every test row
most_common_label = y_train.value_counts().idxmax()
baseline_predictions = [most_common_label for _ in range(len(y_test))]

# Accuracy of the constant prediction on the test set
baseline_2_acc = accuracy_score(y_true=y_test, y_pred=baseline_predictions)
print(f'Baseline Test Accuracy: {baseline_2_acc:.4f}')

Baseline Test Accuracy: 0.2610


### Part B: LDA 

In [50]:
# Q2B code
# Fit linear discriminant analysis with default settings
# (fit() returns the estimator, so construction and training are chained)
lda_model = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Score the predicted letters on the test set
y_pred = lda_model.predict(X_test)
model_2b_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'LDA Test Accuracy: {model_2b_acc:.4f}')

LDA Test Accuracy: 0.9209


### Part C: Cross-validated CART

* We used 5-fold cross-validation on each candidate value of ccp_alpha to determine which value gives the best average accuracy on the training set. The ccp_alpha value with the highest cross-validated accuracy was selected to build the final model. The test set accuracy of the resulting CART model is 0.9091.

In [51]:
# Q2C Code
# Candidate pruning strengths come from the cost-complexity pruning path of an
# unpruned tree. No separate fit() call is needed beforehand:
# cost_complexity_pruning_path fits the tree internally.
cart_model = DecisionTreeClassifier(random_state=2024)
path = cart_model.cost_complexity_pruning_path(X_train, y_train)

# Floating-point error can leave tiny negative alphas in the path, which
# DecisionTreeClassifier rejects as invalid; clamp them to zero.
ccp_alphas = np.clip(path.ccp_alphas, 0.0, None)

# Mean 5-fold cross-validated accuracy for each candidate ccp_alpha
cv_scores = []
for alpha in ccp_alphas:
    temp_model = DecisionTreeClassifier(random_state=2024, ccp_alpha=alpha)
    scores = cross_val_score(temp_model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

# Pick the ccp_alpha with the highest mean CV accuracy
# (np.argmax returns the first maximum, i.e. the smallest such alpha)
best_ccp_alpha_index = np.argmax(cv_scores)
model_2c_best_ccp_alpha = ccp_alphas[best_ccp_alpha_index]

# Refit on the full training set with the selected ccp_alpha
final_cart_model = DecisionTreeClassifier(random_state=2024, ccp_alpha=model_2c_best_ccp_alpha)
final_cart_model.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = final_cart_model.predict(X_test)

model_2c_acc = accuracy_score(y_test, y_pred)
print(f'Best ccp_alpha: {model_2c_best_ccp_alpha:.4f}')
print(f'CART Test Accuracy: {model_2c_acc:.4f}')

Best ccp_alpha: 0.0006
CART Test Accuracy: 0.9091


### Part D: Vanilla Bagging (8 points)

In [52]:
# Q2D
# Vanilla bagging expressed as a random forest that considers every feature
# at each split (m = p)
n_features = len(X_train.columns)
bagging_model = RandomForestClassifier(
    n_estimators=100,
    max_features=n_features,
    random_state=2024,
).fit(X_train, y_train)

# Score the bagged trees on the test set
y_pred = bagging_model.predict(X_test)
model_2d_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'No CV Random Forest Test Accuracy: {model_2d_acc:.4f}')

No CV Random Forest Test Accuracy: 0.9455


### Part E: Cross-validated Random Forest (10 points)

* We performed 5-fold cross-validation for each value of max_features in our candidate list, and the average accuracy across the five folds was calculated for each value of max_features. Then, the max_features value with the highest cross-validation accuracy was selected. The CV Random Forest test accuracy is 0.9668.

In [53]:
# Q2E
# Candidate values of max_features: every integer from 1 to p.
# The 'sqrt' and 'log2' shortcuts are left out because scikit-learn resolves them
# to integers that already lie in this range, so they would only repeat
# cross-validation work for a forest that is evaluated anyway.
max_features_options = list(range(1, X_train.shape[1] + 1))

# Mean 5-fold cross-validated accuracy for each candidate.
# The loop uses its own model variable so the Question 1 `rf_model` is not overwritten.
cv_scores = []
for candidate_max_features in max_features_options:
    candidate_rf = RandomForestClassifier(
        n_estimators=100, max_features=candidate_max_features, random_state=2024
    )
    fold_scores = cross_val_score(candidate_rf, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(fold_scores.mean())

# Select the max_features value with the highest mean CV accuracy
# (np.argmax returns the first maximum, i.e. the smallest such value)
best_max_features_index = np.argmax(cv_scores)
best_max_features = max_features_options[best_max_features_index]
best_cv_accuracy = cv_scores[best_max_features_index]

# Refit on the full training set with the selected max_features value
final_rf_model = RandomForestClassifier(n_estimators=100, max_features=best_max_features, random_state=2024)
final_rf_model.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = final_rf_model.predict(X_test)

model_2e_acc = accuracy_score(y_test, y_pred)
print(f'Max features are: {best_max_features}')
print(f'CV Random Forest Test Accuracy: {model_2e_acc:.4f}')

Max features are: 2
CV Random Forest Test Accuracy: 0.9668


### Part F: Gradient Boosting Classifier (9 points)

In [55]:
# Q2F
# Gradient boosting with 300 boosting stages and at most 15 leaves per tree
# (seeded so results are repeatable); fit() returns the estimator itself
gbc_model = GradientBoostingClassifier(
    n_estimators=300,
    max_leaf_nodes=15,
    random_state=2024,
).fit(X_train, y_train)

# Score the boosted model on the test set
y_pred = gbc_model.predict(X_test)
model_2f_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'GBC Test Accuracy: {model_2f_acc:.4f}')

GBC Test Accuracy: 0.9722
