In [1]:
import os
import time
from urllib.parse import quote

import dtale
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Get user PUUID using their Riot ID and Tagline
The IDs were chosen by opening the LoL leaderboards and using a random number generator (1-1000) to select one player from each rank.

In [10]:
# Read the Riot API key from the environment so that it is never committed with the notebook.
# Development keys expire after 24 hours: export a fresh RIOT_API_KEY before running this cell.
api_key = os.environ.get('RIOT_API_KEY')
if not api_key:
    raise RuntimeError('Set the RIOT_API_KEY environment variable to a valid Riot API key.')

# The key is sent as a header with every request; later cells reuse `headers`
headers = {'X-Riot-Token': api_key}

# Define the RiotID and Tagline (the two lists are paired by position)
riot_ids = ['The Baron Buff', 'yer so vayne', 'CNY14', 'seajay28', 'Ch4t Restricted', 'Polite is matter', 'Dexless', 'Scanning', 'Jayweii', 'Ppalli']
taglines = ['Corp', 'NA1', 'NA1', 'NA1', 'NA1', 'NA1', 'NA1', '1213', 'Bae', '777']
puuids = []

for riot_id, tagline in zip(riot_ids, taglines):
    # Riot IDs contain spaces (and may contain other reserved characters),
    # so percent-encode each path segment before building the URL
    summoner_url = (
        'https://americas.api.riotgames.com/riot/account/v1/accounts/by-riot-id/'
        f"{quote(riot_id, safe='')}/{quote(tagline, safe='')}"
    )

    # Make the API request
    response = requests.get(summoner_url, headers=headers)

    # Keep the PUUID on success, otherwise report the failure and move on
    if response.status_code == 200:
        puuids.append(response.json()['puuid'])
    else:
        print(f"Error: {response.status_code} - {response.text}")

print(puuids)

['rmx4KevIeySe_KyU0zbL4tSlB3-TEJv7rq1TnwdP7eqMQomFzJcgf9_QNvXLMDSTGwlBcs6IGwkhCw', 'QaNhg4o33-p-4iNV66Qruym2oj0KI2NSutcpbpfAeKkqyO8O7pE0kM5ppObtD220uAV49_DI6oKc-g', 'wNXNFMDN2pcXHK7uruuT5uSiZBKGsiuXLbBxt_iiUhcxyYzKD0hqTV5Ba4lJmUnjnXO_uo2qea0kDg', 'oTvLhuNxvAaecqftOUJ5-P2GGiXWmmkXbxmj-kDPHTenjwRqUQWuIjPu0OGW7Ucrqs9GR2T3VV-hxg', 'XaPR5pNeVfA5TjOcJlZUgbA4A-qh3vQTTPljvKQUjPTymMF2yB2FthiCuwgoVIPKg8pie4L67gBzaQ', 'oyTXQGNARDnGUreTHWuUw0gq2GQ8CvqSH7gkHE28Ur019cTYqEWBuZ1tZS2bIdcEpekFCNr3P24ZKg', 'T-LtTmI2PtGX3EJ65XwJnL1OoqNDoVA82vDbIg61VQrfK36nSdj-nS-mHwcIXaRe8nRdPJl7n0AOcA', '0m7V-AdBUCBR0IPX3X9uI8c7_lUu7Y2wZvHKY-1E_3lnc6B63vnYIiEVIQAs-O99tTheDmVrWEVe1A', 'ygS_CFnIxzk8rQsX1NMk2AA_xr3CxGd9qehpuIAM_9yeF_cbrGlGaZCQTWrS_XEmn14xSsU9oPqKiA', '2d2kRbLq5OvaWwOjdFeRyqjyQIihawpVqPiFouj6Y9JoXUR-4CRjHMr27S04ZLuXtR4c1dDpK-Eruw']


# Get user match history (past 50 games)

In [11]:
# Request the most recent matches of every user via their PUUID
# Maximum of 100 previous games per request
number_of_matches = 50
match_ids = []

for puuid in puuids:
    # Match Id API URL
    match_ids_url = f'https://americas.api.riotgames.com/lol/match/v5/matches/by-puuid/{puuid}/ids?start=0&count={number_of_matches}'

    # Make the API request
    response = requests.get(match_ids_url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        match_ids.extend(response.json())
    else:
        print(f"Error: {response.status_code} - {response.text}")

# Two sampled players may have played in the same match. Drop repeated IDs
# (keeping first-seen order) so each match is downloaded only once.
match_ids = list(dict.fromkeys(match_ids))

# Get match information

In [12]:
# Request info on matches based on match IDs
match_info_list = []
counter = 0

# Pause for `window_seconds` after every `requests_per_window` requests
# because of request rate limitations
requests_per_window = 100
window_seconds = 120

# Number of times a rate-limited request (HTTP 429) is retried before giving up
max_retries = 3

for position, match_id in enumerate(match_ids, start=1):
    # Match info API URL
    match_info_url = f'https://americas.api.riotgames.com/lol/match/v5/matches/{match_id}'

    # Make the API request
    response = requests.get(match_info_url, headers=headers)

    # The counter below does not know about requests made by earlier cells, so the
    # limit can still be hit. Retry instead of silently losing the match.
    retries = 0
    while response.status_code == 429 and retries < max_retries:
        # Prefer the wait time suggested by the server, fall back to a full window
        try:
            wait_seconds = int(response.headers.get('Retry-After', window_seconds))
        except ValueError:
            wait_seconds = window_seconds
        time.sleep(wait_seconds)

        retries += 1
        counter = 0
        response = requests.get(match_info_url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        match_info_list.append(response.json())
    else:
        print(f"Error: {response.status_code} - {response.text}")

    counter += 1

    # No pause is needed after the final request
    if counter >= requests_per_window and position < len(match_ids):
        # Reset counter
        counter = 0
        time.sleep(window_seconds)

# Extract desired user stats for every match using their PUUID

In [20]:
# Lookup from the numeric team ID to the side's name
teamId_dict = {100: 'Blue', 200: 'Red'}

# Per-match containers (match stats / challenge stats) for the last XXX games
match_stats = {}
challenge_stats = {}

# Per-match team objective stats, one dictionary per side
objectives_blue_total = {}
objectives_red_total = {}

# Participants order in match history: Blue (Top, Jungle, Mid, Bot, Support), Red (Top, Jungle, Mid, Bot, Support)
role_values = ['Top', 'Jungle', 'Mid', 'Bot', 'Support'] * 2

# Participant position (0-9) -> role played in the match
role_dict = dict(enumerate(role_values))


def flatten_objectives(team):
    """Flatten one team's nested objectives into a single-level dictionary.

    Parameters
    ----------
    team : dict
        One entry of ``match_info['info']['teams']``; it must provide the
        ``'objectives'`` mapping (objective -> {stat: value}) and the ``'win'`` flag.

    Returns
    -------
    dict
        One key per objective/stat pair, named by joining the objective with the
        capitalised stat (e.g. ``'baron'`` + ``'first'`` -> ``'baronFirst'``),
        plus ``'win'`` telling whether the team won the game.
    """
    flattened = {
        f'{objective}{stat.capitalize()}': value
        for objective, stats in team['objectives'].items()
        for stat, value in stats.items()
    }
    flattened['win'] = team['win']
    return flattened


for match_info in match_info_list:
    # Take the ID from the payload itself. Failed downloads are skipped when
    # match_info_list is built, so its positions do not line up with match_ids.
    match_id = match_info['metadata']['matchId']

    # NOTE(review): assumes teams[0] is the blue side and teams[1] the red side - confirm
    blue_team, red_team = match_info['info']['teams'][0], match_info['info']['teams'][1]

    # Store the flattened team stats (total kills, baron, dragons, grubs/horde,
    # rift herald, towers, inhibitors) under the match ID
    objectives_blue_total[match_id] = flatten_objectives(blue_team)
    objectives_red_total[match_id] = flatten_objectives(red_team)

    

# Convert to DataFrame
match_stats_df = pd.DataFrame.from_dict(match_stats, orient='index')
challenge_stats_df = pd.DataFrame.from_dict(challenge_stats, orient='index')
objectives_blue_df = pd.DataFrame.from_dict(objectives_blue_total, orient='index')
objectives_red_df = pd.DataFrame.from_dict(objectives_red_total, orient='index')

# Create dummy variables for inhibitors, hordes and dragons.
# dtype=bool is requested explicitly: the default dummy dtype differs between pandas
# versions, and the modelling cells pick their features by checking for bool columns.
dummy_columns = ['inhibitorKills', 'dragonKills', 'hordeKills']
objectives_blue_df = pd.get_dummies(objectives_blue_df, columns=dummy_columns, prefix=dummy_columns, dtype=bool)
objectives_red_df = pd.get_dummies(objectives_red_df, columns=dummy_columns, prefix=dummy_columns, dtype=bool)

# Reposition win column for clarity (pop() runs first, then the column is re-inserted in front)
for team_df in (objectives_blue_df, objectives_red_df):
    team_df.insert(0, 'win', team_df.pop('win'))

#dtale.show(objectives_blue_df).open_browser()
#dtale.show(objectives_red_df).open_browser()

# Combine the two DataFrames
objectives_data = pd.concat([objectives_blue_df, objectives_red_df], ignore_index = True)

# A kill count seen for only one side produces a dummy column that is missing from the
# other frame, which concat fills with NaN. For an indicator column "missing" means
# "not this count", so map NaN to False (True/False values are left unchanged).
dummy_prefixes = tuple(f'{name}_' for name in dummy_columns)
indicator_columns = [col for col in objectives_data.columns if col.startswith(dummy_prefixes)]
if indicator_columns:
    objectives_data[indicator_columns] = objectives_data[indicator_columns].eq(True)

dtale.show(objectives_data).open_browser()

2024-03-20 00:09:49,343 - INFO     - Executing shutdown...
2024-03-20 00:09:49,348 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer


# Selecting a Machine Learning Model

## Using sklearn machine learning libraries
### Supervised Learning Algorithms

**The data and results shown below come from a set of 500 previous games, and that set changes every time the notebook is run. The results below will therefore not always match the output of the code and should only be used to follow the thought process.**

Since the data we are working with is labelled with wins and losses, it makes the most sense to use a supervised learning algorithm.

Potential options include:
- Logistic Regression
- K Nearest Neighbour
- Decision Tree
- Random Forest
- Gradient Boosting

The data used to train these models is binary/categorical and records whether or not the blue team got the first baron, champion kill, dragon, horde, inhibitor, rift herald, and tower.

It is worth mentioning that due to the nature of these variables, if they are TRUE for the blue team, they are FALSE for the red team. So, the categorical variables in objectives_red_df are the opposite of objectives_blue_df, making it unnecessary to model both data frames.

**Strength of Association**

We can use a chi2 contingency table to view the association between the features and the target variable 'win'.
```
            feature        chi2          pval
0    inhibitorFirst  206.514115  7.914379e-47
1        baronFirst  116.699649  3.340054e-27
2     dragonKills_0   71.839575  2.334235e-17
3        towerFirst   66.724524  3.122265e-16
4     dragonKills_4   32.554770  1.158804e-08
5     dragonKills_1   22.728502  1.865798e-06
6       dragonFirst   22.412765  2.199076e-06
7     championFirst   22.406803  2.205912e-06
8     dragonKills_2   20.979788  4.641544e-06
9   riftHeraldFirst   18.314750  1.872518e-05
10     hordeKills_6   14.930831  1.115254e-04
11    dragonKills_3   13.692772  2.152814e-04
12     hordeKills_0   13.430207  2.476048e-04
13    dragonKills_5    9.110863  2.540959e-03
14       hordeFirst    7.957599  4.788590e-03
15     hordeKills_1    2.051322  1.520741e-01
16     hordeKills_5    1.531607  2.158714e-01
17     hordeKills_2    0.640297  4.236033e-01
18     hordeKills_3    0.491195  4.833947e-01
19     hordeKills_4    0.385742  5.345460e-01
20    dragonKills_6    0.001944  9.648352e-01
```

### Machine Learning Model Metrics
Due to the nature of League of Legends, a player's win rate should be close to 50%, meaning the target variable "win" is well balanced. So, accuracy acts as a simple yet useful indicator of the effectiveness of our machine learning models and will be the primary metric taken into consideration when choosing a model. At the time of testing, the win rate in the data set is 47.8% (239/500).

Confusion Matrix = [[TN, FP], [FN, TP]]

Accuracy = (TP + TN)/(TP + FP + TN + FN)

Precision = TP/(TP + FP)

Recall = TP/(TP + FN)

F1 score = 2 * (Precision * Recall)/(Precision + Recall), i.e. the harmonic mean of precision and recall

In [None]:
# Keep every boolean column except the target as a feature
X_cols = [
    col for col in objectives_blue_df.columns
    if col != 'win' and objectives_blue_df[col].dtypes == bool
]

# Features (X) and target (y)
X_data = objectives_blue_df[X_cols]
y_data = objectives_blue_df['win']

# Chi-square test of association between each feature and the target variable,
# computed from the feature-vs-win contingency table
association_rows = []
for feature in X_cols:
    statistic, p_value, *_ = chi2_contingency(pd.crosstab(X_data[feature], y_data))
    association_rows.append((statistic, p_value))

chi2_list = [statistic for statistic, _ in association_rows]
pval_list = [p_value for _, p_value in association_rows]

# Strongest association first
chi2_df = (
    pd.DataFrame({'feature': X_cols, 'chi2': chi2_list, 'pval': pval_list})
    .sort_values(by='chi2', ascending=False)
    .reset_index(drop=True)
)
print(chi2_df)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42, test_size=0.2)


def evaluate_classifier(title, model, X_train, X_test, y_train, y_test):
    """Fit a classifier on the training split and report its metrics on the test split.

    Prints a banner, the confusion matrix and the accuracy, precision, recall
    and F1 scores of the predictions made for ``X_test``.

    Parameters
    ----------
    title : str
        Text shown inside the banner printed above the results.
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``; it is fitted in place.
    X_train, X_test, y_train, y_test
        The feature/target splits returned by ``train_test_split``.

    Returns
    -------
    tuple
        ``((accuracy, precision, recall, f1), y_results)`` where ``y_results`` is a
        DataFrame holding the true outcome, the prediction, both class
        probabilities and the test features, for manual inspection.
    """
    print(f'\n#===================== {title} =====================#')

    # Fit training data
    model.fit(X_train, y_train)

    # Predict if the game is a win or a loss on the test data
    y_pred = model.predict(X_test)

    # Confidence the model has in each of its predictions
    y_pred_prob = model.predict_proba(X_test)

    # Store summary of results in a new DataFrame
    y_results = pd.DataFrame(y_test)
    y_results['y_pred'] = y_pred
    y_results['y_loss_prob'] = y_pred_prob[:, 0]
    y_results['y_win_prob'] = y_pred_prob[:, 1]
    y_results = pd.concat([y_results, X_test], axis=1)

    # Create confusion matrix comparing the predicted outcomes to the real outcomes
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))

    scores = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
    labels = ('\nAccuracy Score', 'Precision Score', 'Recall Score', 'F1 Score')
    for label, score in zip(labels, scores):
        print(f'{label}: {score}')

    return scores, y_results


# The tree-based models are randomised; a fixed seed makes the comparison repeatable
lr = LogisticRegression()
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)

# y_results always holds the table of the most recently evaluated model
#dtale.show(y_results).open_browser()
(lr_accuracy, lr_precision, lr_recall, lr_f1), y_results = evaluate_classifier(
    'Simple Logistic Regression', lr, X_train, X_test, y_train, y_test)

(knn_accuracy, knn_precision, knn_recall, knn_f1), y_results = evaluate_classifier(
    'K Nearest Neighbour', knn, X_train, X_test, y_train, y_test)

(dtc_accuracy, dtc_precision, dtc_recall, dtc_f1), y_results = evaluate_classifier(
    'Decision Tree Classifier', dtc, X_train, X_test, y_train, y_test)

(rfc_accuracy, rfc_precision, rfc_recall, rfc_f1), y_results = evaluate_classifier(
    'Random Forest Classifier', rfc, X_train, X_test, y_train, y_test)

(gbc_accuracy, gbc_precision, gbc_recall, gbc_f1), y_results = evaluate_classifier(
    'Gradient Boosting Classifier', gbc, X_train, X_test, y_train, y_test)

# Incorporating K-Fold Cross-Validation

This makes it so that every piece of data is used for training and testing and allows us to collect the mean score of every metric across various training/testing fits, making our results more reliable.

**Results**

After running the code several times, the accuracy of the logistic regression model consistently outperforms or is equal to the next best model, making it the model of choice.

In [None]:
# Extract binary objective results (every boolean column except the target)
X_cols = [
    col for col in objectives_blue_df.columns
    if col != 'win' and objectives_blue_df[col].dtypes == bool
]

# Separating X and y
X_data = objectives_blue_df[X_cols]
y_data = objectives_blue_df['win']

# Extract features and target as NumPy arrays
X = X_data.values
y = y_data.values

# Initialize KFold
# Use 5 folds to match the 80/20 train/test split of our previous tests.
# The rows are stored player by player, so shuffle them (train_test_split shuffles too);
# otherwise every fold would be tested on the games of only a few players.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store score functions in dictionary for easier looping
metric_functions = {'accuracy': accuracy_score, 'precision': precision_score, 'recall': recall_score, 'f1': f1_score}

# Models created in the comparison cell, keyed by the column name used in the summary
models = {'logisticRegression': lr,
          'kNearestNeighbour': knn,
          'decisionTreeClassifier': dtc,
          'randomForestClassifier': rfc,
          'gradientBoostingClassifier': gbc}

# Name of the summary column that holds the mean scores of each model
mean_columns = {'logisticRegression': 'lrMean',
                'kNearestNeighbour': 'knnMean',
                'decisionTreeClassifier': 'dtcMean',
                'randomForestClassifier': 'rfcMean',
                'gradientBoostingClassifier': 'gbcMean'}

# Will store predicted target variables for each model
model_pred = {}

# One list of per-fold scores for every model/metric pair
fold_scores = {model_name: {metric_name: [] for metric_name in metric_functions} for model_name in models}

# Split the data using KFold and train on our models
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model_pred[model_name] = model.predict(X_test)

        for metric_name, metric_function in metric_functions.items():
            # Scores are stored rounded to 2 decimal points
            score = round(metric_function(y_test, y_pred), 2)
            fold_scores[model_name][metric_name].append(score)

# Rows are the metrics, columns the models, every cell the list of per-fold scores
metrics = pd.DataFrame(fold_scores)

# Add the mean of each score for each model
for model_name, mean_column in mean_columns.items():
    metrics[mean_column] = metrics[model_name].apply(lambda scores: np.mean(scores))

print(metrics)

# After running the code several times, the accuracy of the logistic regression model consistently
# outperforms or is equal to the next best model, making it the model of choice.

# Logistic Regressor - Improving Model Performance
## Looking at the confusion matrix

By looking at the confusion matrix, we can see that there are consistently and noticeably more false positives than false negatives. We may be able to improve our accuracy by changing the acceptance threshold of our model.

It was determined that the optimal threshold is 0.55 with an accuracy of 0.88. 

In [None]:
# Boolean columns other than the target are the features
X_cols = [
    col for col in objectives_blue_df.columns
    if col != 'win' and objectives_blue_df[col].dtypes == bool
]

# Features and target
X_data = objectives_blue_df[X_cols]
y_data = objectives_blue_df['win']

# Same 80/20 split (and seed) as in the model comparison
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42, test_size=0.2)

# Fresh logistic regression fitted on the training data
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Default predictions, and the predicted probability of the second class (column 1) for every test game
y_pred = lr.predict(X_test)
y_pred_prob = lr.predict_proba(X_test)[:, 1]

# Candidate acceptance thresholds: 0, 0.05, ..., 1
custom_thresholds = np.linspace(0, 1, 21)

# Accuracy reached when a game is predicted as a win once its probability reaches the threshold
# NOTE(review): the threshold is selected on the same test split it is scored on - confirm this is intended
accuracies = [accuracy_score(y_test, y_pred_prob >= threshold) for threshold in custom_thresholds]

# First threshold that reaches the highest accuracy
max_index = int(np.argmax(accuracies))
max_accuracy = accuracies[max_index]
optimal_threshold = custom_thresholds[max_index]

# Accuracy curve over the thresholds, with the best point highlighted
ax = sns.lineplot(x=custom_thresholds, y=accuracies)
ax.scatter(optimal_threshold, max_accuracy, color='red', label='Max Accuracy')

# Axis labels and a text label next to the best point
ax.set(xlabel='Thresholds', ylabel='Accuracies')
ax.annotate(f'{optimal_threshold}, {max_accuracy}', (optimal_threshold, max_accuracy))

plt.legend()
plt.show()

# Hyperparameter Tuning
Now that we have determined an optimal acceptance threshold, we can tune the hyperparameter C, the inverse of the regularization strength. **It has been determined that the default value of 1 is ideal.**

In [None]:
# Hold out 20% of the samples for the final accuracy check (same seed as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Candidate values for C, from strong to weak regularization
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# 5-fold cross-validated grid search over C, scored by accuracy
lr = LogisticRegression()
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Winning estimator and the hyperparameters that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Default predictions and probability of the second class (column 1) on the held-out data
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]

# Apply the acceptance threshold found in the threshold sweep
y_pred_optimal = y_pred_prob >= optimal_threshold

# Evaluate the thresholded predictions
accuracy = accuracy_score(y_test, y_pred_optimal)

print(f"Best hyperparameters: {best_params}")
print(f"Accuracy on the test set: {accuracy}")