In [1]:
# Using real data from 2023 League of Legends World Championship
# 2023_LoL_esports_match_data_from_OraclesElixir_20210622.csv


In [2]:
# Preprocessing
# 1. I created a script called "dataset.py" to preprocess the data and keep only the information that I needed.

# For example purposes, the csv file has the following columns:
# gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,participantid,side,position,playername,playerid,teamname,teamid,champion,ban1,ban2,ban3,ban4,ban5,gamelength,result,kills,deaths,assists,teamkills,teamdeaths,doublekills,triplekills,quadrakills,pentakills,firstblood,firstbloodkill,firstbloodassist,firstbloodvictim,team kpm,ckpm,firstdragon,dragons,opp_dragons,elementaldrakes,opp_elementaldrakes,infernals,mountains,clouds,oceans,chemtechs,hextechs,dragons (type unknown),elders,opp_elders,firstherald,heralds,opp_heralds,firstbaron,barons,opp_barons,firsttower,towers,opp_towers,firstmidtower,firsttothreetowers,turretplates,opp_turretplates,inhibitors,opp_inhibitors,damagetochampions,dpm,damageshare,damagetakenperminute,damagemitigatedperminute,wardsplaced,wpm,wardskilled,wcpm,controlwardsbought,visionscore,vspm,totalgold,earnedgold,earned gpm,earnedgoldshare,goldspent,gspd,total cs,minionkills,monsterkills,monsterkillsownjungle,monsterkillsenemyjungle,cspm,goldat10,xpat10,csat10,opp_goldat10,opp_xpat10,opp_csat10,golddiffat10,xpdiffat10,csdiffat10,killsat10,assistsat10,deathsat10,opp_killsat10,opp_assistsat10,opp_deathsat10,goldat15,xpat15,csat15,opp_goldat15,opp_xpat15,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15

# Using the script, I chose the following columns:
# 'champion', 'league', 'result', 'kills', 'deaths', 'assists','side', 'damagetochampions','position'
# I split the "position" column into individual columns and added 2 new columns called "Difficulty" and "Items_needed_for_spike" to help me with the analysis. Their values were taken from another csv file, "fabian.csv", which lists all the champions with their respective difficulty and items needed for spike.

In [1]:
from random import random

import pandas as pd
import math
from pprint import pprint

from sklearn.svm._libsvm import predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 1. Preprocessing

# Read in data
data = pd.read_csv('champions_info.csv')

# b. Drop rows that contain missing values (dropna() removes rows, not columns)
data = data.dropna()

# Describe the dataset
print(data.describe())

# a. Identify attributes and target attribute
target_attribute = 'first_blood_kill'
# Every column except the target, selected by name so the result does not
# depend on the target happening to be the last column of the csv file.
attributes = data.columns.drop(target_attribute)

# a. Identify discrete and continuous attributes
discrete_attributes = ['champion', 'league', 'result', 'side', 'Difficulty', 'Items_For_Spike', 'Attack_Type', 'top',
                       'mid', 'jng', 'sup', 'bot']
continuous_attributes = ['kills', 'deaths', 'assists', 'damagetochampions']

# c. Calculate the mean and variance for each numerical attribute
# Each entry is [mean, variance]; pandas' var() is the sample variance (ddof=1).
mean_variances = {}
for attribute in continuous_attributes:
    mean_variances[attribute] = [data[attribute].mean(), data[attribute].var()]

print(mean_variances)


## 3. Probabilities and Information Theory

# a. Calculate the probability mass function of discrete attributes.

def compute_probabilities(data, attribute):
    """Return the empirical probability mass function of one column.

    Args:
        data: DataFrame holding the observations.
        attribute: Name of the column to summarise.

    Returns:
        dict mapping each value of the column to its relative frequency.
    """
    column = data[attribute]
    relative_frequencies = column.value_counts(normalize=True)
    return relative_frequencies.to_dict()


# Example: relative frequency of every value in the 'league' column.
# (The variable keeps its original spelling, `probabilites`.)
probabilites = compute_probabilities(data, 'league')
print("Probabilities : ", probabilites)


# b. Calculate the entropy for discrete attributes.

def calculate_entropy(data, attribute):
    """Return the Shannon entropy, in bits, of one discrete column.

    Args:
        data: DataFrame holding the observations.
        attribute: Name of the column whose entropy is computed.

    Returns:
        -sum(p * log2(p)) over the column's observed values; 0 when the
        column is empty or holds a single distinct value.
    """
    probabilities = compute_probabilities(data, attribute)
    # Outcomes with probability 0 contribute nothing to the entropy
    # (p * log2(p) -> 0 as p -> 0), but math.log2(0) raises ValueError.
    # They show up when the column has a categorical dtype and some
    # categories do not occur in `data` (e.g. after filtering rows).
    entropy = -sum(p * math.log2(p) for p in probabilities.values() if p > 0)
    return entropy


# Example: entropy (in bits) of the 'league' column.
entropy_league = calculate_entropy(data, 'league')
print("Entropy : ", entropy_league)


# c. Calculate conditional entropy for target attribute and a discrete attribute.

def calculate_conditional_entropy(data, attribute, target_attribute):
    """Return H(target_attribute | attribute) for two discrete columns.

    The rows are partitioned by the value of `attribute`; the entropy of
    `target_attribute` inside each partition is weighted by the share of
    rows in that partition, and the weighted entropies are added up.

    Args:
        data: DataFrame holding the observations.
        attribute: Name of the conditioning column.
        target_attribute: Name of the column whose entropy is measured.

    Returns:
        The conditional entropy (0 when `data` has no rows).
    """
    total_rows = len(data)
    column = data[attribute]

    remaining_entropy = 0
    for value in column.unique():
        matching_rows = data[column == value]
        weight = len(matching_rows) / total_rows
        remaining_entropy += weight * calculate_entropy(matching_rows, target_attribute)
    return remaining_entropy


# Example: entropy of the target that remains once the league is known.
conditional_entropy = calculate_conditional_entropy(data, 'league', target_attribute)
print("Condititonal Entropy : ", conditional_entropy)


# d. Calculate the information gain for discrete attributes.

def calculate_information_gain(data, attribute, target_attribute):
    """Return the information gain of splitting the target on `attribute`.

    Args:
        data: DataFrame holding the observations.
        attribute: Name of the discrete column used for the split.
        target_attribute: Name of the column being predicted.

    Returns:
        H(target_attribute) - H(target_attribute | attribute).
    """
    entropy_before_split = calculate_entropy(data, target_attribute)
    entropy_after_split = calculate_conditional_entropy(data, attribute, target_attribute)
    return entropy_before_split - entropy_after_split


# Example: information gained about the target by splitting on the league.
information_gain = calculate_information_gain(data, 'league', target_attribute)
print("Information Gain : ", information_gain)


## 3. ID3

# a. Find the root node of the decision tree.

def find_root_node(data, attributes, target_attribute):
    """Pick the attribute with the highest information gain.

    Args:
        data: DataFrame holding the observations.
        attributes: Iterable of candidate column names (must not be empty).
        target_attribute: Name of the column being predicted.

    Returns:
        Tuple (attribute_name, information_gain) of the best candidate; on
        ties the candidate that comes first in `attributes` wins.
    """
    scored_candidates = []
    for candidate in attributes:
        gain = calculate_information_gain(data, candidate, target_attribute)
        scored_candidates.append((candidate, gain))

    winner = max(scored_candidates, key=lambda pair: pair[1])
    return winner[0], winner[1]


# Example: best first split among the discrete attributes.
root_node, root_ig = find_root_node(data, discrete_attributes, target_attribute)
print("Root Node:", root_node, "Information Gain:", root_ig)


# b. Write a function id3_discrete that implements the ID3 algorithm for the discrete attributes. The function should return a dictionary following this
# structure
def id3_discrete(data, attributes, target_attribute, unknown_handling="most_common"):
    """Build an ID3 decision tree over discrete attributes.

    Args:
        data: DataFrame with the training rows.
        attributes: List of discrete column names that may be split on.
        target_attribute: Name of the column being predicted.
        unknown_handling: Not used while building; only forwarded to the
            recursive calls so the signature mirrors predict_id3.

    Returns:
        A leaf (a single target value) or a dict with the keys
        "node_attribute" (column split on), "observations" (row count per
        value of that column), "information_gain", "majority_class" (most
        frequent target value among the rows reaching the node) and
        "values" (mapping attribute value -> subtree or leaf).
    """
    if len(attributes) == 0:
        # Nothing left to split on: answer with the most frequent class.
        most_common_value = data[target_attribute].mode().iloc[0]
        return most_common_value

    root_node, root_ig = find_root_node(data, attributes, target_attribute)

    # "observations" counts attribute values, not classes, so the class that
    # predict_id3 falls back to for unseen values is stored separately.
    class_modes = data[target_attribute].mode()
    majority_class = class_modes.iloc[0] if len(class_modes) else None

    tree = {
        "node_attribute": root_node,
        "observations": dict(data[root_node].value_counts()),
        "information_gain": root_ig,
        "majority_class": majority_class,
        "values": {}
    }

    remaining_attributes = [attr for attr in attributes if attr != root_node]

    # NaN never compares equal to itself, so a NaN "value" would select an
    # empty subset; skip missing values instead of growing an empty branch.
    unique_values = data[root_node].dropna().unique()
    for value in unique_values:
        subset = data[data[root_node] == value]
        if len(subset[target_attribute].unique()) == 1:
            # Pure subset: store the single class as a leaf.
            tree["values"][value] = subset[target_attribute].values[0]
        else:
            subtree = id3_discrete(subset.drop(columns=[root_node]), remaining_attributes,
                                   target_attribute, unknown_handling)
            tree["values"][value] = subtree

    return tree


def predict_id3(tree, sample, unknown_handling="most_common"):
    """Classify one sample with a tree produced by id3_discrete.

    Args:
        tree: Tree dict (or bare leaf value) returned by id3_discrete.
        sample: Mapping or Series indexable by attribute name.
        unknown_handling: What to do when the sample holds a value that the
            node has no branch for. "most_common" returns the node's
            majority class; "random" returns the sentinel string
            "unknown_random"; anything else returns "unknown_custom".

    Returns:
        The predicted target value, or one of the sentinel strings above.
    """
    current_node = tree
    while isinstance(current_node, dict):
        attribute_value = sample[current_node["node_attribute"]]
        branches = current_node["values"]
        if attribute_value in branches:
            current_node = branches[attribute_value]
            continue

        if unknown_handling == "most_common":
            majority_class = current_node.get("majority_class")
            if majority_class is not None:
                return majority_class
            # Tree without class information (built before "majority_class"
            # existed): follow the most populated branch so the result is
            # still a target class rather than an attribute value.
            observations = current_node["observations"]
            current_node = branches[max(observations, key=observations.get)]
        elif unknown_handling == "random":
            return "unknown_random"
        else:
            return "unknown_custom"

    return current_node


# Build the ID3 tree on the whole dataset, using only the discrete attributes
tree = id3_discrete(data, discrete_attributes, target_attribute)


# Uncomment to inspect the nested dictionary:
# pprint(tree, width=40)

# c. Run id3_discrete on the dataset containing only discrete attributes. Compare the results with the ones from sklearn (make your comparison as
# thorough as possible).

def run_id3(data, discrete_attributes, target_attribute, unknown_handling="most_common",
            test_size=0.2, random_state=42):
    """Train the hand-written ID3 tree and report its hold-out accuracy.

    Args:
        data: DataFrame with the discrete attributes and the target column.
        discrete_attributes: Column names the tree may split on.
        target_attribute: Name of the column being predicted.
        unknown_handling: Strategy passed to predict_id3 for attribute
            values that the tree has no branch for.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test shuffle.

    Returns:
        Accuracy on the held-out rows (also printed).
    """
    # Split the full frame directly. The original filtered rows through
    # `champion.isin(<all champions>)`, which kept every row and only tied
    # the function to a hard-coded column name.
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)

    fitted_tree = id3_discrete(train, discrete_attributes, target_attribute, unknown_handling=unknown_handling)

    predictions = [predict_id3(fitted_tree, sample, unknown_handling=unknown_handling)
                   for _, sample in test.iterrows()]

    # Align with the test rows and cast to the target's dtype so that
    # accuracy_score compares like with like.
    predictions = pd.Series(predictions, index=test.index).astype(test[target_attribute].dtype)

    accuracy = accuracy_score(test[target_attribute], predictions)
    print(f"Accuracy ID3 with {unknown_handling} Handling: {accuracy}")

    return accuracy


def run_sklearn_decision_tree(data, discrete_attributes, target_attribute, test_size=0.2, random_state=42):
    """Train scikit-learn's DecisionTreeClassifier and report its accuracy.

    Args:
        data: DataFrame with the discrete attributes and the target column.
        discrete_attributes: Column names used as features.
        target_attribute: Name of the column being predicted.
        test_size: Fraction of rows held out for testing. Defaults to 0.2,
            the fraction run_id3 uses, so that with the same random_state
            both models are scored on the same held-out rows (it used to be
            a hard-coded 0.3, which made the accuracies not comparable).
        random_state: Seed for the train/test shuffle and for the tree's
            tie-breaking, so repeated runs print the same accuracy.

    Returns:
        Accuracy on the held-out rows (also printed).
    """
    X = data[discrete_attributes]
    Y = data[target_attribute]

    # One-hot encode every non-numeric feature. The columns are derived from
    # `discrete_attributes` instead of a hard-coded list, so the function
    # also works when it is called with a different attribute selection.
    categorical_columns = X.select_dtypes(exclude="number").columns.tolist()
    X_encoded = pd.get_dummies(X, columns=categorical_columns) if categorical_columns else X

    # Split data into training and test set
    X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y, test_size=test_size,
                                                        random_state=random_state)

    # Run scikit-learn Decision Tree
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, Y_train)

    # Make Predictions
    predictions_sklearn_test = clf.predict(X_test)

    accuracy_sklearn_test = accuracy_score(Y_test, predictions_sklearn_test)

    print(f"Scikit-learn Accuracy on Test Data: {accuracy_sklearn_test}")

    return accuracy_sklearn_test


# Score both models on the same data; each call prints its own accuracy.
run_id3(data, discrete_attributes, target_attribute)
run_sklearn_decision_tree(data, discrete_attributes, target_attribute)


# d. Write a function get_splits which, given a continuous attribute and the labels, will identify the splits that could be used for the discretization of the
# variable. Test your function on an example.

def get_splits(labels, attribute_values):
    """Find candidate thresholds for discretizing a continuous attribute.

    A threshold is proposed halfway between two neighbouring distinct
    attribute values, and only where the labels seen at those two values are
    not all one and the same class. A cut between two values that share a
    single class cannot separate anything, so it is left out.

    Args:
        labels: Class label of every observation.
        attribute_values: Continuous attribute value of every observation,
            in the same order (or with the same index) as `labels`.

    Returns:
        Ascending list of distinct split points; empty when fewer than two
        distinct values are present or all labels agree.
    """
    # Combine attribute values and labels into a DataFrame so they stay paired
    data = pd.DataFrame({'labels': labels, 'values': attribute_values})

    # Missing attribute values cannot be ordered, so they are ignored.
    observed = data.dropna(subset=['values'])

    # Collect the set of labels seen at each distinct attribute value.
    labels_at_value = {}
    for value, label in zip(observed['values'], observed['labels']):
        labels_at_value.setdefault(value, set()).add(label)

    ordered_values = sorted(labels_at_value)

    splits = []
    for lower, upper in zip(ordered_values, ordered_values[1:]):
        # Keep the midpoint only if more than one class occurs around it.
        if len(labels_at_value[lower] | labels_at_value[upper]) > 1:
            splits.append((lower + upper) / 2)

    return splits


# Example on the real data: candidate thresholds for 'kills' w.r.t. the target
splits = get_splits(data[target_attribute], data['kills'])
print(splits)


           result       kills      deaths     assists  damagetochampions
count  300.000000  300.000000  300.000000  300.000000         300.000000
mean     0.506667    2.796667    2.420000    6.173333       14555.590000
std      0.500791    2.606011    1.800297    4.613122        9107.012622
min      0.000000    0.000000    0.000000    0.000000        1327.000000
25%      0.000000    1.000000    1.000000    3.000000        7832.250000
50%      1.000000    2.000000    2.000000    5.000000       12564.500000
75%      1.000000    4.000000    4.000000    8.250000       19273.500000
max      1.000000   12.000000    9.000000   23.000000       50928.000000
{'kills': [2.796666666666667, 6.79129319955407], 'deaths': [2.42, 3.241070234113713], 'assists': [6.173333333333333, 21.28089186176143], 'damagetochampions': [14555.59, 82937678.89153846]}
Probabilities :  {'LCK': 0.25, 'LCS': 0.25, 'LEC': 0.25, 'LPL': 0.25}
Entropy :  2.0
Condititonal Entropy :  0.6205651394665301
Information Gain :  0.0057