In [1]:
import requests 
import csv
import os.path
import numpy as np
import pandas as pd
import time
from os import path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# SuperHero API access token, interpolated into every request URL by main().
# Prefer the SUPERHERO_API_TOKEN environment variable so the credential does not
# have to live in the notebook; the literal is kept only as a fallback so that
# existing runs keep working.
# NOTE(review): this token is committed in plain text - rotate it and drop the fallback.
accessToken = os.environ.get("SUPERHERO_API_TOKEN", "3360286107350243")

def main(csvPath = "heroes.csv", firstId = 1, lastId = 731):
    """Download hero records from the SuperHero API and store them as a CSV file.

    One GET request is issued per character id in ``firstId..lastId`` (inclusive).
    Each reply contributes one row with the columns name, gender, eyeColor,
    hairType, intelligence, strength, speed, power, combat and role.

    The rows are first written to ``csvPath + ".part"`` and only renamed to
    ``csvPath`` once every request has succeeded. A crash half-way through
    therefore never leaves a truncated ``csvPath`` behind that a later
    "file already exists" check would mistake for a complete download.

    Parameters:
        csvPath: destination file (default "heroes.csv", as before).
        firstId: first character id to fetch (default 1, as before).
        lastId:  last character id to fetch, inclusive (default 731, as before).

    Raises:
        requests.RequestException: on timeouts, connection errors or HTTP error codes.
        RuntimeError: when the API answers with an error payload instead of a hero.
    """
    tmpPath = csvPath + ".part"

    try:
        # utf-8 is pandas.read_csv's default encoding, so be explicit instead of
        # relying on the platform default when writing.
        with requests.Session() as session, \
             open(tmpPath, 'w', newline='', encoding='utf-8') as csvfile:
            spamwriter = csv.writer(csvfile, delimiter = ',')
            spamwriter.writerow(["name", "gender", "eyeColor", "hairType", 
                                 "intelligence", "strength", "speed", "power", "combat", "role"])

            for charId in range(firstId, lastId + 1):
                URL   = "https://www.superheroapi.com/api/{}/{}".format(accessToken, charId)
                # A timeout keeps one stalled connection from hanging the whole scrape.
                reply = session.get(URL, timeout = 30)
                reply.raise_for_status()
                response = reply.json()

                # NOTE(review): assumes failures are reported as {"response": "error", ...}
                # - confirm against the API documentation.
                if response.get("response") == "error":
                    raise RuntimeError("SuperHero API error for id {}: {}".format(
                        charId, response.get("error", "unknown error")))

                appearance = response["appearance"]
                powerstats = response["powerstats"]
                spamwriter.writerow([response["name"], appearance["gender"],
                                     appearance["eye-color"], appearance["hair-color"],
                                     powerstats["intelligence"], powerstats["strength"],
                                     powerstats["speed"], powerstats["power"],
                                     powerstats["combat"], response["biography"]["alignment"]])

        # Publish the finished file in a single step.
        os.replace(tmpPath, csvPath)
    finally:
        # Only present when something above failed; never leave the partial file around.
        if os.path.exists(tmpPath):
            os.remove(tmpPath)

# Scrape the API only when no local copy exists: main() issues one HTTP request
# per character, so re-downloading on every run would be slow.
if not path.exists("heroes.csv"):
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    # Elapsed time is end - start; the reversed subtraction reported a negative duration.
    print("Download time: {}".format(end - start))

# Data exploration

In [2]:
# Load the table that the download cell wrote to heroes.csv.
df = pd.read_csv("heroes.csv") 
# Summary statistics of the numeric columns; a count below the number of rows
# means that column has missing values (they are mean-filled in the processing cell).
df.describe() #describing numeric values

Unnamed: 0,intelligence,strength,speed,power,combat
count,566.0,629.0,566.0,566.0,566.0
mean,64.44523,44.00159,40.249117,64.243816,61.597173
std,19.989369,32.886676,24.416224,29.797914,23.349979
min,6.0,4.0,1.0,0.0,5.0
25%,50.0,10.0,23.0,38.0,42.0
50%,63.0,35.0,33.0,63.5,64.0
75%,75.0,80.0,50.0,100.0,80.0
max,100.0,100.0,100.0,100.0,101.0


In [3]:
# Distribution of the target label. "role" holds the API's biography alignment;
# besides good/bad the output also shows "neutral" and a "-" placeholder.
df["role"].value_counts()

good       494
bad        206
neutral     24
-            7
Name: role, dtype: int64

In [4]:
# Distribution of the gender column; "-" shows up as a placeholder value
# (presumably "unknown" - the API does not say so in the data itself).
df["gender"].value_counts()

Male      502
Female    200
-          29
Name: gender, dtype: int64

In [5]:
# Raw eye-colour labels. The output shows mixed casing ("Blue"/"blue"), combined
# colours ("Green / Blue") and a misspelling ("Bown"), which is why the column is
# lower-cased and collapsed into a few categories in the processing cell.
df["eyeColor"].value_counts()

Blue                       219
-                          171
Brown                      123
Green                       72
Red                         46
Black                       23
Yellow                      19
White                       17
Hazel                        6
blue                         6
Grey                         6
Purple                       4
Gold                         3
Violet                       2
Yellow (without irises)      2
brown                        2
Amber                        2
Green / Blue                 1
Silver                       1
Indigo                       1
Bown                         1
White / Red                  1
Yellow / Blue                1
Blue / White                 1
Yellow / Red                 1
Name: eyeColor, dtype: int64

In [6]:
# Raw hair-colour labels (the column is filled from the API's "hair-color" field).
# As with eye colour, the output shows mixed casing, combined colours and a
# misspelling ("Brownn"), so the column is normalised in the processing cell.
df["hairType"].value_counts()

-                   171
Black               158
Blond                98
Brown                85
No Hair              75
Red                  51
White                23
Auburn               13
Green                 8
Strawberry Blond      7
Grey                  5
Purple                5
Silver                4
Brown / White         4
blond                 3
black                 3
Blue                  3
Orange                2
Yellow                2
Red / Orange          1
Brown / Black         1
Indigo                1
Pink                  1
Orange / White        1
Gold                  1
Brownn                1
Red / Grey            1
Red / White           1
Black / Blue          1
Magenta               1
Name: hairType, dtype: int64

# Data preprocessing

In [7]:
def formatPowerStat(ps):
    """Bucket a power-stat score into one of five ordinal levels.

    Levels: 0 for ps < 20, 1 for [20, 40), 2 for [40, 60), 3 for [60, 80),
    and 4 for everything else (80 and above, and values such as NaN that
    compare false against every threshold).
    """
    # The index of the first upper bound that ps falls below is its level.
    for level, upperBound in enumerate((20, 40, 60, 80)):
        if ps < upperBound:
            return level
    return 4
    
def formatEyeColor(ec):
    """Map an eye-colour label to a category code.

    Codes: 0 = blue, 1 = brown, 2 = green, 3 = other.

    Matching is by substring and is case-insensitive, so "Blue" and "blue"
    both give 0. For combined labels the first match in the order
    blue, brown, green wins (e.g. "green / blue" gives 0).
    Non-string input (NaN or None from missing data) is reported as 3
    instead of raising a TypeError on the substring test.
    """
    if not isinstance(ec, str):
        return 3

    ec = ec.lower()
    for code, colour in enumerate(("blue", "brown", "green")):
        if colour in ec:
            return code
    return 3

def formatHairType(ht):
    """Map a hair-colour label to a category code.

    Codes: 0 = black, 1 = blond, 2 = brown, 3 = other.

    Matching is by substring and is case-insensitive, so "Black" and "black"
    both give 0. For combined labels the first match in the order
    black, blond, brown wins (e.g. "brown / black" gives 0).
    Non-string input (NaN or None from missing data) is reported as 3
    instead of raising a TypeError on the substring test.
    """
    if not isinstance(ht, str):
        return 3

    ht = ht.lower()
    for code, colour in enumerate(("black", "blond", "brown")):
        if colour in ht:
            return code
    return 3

In [8]:
# Columns handled below, named explicitly instead of sniffing cell types across the frame.
powerStatColumns = ["intelligence", "strength", "speed", "power", "combat"]
textColumns      = ["name", "gender", "eyeColor", "hairType", "role"]

# numeric_only = True keeps the string columns out of the mean; without it
# pandas >= 2.0 raises a TypeError on this frame.
df = df.fillna(df.mean(numeric_only = True)) #setting mean in NaN values

for column in textColumns:
    df[column] = df[column].map(lambda s : s.lower() if type(s) == str else s) #converting strings to lowercase
for column in powerStatColumns:
    df[column] = df[column].map(formatPowerStat) #converting power stats in categories

df["eyeColor"] = df["eyeColor"].map(formatEyeColor) #categorizing eye color
df["hairType"] = df["hairType"].map(formatHairType) #categorizing hair type

le         = LabelEncoder()
# Everything that is not "good" ("-" placeholder and "neutral") is treated as a villain.
df["role"] = df["role"].replace({"-" : "bad", "neutral" : "bad"})
df["role"] = le.fit_transform(df["role"]) #categorizing role
# The code below hard-codes 0 = villain, 1 = hero; fail loudly if the encoder disagrees.
assert list(le.classes_) == ["bad", "good"], "unexpected role labels: {}".format(list(le.classes_))
df         = df.drop("gender", axis = 1)

#counting heroes and villains (per class, not by position in the sorted value_counts)
countHero    = int((df["role"] == 1).sum())
countVillain = int((df["role"] == 0).sum())

dfHero    = df[df["role"] == 1] #hero dataframe
dfVillain = df[df["role"] == 0] #villain dataframe

# Oversampling the villain dataframe: draw with replacement until it matches the hero count.
# NOTE(review): balancedDf therefore contains duplicated villain rows. Splitting or
# cross-validating it afterwards can put copies of the same row into both the training
# and the test fold, which inflates the reported scores. Consider splitting first and
# oversampling only the training part.
newDfVillain = dfVillain.sample(countHero, replace = True, random_state = 1)
balancedDf = pd.concat([newDfVillain, dfHero], axis = 0) #new balanced dataframe
print(balancedDf["role"].value_counts())

1    494
0    494
Name: role, dtype: int64


# Final dataset

In [9]:
'''
0 - Very weak
1 - Weak
2 - Normal
3 - Strong
4 - Very strong
'''

# The legend above names the 0-4 levels that formatPowerStat assigns to the
# power-stat columns (intelligence, strength, speed, power, combat).
balancedDf.describe() #describing numeric values

Unnamed: 0,eyeColor,hairType,intelligence,strength,speed,power,combat,role
count,988.0,988.0,988.0,988.0,988.0,988.0,988.0,988.0
mean,1.706478,1.92915,2.788462,1.828947,1.62247,2.733806,2.729757,0.5
std,1.302666,1.245474,0.939286,1.549616,1.079505,1.204093,1.042581,0.500253
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0
50%,2.0,3.0,3.0,2.0,2.0,3.0,3.0,0.5
75%,3.0,3.0,3.0,3.0,2.0,4.0,3.0,1.0
max,3.0,3.0,4.0,4.0,4.0,4.0,4.0,1.0


In [10]:
'''
0 - Villain
1 - Hero
'''

# Class 0 also contains the rows whose role was "-" or "neutral": both were
# replaced by "bad" before the label encoding in the processing cell.
balancedDf["role"].value_counts()

1    494
0    494
Name: role, dtype: int64

In [11]:
'''
0 - Blue
1 - Brown
2 - Green
3 - Other
'''

# Codes come from formatEyeColor; "Other" is its fallback branch, so it also
# collects the "-" placeholder rows.
balancedDf["eyeColor"].value_counts()

3    449
0    286
1    167
2     86
Name: eyeColor, dtype: int64

In [12]:
'''
0 - Black
1 - Blond
2 - Brown
3 - Other
'''

# Codes come from formatHairType; "Other" is its fallback branch, so it also
# collects the "-" placeholder and "no hair" rows.
balancedDf["hairType"].value_counts()

3    509
0    224
1    131
2    124
Name: hairType, dtype: int64

# Machine learning

In [13]:
# Features and target
# Columns are selected by position: the first column (name) is skipped and the
# last column (role) is the class label; everything in between is a feature.
featureFrame = balancedDf.iloc[:, 1:-1]
targetSeries = balancedDf.iloc[:, -1]
X = featureFrame.values
y = targetSeries.values

# Fixed 80/20 hold-out split used for the confusion matrices below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 1,
)

In [14]:
# K-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validation over the whole balanced set.
classifier = KNeighborsClassifier(n_neighbors = 5)
scores     = cross_val_score(estimator = classifier, X = X, y = y, cv = 5)
print(scores)
print("Cross validation mean: {}".format(scores.mean()))

# Separate fit on the hold-out split to show where the errors fall.
classifier.fit(X_train, y_train)
y_pred   = classifier.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', conf_mat)

[0.67171717 0.65656566 0.65151515 0.66161616 0.61734694]
Cross validation mean: 0.6517522160379303
Confusion matrix:
 [[66 31]
 [44 57]]


In [15]:
# Decision tree classifier (entropy split criterion)
from sklearn.tree import DecisionTreeClassifier

# 5-fold cross-validation over the whole balanced set.
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
scores     = cross_val_score(estimator = classifier, X = X, y = y, cv = 5)
print(scores)
print("Cross validation mean: {}".format(scores.mean()))

# Separate fit on the hold-out split to show where the errors fall.
classifier.fit(X_train, y_train)
y_pred   = classifier.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', conf_mat)

[0.75757576 0.73232323 0.75252525 0.73737374 0.77040816]
Cross validation mean: 0.7500412286126572
Confusion matrix:
 [[76 21]
 [32 69]]


In [16]:
# Random forest classifier (10 entropy trees)
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validation over the whole balanced set.
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
scores     = cross_val_score(estimator = classifier, X = X, y = y, cv = 5)
print(scores)
print("Cross validation mean: {}".format(scores.mean()))

# Separate fit on the hold-out split to show where the errors fall.
classifier.fit(X_train, y_train)
y_pred   = classifier.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', conf_mat)

[0.74242424 0.76262626 0.74242424 0.76767677 0.74489796]
Cross validation mean: 0.7520098948670377
Confusion matrix:
 [[75 22]
 [34 67]]


In [17]:
# Logistic Regression classifier
from sklearn.linear_model import LogisticRegression

# multi_class is no longer passed: "auto" is already the default, and recent
# scikit-learn releases deprecate the parameter and warn when it is supplied.
# For this two-class target the fitted model is the same.
classifier = LogisticRegression(random_state = 0, solver = "lbfgs")
scores     = cross_val_score(classifier, X, y, cv = 5)

print(scores)
print("Cross validation mean: {}".format(np.mean(scores)))

# Separate fit on the hold-out split to show where the errors fall.
classifier.fit(X_train, y_train)
conf_mat = confusion_matrix(y_true = y_test, y_pred = classifier.predict(X_test))
print('Confusion matrix:\n', conf_mat)

[0.61111111 0.56565657 0.55050505 0.62121212 0.48979592]
Cross validation mean: 0.5676561533704392
Confusion matrix:
 [[54 43]
 [43 58]]


In [18]:
# Gaussian naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# 5-fold cross-validation over the whole balanced set.
classifier = GaussianNB()
scores     = cross_val_score(estimator = classifier, X = X, y = y, cv = 5)
print(scores)
print("Cross validation mean: {}".format(scores.mean()))

# Separate fit on the hold-out split to show where the errors fall.
classifier.fit(X_train, y_train)
y_pred   = classifier.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', conf_mat)

[0.63636364 0.63636364 0.54545455 0.57575758 0.56122449]
Cross validation mean: 0.5910327767470624
Confusion matrix:
 [[58 39]
 [44 57]]


In [19]:
# Support vector classifier (linear kernel)
from sklearn.svm import SVC

# 5-fold cross-validation over the whole balanced set.
classifier = SVC(kernel = 'linear', random_state = 0)
scores     = cross_val_score(estimator = classifier, X = X, y = y, cv = 5)
print(scores)
print("Cross validation mean: {}".format(scores.mean()))

# Separate fit on the hold-out split to show where the errors fall.
classifier.fit(X_train, y_train)
y_pred   = classifier.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', conf_mat)

[0.56060606 0.5959596  0.52525253 0.57575758 0.5255102 ]
Cross validation mean: 0.5566171923314781
Confusion matrix:
 [[54 43]
 [43 58]]
