# Prediction of the superhero's overall score

The goal of this notebook is to predict a superhero's overall score based on the hero's history and powers description.
Different techniques are used to achieve this goal.

- __Section 2__ uses a bag of words (BoW) approach to encode the text into a fixed length vector.
    - In __Section 2.2__ this representation is the input to a linear regression model that aims to predict the superhero's overall score.
    - In __Section 2.3__ the same inputs are used to train a multilayer perceptron.
- __Section 3__ Follows a similar approach while replacing the linear regression model by a multilayer perceptron.

Sources:

- [Evaluating regression models](https://towardsdatascience.com/what-are-the-best-metrics-to-evaluate-your-regression-model-418ca481755b)

## 0. Setup

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

In [2]:
# global variables
MLP_LAYER_CONFIG = (800, 400, 200, 100)
experiments = []

# data classes to reduce the cluttering in the namespace
class DataSet:

    def __init__(self, name: str):
        self.name = name
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_test = None

    def set_data(self, x_train, x_test, y_train, y_test):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def get_bow_encoding(self):

        bow_transformer = CountVectorizer(analyzer='word').fit(self.x_train)

        ds_bow = DataSet(self.name + ' BoW')
        ds_bow.x_train = bow_transformer.transform(self.x_train)
        ds_bow.y_train = self.y_train
        ds_bow.x_test = bow_transformer.transform(self.x_test)
        ds_bow.y_test = self.y_test

        return ds_bow

    def get_tfidf_encoding(self):

        tfidf_transformer = TfidfVectorizer(analyzer='word').fit(self.x_train)

        tfidf_bow = DataSet(self.name + ' tf-idf')
        tfidf_bow.x_train = tfidf_transformer.transform(self.x_train)
        tfidf_bow.y_train = self.y_train
        tfidf_bow.x_test = tfidf_transformer.transform(self.x_test)
        tfidf_bow.y_test = self.y_test

        return tfidf_bow


class ErrorData:

    def __init__(self):
        self.errors = None  # errors for all predicted values
        self.mae = None  # mean absolute error
        self.mse = None  # mean square error
        self.r_square = None  # R-square

    def compute(self, y, y_predicted):
        self.errors = y - y_predicted
        self.mae = np.linalg.norm(self.errors, 1) / len(y)
        self.mse = np.linalg.norm(self.errors, 2) / len(y)


class Experiment:

    def __init__(self, name: str, data_set: DataSet):
        self.name = name
        self.data_set = data_set
        # models
        self.model = None
        self.train_e = ErrorData()  # training errors
        self.test_e = ErrorData()  # test errors

    def train(self):
        self.model.fit(self.data_set.x_train, self.data_set.y_train)

        self.train_e.compute(np.array(self.data_set.y_train.values), self.model.predict(self.data_set.x_train))
        self.train_e.r_square = self.model.score(self.data_set.x_train, self.data_set.y_train.values)

        self.test_e.compute(np.array(self.data_set.y_test.values), self.model.predict(self.data_set.x_test))
        self.test_e.r_square = self.model.score(self.data_set.x_test, self.data_set.y_test.values)

    @staticmethod
    def compare(experiments):
        comparison_df = pd.DataFrame(columns=[
            'R-square training', 'mean absolute error training', 'mean square error training',
            'R-square test', 'mean absolute error test', 'mean square error test'])
        for ex in experiments:
            comparison_df.loc[ex.name, :] = [
                ex.train_e.r_square, ex.train_e.mae, ex.train_e.mae, ex.test_e.r_square, ex.test_e.mae, ex.test_e.mse]
        return comparison_df

    @staticmethod
    def plot_error_summary(experiments, data_type):
        error_df = pd.DataFrame()
        for ex in experiments:
            if data_type == 'test':
                error_df.loc[:, ex.name] = ex.test_e.errors
            elif data_type == 'training':
                error_df.loc[:, ex.name] = ex.train_e.errors
        fig = px.box(error_df, labels={"value": "error", "variable": "model"},
                     title=f"Error boxplot for the different models on {data_type} data")
        fig.show()

    @staticmethod
    def plot_prediction(experiments, data_type):

        if data_type == 'test':
            y = experiments[0].data_set.y_test
        elif data_type == 'training':
            y = experiments[0].data_set.y_train

        prediction_df = pd.DataFrame(columns=[f'{data_type} data'])
        prediction_df.loc[:, f'{data_type} data'] = y
        prediction_df.loc[:, 'future_idx'] = y
        for ex in experiments:
            if data_type == 'test':
                x = ex.data_set.x_test
            elif data_type == 'training':
                x = ex.data_set.x_train
            prediction_df.loc[:, ex.name] = ex.model.predict(x)
        prediction_df = prediction_df.sort_values(by='future_idx')
        prediction_df = prediction_df.set_index('future_idx')
        fig =  px.line(prediction_df,
                       labels={
                           "future_idx": "overall score",
                           "value": "prediction",
                           "variable": "model"
                 },title=f"Overall score predictions by the different models on {data_type} data")
        fig.show()


class LinRegExperiment(Experiment):

    def __init__(self, name: str, data_set: DataSet, model: LinearRegression):
        super().__init__(name, data_set)
        self.model = model

class MLPExperiment(Experiment):

    def __init__(self, name: str, data_set: DataSet, model: MLPRegressor):
        super().__init__(name, data_set)
        self.model = model
        self.training_history_nn = None

## 1. Data Preparation

While loading the data, the following additional preprocessing steps are applied.

- The columns `history_text` and `powers_text` are concatenated in a new column `text`.
- The rows with `NaN` values in the `overall_score` column are dropped.

In [3]:
superheros = pd.read_csv('datasets/Preprocessed.csv')
superheros.loc[:, 'text'] = superheros.loc[:, 'history_text'].astype(str) + superheros.loc[:, 'powers_text'].astype(str)
superheros = superheros.dropna(subset=['overall_score'])
superheros.head(2)

Unnamed: 0,name,overall_score,history_text,powers_text,superpowers,creator,alignment,text
0,A-Bomb,20.0,richard rick jone orphan young age expel sever...,rare occasion unusual circumstance jone able t...,"['Accelerated Healing', 'Agility', 'Berserk Mo...",Marvel Comics,Good,richard rick jone orphan young age expel sever...
1,Abe Sapien,10.0,sapien begin life langdon everett caul victori...,abe humanoid amphibious creature pair gill nec...,"['Accelerated Healing', 'Agility', 'Cold Resis...",Other,Good,sapien begin life langdon everett caul victori...


Two pairs of inputs and outputs are created.
The first one contains the `history_text` and the second one the `text` column as input.
Both contain the `overall_score` as output.

The dataset is split up into training ($65\ \%$) and test data ($35\ \%$).

In [4]:
seed = 42
test_ratio = 0.35

ds_hist = DataSet('history')
ds_cnct = DataSet('concatenated')

ds_hist.set_data(*train_test_split(superheros.loc[:, 'history_text'], superheros.loc[:, 'overall_score'],
                                  test_size=test_ratio, random_state=seed))
ds_cnct.set_data(*train_test_split(superheros.loc[:, 'text'], superheros.loc[:, 'overall_score'],
                                  test_size=test_ratio, random_state=seed))

print(f'training data size: {ds_hist.x_train.shape}')
print(f'test data size: {ds_hist.x_test.shape}')

training data size: (605,)
test data size: (327,)


## 2. BoW

### 2.1 Linear Regression Model

In [5]:
bow_lin_hist = LinRegExperiment('BoW Lin Reg history', ds_hist.get_bow_encoding(), LinearRegression())
bow_lin_cnct = LinRegExperiment('BoW Lin Reg concatenated', ds_cnct.get_bow_encoding(), LinearRegression())
experiments.append(bow_lin_hist)
experiments.append(bow_lin_cnct)

# Training and evaluating the models.
bow_lin_hist.train()
bow_lin_cnct.train()

# comparing the model
Experiment.compare(experiments)

Unnamed: 0,R-square training,mean absolute error training,mean square error training,R-square test,mean absolute error test,mean square error test
BoW Lin Reg history,0.999763,0.041878,0.041878,-0.786048,27.414058,2.801669
BoW Lin Reg concatenated,0.999894,0.023142,0.023142,0.112207,19.521665,1.975268


Ideas

- [ ] find out which word have the highest weights in the regression model

Observations

- The model that has more data at its disposal performs better.
- The model performs well in most cases but makes large mistakes for a few superheros.

### 2.2 Multilayer Perceptron Regressor

In [6]:
bow_nn_hist = MLPExperiment('BoW MLP history', ds_hist.get_bow_encoding(),
                               MLPRegressor(hidden_layer_sizes=MLP_LAYER_CONFIG, max_iter=25))
bow_nn_cnct = MLPExperiment('BoW MLP concatenated', ds_cnct.get_bow_encoding(),
                               MLPRegressor(hidden_layer_sizes=MLP_LAYER_CONFIG, max_iter=25))
experiments.append(bow_nn_hist)
experiments.append(bow_nn_cnct)

# Training and evaluating the models.
bow_nn_hist.train()
bow_nn_cnct.train()

# comparing the model
Experiment.compare(experiments)



Unnamed: 0,R-square training,mean absolute error training,mean square error training,R-square test,mean absolute error test,mean square error test
BoW Lin Reg history,0.999763,0.041878,0.041878,-0.786048,27.414058,2.801669
BoW Lin Reg concatenated,0.999894,0.023142,0.023142,0.112207,19.521665,1.975268
BoW MLP history,0.948787,1.795521,1.795521,0.075539,12.822303,2.015648
BoW MLP concatenated,0.953409,2.838553,2.838553,0.162347,12.359514,1.918679


Observations

- The model trained on more data outperforms the other one again by a small margin.
- The MLP models seem to perform a little better than the linear regression models but not much despite being much more complex.
This suggests that the limiting factor is not the linear regression model but another element in the approach like the
encoding of the text, the available amount of data or the data itself.

## 3 tf-idf

### 3.1 Linear Regression Model

In [7]:
tfidf_lin_hist = LinRegExperiment('tf-idf Lin Reg history', ds_hist.get_tfidf_encoding(), LinearRegression())
tfidf_lin_cnct = LinRegExperiment('tf-idf Lin Reg concatenated', ds_cnct.get_tfidf_encoding(), LinearRegression())
experiments.append(tfidf_lin_hist)
experiments.append(tfidf_lin_cnct)

# Training and evaluating the models.
tfidf_lin_hist.train()
tfidf_lin_cnct.train()

# comparing the model
Experiment.compare(experiments)

Unnamed: 0,R-square training,mean absolute error training,mean square error training,R-square test,mean absolute error test,mean square error test
BoW Lin Reg history,0.999763,0.041878,0.041878,-0.786048,27.414058,2.801669
BoW Lin Reg concatenated,0.999894,0.023142,0.023142,0.112207,19.521665,1.975268
BoW MLP history,0.948787,1.795521,1.795521,0.075539,12.822303,2.015648
BoW MLP concatenated,0.953409,2.838553,2.838553,0.162347,12.359514,1.918679
tf-idf Lin Reg history,0.999763,0.041873,0.041873,0.204062,14.697725,1.870293
tf-idf Lin Reg concatenated,0.999894,0.023141,0.023141,0.312649,15.025488,1.738038


- [ ] todo: observation and error plots

### 3.3 tf-idf and Multilayer Perceptron Regressor

In [8]:
tfidf_nn_hist = MLPExperiment('tf-idf MLP history', ds_hist.get_tfidf_encoding(),
                              MLPRegressor(hidden_layer_sizes=MLP_LAYER_CONFIG, max_iter=25))
tfidf_nn_cnct = MLPExperiment('tf-idf MLP Reg concatenated', ds_cnct.get_tfidf_encoding(),
                              MLPRegressor(hidden_layer_sizes=MLP_LAYER_CONFIG, max_iter=25))
experiments.append(tfidf_nn_hist)
experiments.append(tfidf_nn_cnct)

# Training and evaluating the models.
tfidf_nn_hist.train()
tfidf_nn_cnct.train()

# comparing the model
Experiment.compare(experiments)



Unnamed: 0,R-square training,mean absolute error training,mean square error training,R-square test,mean absolute error test,mean square error test
BoW Lin Reg history,0.999763,0.041878,0.041878,-0.786048,27.414058,2.801669
BoW Lin Reg concatenated,0.999894,0.023142,0.023142,0.112207,19.521665,1.975268
BoW MLP history,0.948787,1.795521,1.795521,0.075539,12.822303,2.015648
BoW MLP concatenated,0.953409,2.838553,2.838553,0.162347,12.359514,1.918679
tf-idf Lin Reg history,0.999763,0.041873,0.041873,0.204062,14.697725,1.870293
tf-idf Lin Reg concatenated,0.999894,0.023141,0.023141,0.312649,15.025488,1.738038
tf-idf MLP history,0.996099,0.728596,0.728596,0.170847,11.303492,1.908919
tf-idf MLP Reg concatenated,0.992289,0.990561,0.990561,0.291009,11.219529,1.765186


In [11]:
fig = px.histogram(superheros, ['overall_score'])
fig.show()
# plot y-histogram

## 4 Comparing the models

In [12]:
Experiment.plot_prediction(experiments, 'training')

In [13]:
Experiment.plot_prediction(experiments, 'test')

In [14]:
Experiment.plot_error_summary(experiments, 'training')

In [15]:
Experiment.plot_error_summary(experiments, 'test')

Thoughts
