In [73]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Some data science tools only work when the input data are numeric. This is particularly true of machine learning: many machine learning algorithms – like linear regression and logistic regression – strictly require numeric input data. If you try to use them with string-based categorical data, they will throw an error.

So before you use such tools, you need to encode your categorical data as numeric dummy variables.

In [102]:
# Integer codes for the match outcome labels found in the "Risultato" column.
classes = dict(
    V=1,  # home win
    P=2,  # away win
    N=0,  # draw
)

In [103]:
# Load the merged match table; the first CSV column is used as the index.
dataset = pd.read_csv("merged_matches.csv", index_col=0)

# Encode the outcome labels as integers using the `classes` mapping.
# Series.map is used instead of Series.replace: replace leaves any label that
# is missing from `classes` untouched (yielding a mixed string/number column)
# and depends on an implicit object-to-numeric downcast that recent pandas
# versions deprecate.
raw_labels = dataset["Risultato"]

# map() would turn unknown labels into NaN, so fail loudly on them first.
# Missing values are tolerated and stay missing, as they did with replace().
unknown_labels = set(raw_labels.dropna().unique()) - set(classes)
if unknown_labels:
    raise ValueError(
        f"Unexpected labels in 'Risultato' with no numeric code: {list(unknown_labels)}"
    )

dataset["Risultato"] = raw_labels.map(classes)

dataset.head()

Unnamed: 0,h_team,Risultato,h_goals,a_team,a_goals,h_total_shots,h_shots_on_target,h_goals_on_penalty,h_total_penalties,h_completed_passings,...,a_shots_on_target,a_goals_on_penalty,a_total_penalties,a_corners,a_yellow_cards,a_red_cards,a_fouls,a_completed_passings,a_total_passings,a_percentage_possession
0,Hellas Verona,2,2,Sassuolo,3,14.0,3.0,1,1,282.0,...,7.0,0,0,4.0,2,0,15.0,481.0,540.0,60.0
0,Inter,1,4,Genoa,0,17.0,8.0,0,0,563.0,...,4.0,0,0,2.0,2,0,7.0,365.0,430.0,41.0
0,Torino,2,1,Atalanta,2,21.0,7.0,0,0,304.0,...,2.0,0,0,1.0,2,0,14.0,328.0,462.0,53.0
0,Empoli,2,1,Lazio,3,17.0,4.0,0,0,410.0,...,3.0,1,1,3.0,1,0,10.0,517.0,589.0,54.0
0,Udinese,0,2,Juventus,2,11.0,5.0,1,1,363.0,...,3.0,0,0,3.0,3,0,16.0,480.0,551.0,56.0


In [104]:
# One-hot encode the home and away team names into indicator columns.
# dtype=int forces 0/1 integer indicators: since pandas 2.0 get_dummies
# defaults to boolean columns, which would be written to the CSV as
# True/False instead of numbers.
dummy_match = pd.get_dummies(
    dataset,
    columns=["h_team", "a_team"],
    prefix=["h", "a"],  # indicator columns are named h_<team> / a_<team>
    dtype=int,
)

# Persist the encoded table (the index is written as the first CSV column).
dummy_match.to_csv("dummy_match.csv")