# Clustering and Classification using Knowledge Graphs Embeddings (KGEs)

## Requirements

In [4]:
import numpy as np 
import pandas as pd 
import ampligraph

## Dataset

In [5]:
import requests

url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/football_graph.csv'
open('football_results.csv', 'wb').write(requests.get(url).content)

3033782

In [6]:
df = pd.read_csv('football_results.csv').sort_values('date')
df.isna().sum()

date          0
home_team     0
away_team     0
home_score    2
away_score    2
tournament    0
city          0
country       0
neutral       0
dtype: int64

In [7]:
df.shape

(40773, 9)

In [8]:
df[df['tournament'] != 'Friendly']

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
29,1884-01-26,Northern Ireland,Scotland,0.0,5.0,British Championship,Belfast,Republic of Ireland,False
30,1884-02-09,Wales,Northern Ireland,6.0,0.0,British Championship,Wrexham,Wales,False
31,1884-02-23,Northern Ireland,England,1.0,8.0,British Championship,Belfast,Republic of Ireland,False
32,1884-03-15,Scotland,England,1.0,0.0,British Championship,Glasgow,Scotland,False
33,1884-03-17,Wales,England,0.0,4.0,British Championship,Wrexham,Wales,False
...,...,...,...,...,...,...,...,...,...
40768,2019-07-11,Madagascar,Tunisia,0.0,3.0,African Cup of Nations,Cairo,Egypt,True
40769,2019-07-14,Algeria,Nigeria,2.0,1.0,African Cup of Nations,Cairo,Egypt,True
40770,2019-07-14,Senegal,Tunisia,1.0,0.0,African Cup of Nations,Cairo,Egypt,True
40771,2019-07-17,Tunisia,Nigeria,,,African Cup of Nations,Cairo,Egypt,True


In [9]:
df[df != df.isna()]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,,,Friendly,Glasgow,Scotland,
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,
4,1876-03-04,Scotland,England,3.0,,Friendly,Glasgow,Scotland,
...,...,...,...,...,...,...,...,...,...
40768,2019-07-11,Madagascar,Tunisia,,3.0,African Cup of Nations,Cairo,Egypt,True
40769,2019-07-14,Algeria,Nigeria,2.0,1.0,African Cup of Nations,Cairo,Egypt,True
40770,2019-07-14,Senegal,Tunisia,1.0,,African Cup of Nations,Cairo,Egypt,True
40771,2019-07-17,Tunisia,Nigeria,,,African Cup of Nations,Cairo,Egypt,True


In [10]:
df.dropna(inplace=True)
df.shape

(40771, 9)

## Splitting Test and Train Datasets

In [11]:
df['train'] = df.date < '2014-01-01'
df.train.value_counts()

train
True     35714
False     5057
Name: count, dtype: int64

#### Making sure the Team and Geography entities are different (eg. Brazil Team and Brazil country)

In [12]:
df['match_id'] = df.index.values.astype(str)
df['match_id'] = "Match" + df.match_id
df['city_id'] = "City" + df.city.str.title().str.replace(" ", "")
df['country'] = "Country" + df.country.str.title().str.replace(" ", "")
df['home_team_id'] = "Team" + df.home_team.str.title().str.replace(" ", "")
df['away_team_id'] = "Team" + df.away_team.str.title().str.replace(" ", "")
df['tournament_id'] = "Tournament" + df.tournament.str.title().str.replace(" ", "")
df['neutral'] = df.neutral.astype(str)
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,train,match_id,city_id,home_team_id,away_team_id,tournament_id
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,CountryScotland,False,True,Match0,CityGlasgow,TeamScotland,TeamEngland,TournamentFriendly
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,CountryEngland,False,True,Match1,CityLondon,TeamEngland,TeamScotland,TournamentFriendly
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,CountryScotland,False,True,Match2,CityGlasgow,TeamScotland,TeamEngland,TournamentFriendly
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,CountryEngland,False,True,Match3,CityLondon,TeamEngland,TeamScotland,TournamentFriendly
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,CountryScotland,False,True,Match4,CityGlasgow,TeamScotland,TeamEngland,TournamentFriendly


### Triples Generation

In [19]:
triples = []
for _, row in df[df['train']].iterrows():
    home_team = (row['home_team_id'], 'isHomeTeamIn', row["match_id"])
    away_team = (row['away_team_id'], 'isAwayTeamIn', row['match_id'])

    if row['home_score'] > row['away_score']:
        score_home = (row['home_team_id'], 'winnerOf', row['match_id'])
        score_away = (row['away_team_id'], 'loserOf', row['match_id'])
    elif row['home_score'] < row['away_score']:
        score_home = (row['home_team_id'], 'loserOf', row['match_id'])
        score_away = (row['away_team_id'], 'winnerOf', row['match_id'])
    else:
        score_home = (row['home_team_id'], 'draws', row['match_id'])
        score_away = (row['away_team_id'], 'draws', row['match_id'])

    home_score = (row['match_id'], 'homeScores', np.clip(int(row['home_score']), 0, 5))
    away_score = (row['match_id'], 'awayScores', np.clip(int(row['away_score']), 0, 5))

    tournament = (row['match_id'], 'inTournament', row['tournament_id'])
    city = (row['match_id'], 'inCity', row['city_id'])
    country = (row['match_id'], 'inCountry', row['country'])
    neutral = (row['match_id'], 'isNeutarl', row['neutral'])
    year = (row['match_id'], 'inYear', row['date'][:4])
    
    triples.extend((home_team, away_team, score_home, score_away, home_score, away_score, tournament, city, country, neutral, year))


### Triples are ready
#### Try out and check the triples generated!!!

In [24]:
triples[19: 36]

[('Match1', 'inCountry', 'CountryEngland'),
 ('Match1', 'isNeutarl', 'False'),
 ('Match1', 'inYear', '1873'),
 ('TeamScotland', 'isHomeTeamIn', 'Match2'),
 ('TeamEngland', 'isAwayTeamIn', 'Match2'),
 ('TeamScotland', 'winnerOf', 'Match2'),
 ('TeamEngland', 'loserOf', 'Match2'),
 ('Match2', 'homeScores', 2),
 ('Match2', 'awayScores', 1),
 ('Match2', 'inTournament', 'TournamentFriendly'),
 ('Match2', 'inCity', 'CityGlasgow'),
 ('Match2', 'inCountry', 'CountryScotland'),
 ('Match2', 'isNeutarl', 'False'),
 ('Match2', 'inYear', '1874'),
 ('TeamEngland', 'isHomeTeamIn', 'Match3'),
 ('TeamScotland', 'isAwayTeamIn', 'Match3'),
 ('TeamEngland', 'draws', 'Match3')]

#### Dataframing Triples

In [27]:
triples_df = pd.DataFrame(triples, columns=['subject', 'predicate', 'object'])
triples_df[(triples_df.subject == 'Match2551')| (triples_df.object == 'Match2551')]

Unnamed: 0,subject,predicate,object
28061,TeamBolivia,isHomeTeamIn,Match2551
28062,TeamBrazil,isAwayTeamIn,Match2551
28063,TeamBolivia,loserOf,Match2551
28064,TeamBrazil,winnerOf,Match2551
28065,Match2551,homeScores,0
28066,Match2551,awayScores,3
28067,Match2551,inTournament,TournamentCopaAmérica
28068,Match2551,inCity,CityBuenosAires
28069,Match2551,inCountry,CountryArgentina
28070,Match2551,isNeutarl,True
