<a href="https://colab.research.google.com/github/Krutik-Patel/ChatGPT_Scraper/blob/main/KGE_CLustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clustering and Classification using Knowledge Graphs Embeddings (KGEs)

## Requirements

In [1]:
pip install ampligraph

Collecting ampligraph
  Downloading ampligraph-2.0.1-py3-none-any.whl (204 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.0/204.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting myst-parser==0.18.0 (from ampligraph)
  Downloading myst_parser-0.18.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docutils<0.18 (from ampligraph)
  Downloading docutils-0.17.1-py2.py3-none-any.whl (575 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m575.5/575.5 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sphinx-rtd-theme==1.0.0 (from ampligraph)
  Downloading sphinx_rtd_theme-1.0.0-py2.py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sphinxcontrib-bibtex==2.4.2 (from ampligraph)
  Downloading sphinxcontrib_bibtex-

In [2]:
import numpy as np
import pandas as pd
import ampligraph

## Dataset

In [3]:
import requests

url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/football_graph.csv'
open('football_results.csv', 'wb').write(requests.get(url).content)

3033782

In [4]:
df = pd.read_csv('football_results.csv').sort_values('date')
df.isna().sum()

date          0
home_team     0
away_team     0
home_score    2
away_score    2
tournament    0
city          0
country       0
neutral       0
dtype: int64

In [5]:
df.shape

(40773, 9)

In [6]:
df[df['tournament'] != 'Friendly']

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
29,1884-01-26,Northern Ireland,Scotland,0.0,5.0,British Championship,Belfast,Republic of Ireland,False
30,1884-02-09,Wales,Northern Ireland,6.0,0.0,British Championship,Wrexham,Wales,False
31,1884-02-23,Northern Ireland,England,1.0,8.0,British Championship,Belfast,Republic of Ireland,False
32,1884-03-15,Scotland,England,1.0,0.0,British Championship,Glasgow,Scotland,False
33,1884-03-17,Wales,England,0.0,4.0,British Championship,Wrexham,Wales,False
...,...,...,...,...,...,...,...,...,...
40768,2019-07-11,Madagascar,Tunisia,0.0,3.0,African Cup of Nations,Cairo,Egypt,True
40769,2019-07-14,Algeria,Nigeria,2.0,1.0,African Cup of Nations,Cairo,Egypt,True
40770,2019-07-14,Senegal,Tunisia,1.0,0.0,African Cup of Nations,Cairo,Egypt,True
40771,2019-07-17,Tunisia,Nigeria,,,African Cup of Nations,Cairo,Egypt,True


In [7]:
df[df != df.isna()]

  output = repr(obj)
  df_html=dataframe._repr_html_(),  # pylint: disable=protected-access


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,,,Friendly,Glasgow,Scotland,
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,
4,1876-03-04,Scotland,England,3.0,,Friendly,Glasgow,Scotland,
...,...,...,...,...,...,...,...,...,...
40768,2019-07-11,Madagascar,Tunisia,,3.0,African Cup of Nations,Cairo,Egypt,True
40769,2019-07-14,Algeria,Nigeria,2.0,1.0,African Cup of Nations,Cairo,Egypt,True
40770,2019-07-14,Senegal,Tunisia,1.0,,African Cup of Nations,Cairo,Egypt,True
40771,2019-07-17,Tunisia,Nigeria,,,African Cup of Nations,Cairo,Egypt,True


In [8]:
df.dropna(inplace=True)
df.shape

(40771, 9)

## Splitting Test and Train Datasets

In [9]:
df['train'] = df.date < '2014-01-01'
df.train.value_counts()

True     35714
False     5057
Name: train, dtype: int64

#### Making sure the Team and Geography entities are different (eg. Brazil Team and Brazil country)

In [10]:
df['match_id'] = df.index.values.astype(str)
df['match_id'] = "Match" + df.match_id
df['city_id'] = "City" + df.city.str.title().str.replace(" ", "")
df['country'] = "Country" + df.country.str.title().str.replace(" ", "")
df['home_team_id'] = "Team" + df.home_team.str.title().str.replace(" ", "")
df['away_team_id'] = "Team" + df.away_team.str.title().str.replace(" ", "")
df['tournament_id'] = "Tournament" + df.tournament.str.title().str.replace(" ", "")
df['neutral'] = df.neutral.astype(str)
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,train,match_id,city_id,home_team_id,away_team_id,tournament_id
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,CountryScotland,False,True,Match0,CityGlasgow,TeamScotland,TeamEngland,TournamentFriendly
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,CountryEngland,False,True,Match1,CityLondon,TeamEngland,TeamScotland,TournamentFriendly
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,CountryScotland,False,True,Match2,CityGlasgow,TeamScotland,TeamEngland,TournamentFriendly
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,CountryEngland,False,True,Match3,CityLondon,TeamEngland,TeamScotland,TournamentFriendly
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,CountryScotland,False,True,Match4,CityGlasgow,TeamScotland,TeamEngland,TournamentFriendly


### Triples Generation

In [11]:
triples = []
for _, row in df[df['train']].iterrows():
    home_team = (row['home_team_id'], 'isHomeTeamIn', row["match_id"])
    away_team = (row['away_team_id'], 'isAwayTeamIn', row['match_id'])

    if row['home_score'] > row['away_score']:
        score_home = (row['home_team_id'], 'winnerOf', row['match_id'])
        score_away = (row['away_team_id'], 'loserOf', row['match_id'])
    elif row['home_score'] < row['away_score']:
        score_home = (row['home_team_id'], 'loserOf', row['match_id'])
        score_away = (row['away_team_id'], 'winnerOf', row['match_id'])
    else:
        score_home = (row['home_team_id'], 'draws', row['match_id'])
        score_away = (row['away_team_id'], 'draws', row['match_id'])

    home_score = (row['match_id'], 'homeScores', np.clip(int(row['home_score']), 0, 5))
    away_score = (row['match_id'], 'awayScores', np.clip(int(row['away_score']), 0, 5))

    tournament = (row['match_id'], 'inTournament', row['tournament_id'])
    city = (row['match_id'], 'inCity', row['city_id'])
    country = (row['match_id'], 'inCountry', row['country'])
    neutral = (row['match_id'], 'isNeutarl', row['neutral'])
    year = (row['match_id'], 'inYear', row['date'][:4])

    triples.extend((home_team, away_team, score_home, score_away, home_score, away_score, tournament, city, country, neutral, year))


### Triples are ready
#### Try out and check the triples generated!!!

In [12]:
triples[19: 36]

[('Match1', 'inCountry', 'CountryEngland'),
 ('Match1', 'isNeutarl', 'False'),
 ('Match1', 'inYear', '1873'),
 ('TeamScotland', 'isHomeTeamIn', 'Match2'),
 ('TeamEngland', 'isAwayTeamIn', 'Match2'),
 ('TeamScotland', 'winnerOf', 'Match2'),
 ('TeamEngland', 'loserOf', 'Match2'),
 ('Match2', 'homeScores', 2),
 ('Match2', 'awayScores', 1),
 ('Match2', 'inTournament', 'TournamentFriendly'),
 ('Match2', 'inCity', 'CityGlasgow'),
 ('Match2', 'inCountry', 'CountryScotland'),
 ('Match2', 'isNeutarl', 'False'),
 ('Match2', 'inYear', '1874'),
 ('TeamEngland', 'isHomeTeamIn', 'Match3'),
 ('TeamScotland', 'isAwayTeamIn', 'Match3'),
 ('TeamEngland', 'draws', 'Match3')]

#### Dataframing Triples

In [13]:
triples_df = pd.DataFrame(triples, columns=['subject', 'predicate', 'object'])
triples_df[(triples_df.subject == 'Match2551')| (triples_df.object == 'Match2551')]

Unnamed: 0,subject,predicate,object
28061,TeamBolivia,isHomeTeamIn,Match2551
28062,TeamBrazil,isAwayTeamIn,Match2551
28063,TeamBolivia,loserOf,Match2551
28064,TeamBrazil,winnerOf,Match2551
28065,Match2551,homeScores,0
28066,Match2551,awayScores,3
28067,Match2551,inTournament,TournamentCopaAmérica
28068,Match2551,inCity,CityBuenosAires
28069,Match2551,inCountry,CountryArgentina
28070,Match2551,isNeutarl,True


#### Splitting Dataset in Train and Test

In [14]:
from ampligraph.evaluation import train_test_split_no_unseen

X_train, X_test = train_test_split_no_unseen(np.array(triples), test_size=10000)
print(X_train.shape, X_test.shape)

(382854, 3) (10000, 3)


#### Training The Model for Knowledge Graph Embeddings

In [15]:
from ampligraph.latent_features import ScoringBasedEmbeddingModel

model = ScoringBasedEmbeddingModel(
    k=100,
    eta=20,
    scoring_type='ComplEx'
)

#### Compiling the model

In [16]:
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from tensorflow.keras.optimizers import Adam


optimizer = Adam(learning_rate=1e-4)
regularizer = get_regularizer('LP', {'p': 3, 'lambda': 1e-5})
loss = get_loss('multiclass_nll')

model.compile(
    loss=loss,
    optimizer=optimizer,
    entity_relation_regularizer=regularizer,
    entity_relation_initializer='glorot_uniform'
)

#### Training the model

#### Setting up GPU Environment

In [17]:
import tensorflow as tf

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
model.fit(X_train, batch_size=10000, epochs=200, verbose=True)

Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 1

In [None]:
model.save()