In [None]:
from modules.module_for_imports import *

# Inconsistent values removal
> ### In this file we will find out how to remove all the elements we don't want in the SQL server.
1. We clean up the players, checking the **current_club_id** column
2. We clean up BOTH **game_events** and **club_games**, checking both foreign keys of each table
    - ***game_events***: player_id and game_id
    - ***club_games***: game_id and club_id
        
---
## Results:
- `game_events.player_id` data are **NOT** consistent
- `club_games.club_id` data are **NOT** consistent -> we will also have to remove the other tuple with the same `game_id`
- All other data checked are consistent

In [None]:
# Location handed to every loader in this cell.
loc = '../'
games = clean_games(get_games(loc))
clubs = clean_clubs(get_clubs(loc))
# players will be modified!
players = clean_players(get_players(loc))

---
# Defining the functions used in this section

In [None]:
def check_consistency(df1, df2, column_name1, column_name2):
    """Keep the rows of ``df1`` whose key value also appears in ``df2``.

    Parameters
    ----------
    df1 : DataFrame to filter.
    df2 : DataFrame providing the set of accepted key values.
    column_name1 : name of the key column in ``df1``.
    column_name2 : name of the key column in ``df2``.

    Returns
    -------
    The rows of ``df1`` whose ``column_name1`` value is present in
    ``df2[column_name2]``. Prints 'Modified.' when at least one row was
    dropped and 'Not modified.' otherwise.
    """
    has_match = df1[column_name1].isin(df2[column_name2])
    result = df1[has_match]
    if len(result) == len(df1):
        print('Not modified.')
    else:
        print('Modified.')
    return result

---
# 1. Cleaning players 
- For now, every `current_club_id` in players is consistent with the clubs table.

In [None]:
# Summarise the players frame before checking its foreign key.
players.info()

In [None]:
# Rebind `players` so the later checks only see rows whose current_club_id exists in clubs.
players = check_consistency(players, clubs, 'current_club_id', 'club_id') # Not modified.

---
# 2.a Cleaning game_events
- `player_id` in game_events is **not** consistent with the players table.
- `game_id` is consistent with the games table.

In [None]:
# Load and clean game_events ('../' is the location handed to the loader), then show its summary.
game_events = clean_game_events(get_game_events('../'))
game_events.info()

In [None]:
# Rows of game_events whose player_id exists in players.
ge_consistency = check_consistency(game_events, players, 'player_id', 'player_id') # Modified.
# The complement: rows whose player_id is absent from the consistent subset.
neg_game_events = game_events[~game_events['player_id'].isin(ge_consistency['player_id'])]
print('Inconsistent rows:', neg_game_events.shape[0], 'of', game_events.shape[0])

In [None]:
# It seems that there are player_id's in game_events that are not in players:
game_events[~game_events['player_id'].isin(players['player_id'])]

In [None]:
# Drop the reference to players; the rest of section 2 does not use it.
players = None

In [None]:
# Keep only the game_events rows whose game_id exists in games, then show the resulting shape.
game_events = check_consistency(game_events, games, 'game_id', 'game_id') # Not modified.
game_events.shape

In [None]:
# Drop the reference to game_events; no later cell shown here uses it.
game_events = None

---
# 2.b Cleaning club_games... 
- `game_id` is consistent with games table
- `club_id` is **NOT** consistent with clubs table

In [None]:
# Load and clean club_games ('../' is passed to both the loader and the cleaner), then show its summary.
club_games = clean_club_games(get_club_games('../'), '../')
club_games.info()

In [None]:
# True when every game_id in games also appears in club_games.
bool(games['game_id'].isin(club_games['game_id']).all())

In [None]:
# Preview the club_games rows whose game_id exists in games.
check_consistency(club_games, games, 'game_id', 'game_id').head() # Not modified.

In [None]:
# Rows of club_games whose club_id exists in clubs.
cg_consistency = check_consistency(club_games, clubs, 'club_id', 'club_id') # Modified.

In [None]:
# Rows of club_games whose club_id is absent from the consistent subset.
neg_club_games = club_games[~club_games['club_id'].isin(cg_consistency['club_id'])]
print('Inconsistent rows:', neg_club_games.shape[0], 'of', club_games.shape[0])

In [None]:
# Number of club_games rows that share a game_id with an inconsistent row
# (the inconsistent rows themselves included).
int(club_games['game_id'].isin(neg_club_games['game_id']).sum())

In [None]:
# Drop the references to the frames used in section 2.
games = None
clubs = None
club_games = None

# Order of removal:
1. `game_id` from **club_games** where `club_id` is not found in **clubs**
2. `game_id` from **games** according to **club_games** foreign_keys
3. `game_id` from **game_events** according to **games** foreign_keys
4. `player_id` from **game_events** according to **players** foreign_keys

---
# Making Appearances check

In [None]:
# Reload the frames needed for the appearances check.
loc = '../'
app = clean_appearances(get_appearances(loc), loc)
# NOTE(review): the result of clean_club_games is indexed with [1] here, but the
# club_games cell in section 2.b uses the same call's result directly
# (club_games.info()). Confirm which return shape is correct, and whether games
# should come from clean_games(get_games(loc)) instead.
games = clean_club_games(get_club_games(loc), loc)[1]
players = clean_players(get_players(loc))
app.info()

In [None]:
# Total number of appearances rows, followed by the number whose game_id exists in games.
print(app.shape[0])
check_consistency(app, games, 'game_id', 'game_id').shape[0]

In [None]:
# Number of appearances rows whose player_id exists in players.
check_consistency(app, players, 'player_id', 'player_id').shape[0]

# We should remove 63k tuples for consistency