In [3]:
import os

import numpy as np
import pandas as pd

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when
# Obtain the Spark session used by every cell below (getOrCreate reuses an
# already-running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Football-bronze")
    .getOrCreate()
)

In [64]:
ls /home/gnana/football-lakehouse/bronze/

[0m[01;34mappearances[0m/  [01;34mclubs[0m/         [01;34mgame_events[0m/   [01;34mgames[0m/              [01;34mplayers[0m/
[01;34mclub_games[0m/   [01;34mcompetitions[0m/  [01;34mgame_lineups[0m/  [01;34mplayer_valuations[0m/  [01;34mtransfers[0m/


In [5]:
# Root directory of the bronze-layer Parquet tables. Every load cell below
# builds its path as f"{bronze_path}/<table>".
# Set FOOTBALL_BRONZE_PATH to run the notebook on another machine; when it is
# unset, the original local path is used, so existing behavior is unchanged.
bronze_path = os.environ.get(
    "FOOTBALL_BRONZE_PATH",
    "/home/gnana/football-lakehouse/bronze",
)

In [6]:
# Load the bronze "players" Parquet dataset into a Spark DataFrame, print the
# schema Spark read from the files, and show the row count as the cell output.
players = spark.read.parquet(f"{bronze_path}/players")
players.printSchema()
players.count()

                                                                                

root
 |-- player_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- last_season: integer (nullable = true)
 |-- current_club_id: integer (nullable = true)
 |-- player_code: string (nullable = true)
 |-- country_of_birth: string (nullable = true)
 |-- city_of_birth: string (nullable = true)
 |-- country_of_citizenship: string (nullable = true)
 |-- date_of_birth: timestamp (nullable = true)
 |-- sub_position: string (nullable = true)
 |-- position: string (nullable = true)
 |-- foot: string (nullable = true)
 |-- height_in_cm: integer (nullable = true)
 |-- contract_expiration_date: timestamp (nullable = true)
 |-- agent_name: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- url: string (nullable = true)
 |-- current_club_domestic_competition_id: string (nullable = true)
 |-- current_club_name: string (nullable = true)
 |-- market_value_in_eur: integer (nullable =

32601

In [11]:
# Per-column NULL tally for `players`: each aggregate counts only the rows in
# which that column is NULL. The one-row Spark result is converted to pandas
# and transposed so every column becomes a row, sorted most-missing first.
(
    players
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in players.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

                                                                                

Unnamed: 0,0
agent_name,16019
contract_expiration_date,12091
country_of_birth,2799
foot,2536
city_of_birth,2455
height_in_cm,2256
first_name,2062
highest_market_value_in_eur,1523
market_value_in_eur,1523
country_of_citizenship,383


In [91]:
# Load the bronze "games" Parquet dataset, print its schema, and show the row
# count as the cell output.
# NOTE(review): the recorded schema lists `aggregate` as timestamp; presumably
# an aggregate-score string that was mis-inferred upstream — confirm at ingestion.
games = spark.read.parquet(f"{bronze_path}/games")
games.printSchema()
games.count()

root
 |-- game_id: integer (nullable = true)
 |-- competition_id: string (nullable = true)
 |-- season: integer (nullable = true)
 |-- round: string (nullable = true)
 |-- date: date (nullable = true)
 |-- home_club_id: integer (nullable = true)
 |-- away_club_id: integer (nullable = true)
 |-- home_club_goals: integer (nullable = true)
 |-- away_club_goals: integer (nullable = true)
 |-- home_club_position: integer (nullable = true)
 |-- away_club_position: integer (nullable = true)
 |-- home_club_manager_name: string (nullable = true)
 |-- away_club_manager_name: string (nullable = true)
 |-- stadium: string (nullable = true)
 |-- attendance: integer (nullable = true)
 |-- referee: string (nullable = true)
 |-- url: string (nullable = true)
 |-- home_club_formation: string (nullable = true)
 |-- away_club_formation: string (nullable = true)
 |-- home_club_name: string (nullable = true)
 |-- away_club_name: string (nullable = true)
 |-- aggregate: timestamp (nullable = true)
 |-- comp

74026

In [118]:
# Per-column NULL tally for `games`: one count aggregate per column that only
# counts rows where the column is NULL, pulled into pandas, transposed so each
# column is a row, and ordered from most to fewest missing values.
(
    games
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in games.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
away_club_position,22467
home_club_position,22467
home_club_name,12850
away_club_name,11455
attendance,9948
home_club_formation,6975
away_club_formation,6806
away_club_manager_name,828
home_club_manager_name,828
referee,652


In [90]:
# Load the bronze "appearances" Parquet dataset, print its schema, and show
# the row count as the cell output.
appearances = spark.read.parquet(f"{bronze_path}/appearances")
appearances.printSchema()
appearances.count()

root
 |-- appearance_id: string (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- player_club_id: integer (nullable = true)
 |-- player_current_club_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- player_name: string (nullable = true)
 |-- competition_id: string (nullable = true)
 |-- yellow_cards: integer (nullable = true)
 |-- red_cards: integer (nullable = true)
 |-- goals: integer (nullable = true)
 |-- assists: integer (nullable = true)
 |-- minutes_played: integer (nullable = true)



1706806

In [88]:
# Per-column NULL tally for `appearances`: count the NULL rows of every column
# in a single Spark pass, then transpose in pandas and sort descending so the
# columns with the most missing values come first.
(
    appearances
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in appearances.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
player_name,6
game_id,0
player_id,0
player_club_id,0
appearance_id,0
player_current_club_id,0
date,0
competition_id,0
yellow_cards,0
red_cards,0


In [93]:
# Load the bronze "club_games" Parquet dataset, print its schema, and show the
# row count as the cell output.
club_games = spark.read.parquet(f"{bronze_path}/club_games")
club_games.printSchema()
club_games.count()

root
 |-- game_id: integer (nullable = true)
 |-- club_id: integer (nullable = true)
 |-- own_goals: integer (nullable = true)
 |-- own_position: integer (nullable = true)
 |-- own_manager_name: string (nullable = true)
 |-- opponent_id: integer (nullable = true)
 |-- opponent_goals: integer (nullable = true)
 |-- opponent_position: integer (nullable = true)
 |-- opponent_manager_name: string (nullable = true)
 |-- hosting: string (nullable = true)
 |-- is_win: integer (nullable = true)



148052

In [97]:
# Per-column NULL tally for `club_games`: each aggregate counts the rows where
# its column is NULL; the one-row result is moved to pandas, transposed so the
# columns become rows, and sorted with the largest counts on top.
(
    club_games
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in club_games.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
own_position,44934
opponent_position,44934
opponent_manager_name,1656
own_manager_name,1656
opponent_goals,24
own_goals,24
club_id,18
opponent_id,18
game_id,0
hosting,0


In [98]:
# Load the bronze "clubs" Parquet dataset, print its schema, and show the row
# count as the cell output.
clubs = spark.read.parquet(f"{bronze_path}/clubs")
clubs.printSchema()
clubs.count()

root
 |-- club_id: integer (nullable = true)
 |-- club_code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- domestic_competition_id: string (nullable = true)
 |-- total_market_value: string (nullable = true)
 |-- squad_size: integer (nullable = true)
 |-- average_age: double (nullable = true)
 |-- foreigners_number: integer (nullable = true)
 |-- foreigners_percentage: double (nullable = true)
 |-- national_team_players: integer (nullable = true)
 |-- stadium_name: string (nullable = true)
 |-- stadium_seats: integer (nullable = true)
 |-- net_transfer_record: string (nullable = true)
 |-- coach_name: string (nullable = true)
 |-- last_season: integer (nullable = true)
 |-- filename: string (nullable = true)
 |-- url: string (nullable = true)



439

In [99]:
# Per-column NULL tally for `clubs`: count NULL rows per column in Spark, then
# transpose the one-row result in pandas and sort it most-missing first.
# NOTE(review): in the recorded run `total_market_value` and `coach_name` are
# NULL in all 439 rows — verify whether these columns are populated upstream.
(
    clubs
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in clubs.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
total_market_value,439
coach_name,439
foreigners_percentage,49
average_age,38
club_id,0
domestic_competition_id,0
name,0
squad_size,0
club_code,0
foreigners_number,0


In [102]:
# Read the bronze "competitions" Parquet dataset into a Spark DataFrame, print
# the schema it was read with, and end on the row count so the cell shows it.
competitions = spark.read.parquet(f"{bronze_path}/competitions")
competitions.printSchema()
competitions.count()

root
 |-- competition_id: string (nullable = true)
 |-- competition_code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- sub_type: string (nullable = true)
 |-- type: string (nullable = true)
 |-- country_id: integer (nullable = true)
 |-- country_name: string (nullable = true)
 |-- domestic_league_code: string (nullable = true)
 |-- confederation: string (nullable = true)
 |-- url: string (nullable = true)
 |-- is_major_national_league: boolean (nullable = true)



44

In [103]:
# Per-column NULL tally for `competitions`: one NULL-only count per column,
# converted to pandas, transposed so each column is a row, and sorted so the
# columns with the most missing values are listed first.
(
    competitions
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in competitions.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
country_name,8
domestic_league_code,8
competition_id,0
name,0
competition_code,0
type,0
sub_type,0
country_id,0
confederation,0
url,0


In [104]:
# Load the bronze "game_events" Parquet dataset, print its schema, and show
# the row count as the cell output.
game_events = spark.read.parquet(f"{bronze_path}/game_events")
game_events.printSchema()
game_events.count()

root
 |-- game_event_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- club_id: integer (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- player_in_id: integer (nullable = true)
 |-- player_assist_id: integer (nullable = true)



1035043

In [105]:
# Per-column NULL tally for `game_events`: count the NULL rows of each column
# in one Spark aggregation, then transpose in pandas and sort descending by
# the count.
(
    game_events
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in game_events.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
player_assist_id,878284
player_in_id,537365
description,87327
game_event_id,0
date,0
game_id,0
club_id,0
type,0
minute,0
player_id,0


In [107]:
# Load the bronze "game_lineups" Parquet dataset, print its schema, and show
# the row count as the cell output.
# NOTE(review): the recorded schema has `date` as string here, while the other
# tables' `date` columns are date-typed — confirm whether a cast is needed.
game_lineups = spark.read.parquet(f"{bronze_path}/game_lineups")
game_lineups.printSchema()
game_lineups.count()

root
 |-- game_lineups_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- club_id: integer (nullable = true)
 |-- player_name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- position: string (nullable = true)
 |-- number: string (nullable = true)
 |-- team_captain: integer (nullable = true)



2285289

In [108]:
# Per-column NULL tally for `game_lineups`: each aggregate counts rows where
# its column is NULL; the result is transposed in pandas and sorted so the
# most-missing columns come first.
# NOTE(review): the recorded run shows the same NULL count (93378) across
# game_id, player_id, club_id and others — possibly malformed rows; verify.
(
    game_lineups
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in game_lineups.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

                                                                                

Unnamed: 0,0
team_captain,140067
position,93381
player_name,93378
club_id,93378
game_id,93378
player_id,93378
type,93378
number,93378
date,46689
game_lineups_id,0


In [109]:
# Load the bronze "player_valuations" Parquet dataset (kept as `player_val`),
# print its schema, and show the row count as the cell output.
player_val = spark.read.parquet(f"{bronze_path}/player_valuations")
player_val.printSchema()
player_val.count()

root
 |-- player_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- market_value_in_eur: integer (nullable = true)
 |-- current_club_id: integer (nullable = true)
 |-- player_club_domestic_competition_id: string (nullable = true)



496606

In [110]:
# Per-column NULL tally for `player_val`: one NULL-only count per column,
# brought into pandas, transposed so columns become rows, and sorted with the
# highest counts first.
(
    player_val
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in player_val.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
player_id,0
date,0
market_value_in_eur,0
current_club_id,0
player_club_domestic_competition_id,0


In [113]:
# Load the bronze "transfers" Parquet dataset, print its schema, and show the
# row count as the cell output.
transfers = spark.read.parquet(f"{bronze_path}/transfers")
transfers.printSchema()
transfers.count()

root
 |-- player_id: integer (nullable = true)
 |-- transfer_date: date (nullable = true)
 |-- transfer_season: string (nullable = true)
 |-- from_club_id: integer (nullable = true)
 |-- to_club_id: integer (nullable = true)
 |-- from_club_name: string (nullable = true)
 |-- to_club_name: string (nullable = true)
 |-- transfer_fee: double (nullable = true)
 |-- market_value_in_eur: double (nullable = true)
 |-- player_name: string (nullable = true)



79646

In [114]:
# Per-column NULL tally for `transfers`: count the NULL rows of every column
# in a single Spark pass, then transpose the one-row result in pandas and sort
# it from most to fewest missing values.
(
    transfers
    .select([
        count(when(col(field).isNull(), field)).alias(field)
        for field in transfers.columns
    ])
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
market_value_in_eur,30316
transfer_fee,27715
transfer_date,0
player_id,0
transfer_season,0
from_club_id,0
from_club_name,0
to_club_id,0
to_club_name,0
player_name,0
