In [9]:
import os
import re
from typing import List
import joblib
import logging
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd


from object_types import CAREER_FRAME, DRAFT, WIKI_PLAYER

# Relative path for merged player data.
# NOTE(review): presumably an output directory used by later cells — confirm.
MERGED_DIR = './players_merged'

# Root logger: INFO and above, timestamped "<time> - <level> - <message>" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Single shared Spark session for the whole notebook. Each key is set exactly
# once: the builder keeps only the last value given for a key, so the former
# duplicate "spark.driver.maxResultSize" = "2g" entry was silently overridden
# by "4g" and has been removed; the effective configuration is unchanged.
# NOTE(review): with master "local[*]" the work runs inside the driver process,
# so the spark.executor.* settings below may have no effect — confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Parse data")
    .config("spark.driver.host", "0.0.0.0")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.shuffle.service.enabled", "false")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.sql.execution.pythonUTF8StringEncoding", "true")
    .getOrCreate()
)

In [27]:
def load_df(file_path = './processed_pages.joblib') -> DataFrame:
    """Read a joblib dump of a pandas DataFrame and convert it to a Spark DataFrame.

    Args:
        file_path: Location of the joblib file to read.

    Returns:
        The Spark DataFrame created by the notebook-level ``spark`` session
        from the loaded object.
    """
    # joblib.load unpickles the file, so it should only be pointed at trusted dumps.
    pandas_frame: pd.DataFrame = joblib.load(file_path)
    spark_frame = spark.createDataFrame(pandas_frame)
    return spark_frame

# Load the two source tables (wiki first, then html) as Spark DataFrames.
wiki_df = load_df(file_path='./wiki_df.joblib')
html_df = load_df(file_path='./html_df.joblib')

In [28]:
# describe() returns a new DataFrame of summary statistics; displayed bare, the
# cell only shows that frame's schema (every column typed as string), not the
# statistics themselves.
# NOTE(review): append .show() if the actual numbers are wanted — confirm intent.
html_df.describe()

DataFrame[summary: string, file_path: string, download_url: string, player_name: string, dob: string, draft_team: string, position: string, hand: string, height: string, weight: string, games_played: string, wins: string, losses: string, ties_ot_losses: string, minutes: string, shootouts: string, gaa: string, save_percentage: string, goals: string, assists: string, points: string, plus_minus: string, point_shares: string, penalty_minutes: string, shots_on_goal: string, game_winning_goals: string]

In [30]:
# describe() returns a new DataFrame of summary statistics; displayed bare, the
# cell only shows that frame's schema (every column typed as string), not the
# statistics themselves.
# NOTE(review): append .show() if the actual numbers are wanted — confirm intent.
wiki_df.describe()

DataFrame[summary: string, full_name: string, birthplace: string, career_start: string, career_end: string, draft: string, draft_year: string, draft_team: string, current_league: string, national_team: string, current_team: string, nationality: string]

In [31]:
# Number of rows in html_df, displayed as the cell output.
html_df.count()

                                                                                

12165

In [32]:
# Number of rows in wiki_df, displayed as the cell output.
wiki_df.count()

19722

In [35]:
# Full outer join of the two player tables on the player's name, so rows found
# in only one source are kept (with nulls for the other side's columns).
# NOTE(review): both inputs have a `draft_team` column, so merged_df ends up
# with two columns of that name; selecting it by name may be ambiguous —
# confirm and rename one side if later cells need it.
merged_df = html_df.join(
    wiki_df,
    on=wiki_df['full_name'] == html_df['player_name'],
    how="full",
)

In [36]:
# Number of rows after the full outer join, displayed as the cell output.
# NOTE(review): names repeated on either side would multiply matched rows —
# consider checking key uniqueness before relying on this figure.
merged_df.count()

                                                                                

27042

In [44]:
# Expose the merged frame to Spark SQL under a temporary view name.
temp_view_name = "players"
merged_df.createOrReplaceTempView(temp_view_name)

# Preview up to five rows that matched on both sides of the join
# (both name columns are non-null).
spark.sql(
    f"select * from {temp_view_name} where player_name is not null and full_name is not null limit 5"
).show()

                                                                                

+--------------------+--------------------+--------------------+------------------+----------+---------+-----+------+------+------------+-----+------+--------------+-------+---------+---+---------------+-----+-------+------+----------+-------------------+---------------+-------------+------------------+--------------------+--------------------+------------+----------+--------------------+----------+--------------------+--------------------+-------------+-------------------+-----------+
|           file_path|        download_url|         player_name|               dob|draft_team| position| hand|height|weight|games_played| wins|losses|ties_ot_losses|minutes|shootouts|gaa|save_percentage|goals|assists|points|plus_minus|       point_shares|penalty_minutes|shots_on_goal|game_winning_goals|           full_name|          birthplace|career_start|career_end|               draft|draft_year|          draft_team|      current_league|national_team|       current_team|nationality|
+-----------------