# Project - Option 1

## Task 1

In [421]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.feature import Imputer,StandardScaler,StringIndexer,OneHotEncoder, VectorAssembler
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import numpy as np
import os
import sys
from pyspark.sql.functions import udf, col, when
from pyspark.sql.types import ArrayType, FloatType, StringType
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType

In [2]:
appName = "Big Data Analytics"
master = "local[*]"

# Create Configuration object for Spark.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .setAppName(appName)\
    .setMaster(master)

spark = SparkSession.builder.config(conf = conf).getOrCreate()

24/11/09 14:52:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [6]:
db_properties={}
db_properties['username']="postgres"
db_properties['password']=""
db_properties['url']= "jdbc:postgresql://localhost:5432/postgres"
db_properties['table']="fifa.fifa"
db_properties['driver']="org.postgresql.Driver"

In [7]:
df_male = spark.read.csv('./Data/players_15.csv', header = True)
combined_df = df_male.withColumn("year", lit(2015))
combined_df = combined_df.withColumn("record_id", monotonically_increasing_id()).select("record_id", *combined_df.columns)
combined_df = combined_df.withColumn("gender", lit("Male"))
folder_path = "./Data"
for file_name in os.listdir(folder_path):
    if file_name == "players_15.csv":
        continue
    year = "20" + file_name[-6:-4]
    file_path = os.path.join(folder_path, file_name)
    df_read = spark.read.csv(file_path, header = True)
    df_read = df_read.withColumn("year", lit(int(year)))
    df_read = df_read.withColumn("record_id", monotonically_increasing_id()).select("record_id", *df_read.columns)
    if "female" in file_name:
        df_read = df_read.withColumn("gender", lit("Female"))
    else:
        df_read = df_read.withColumn("gender", lit("Male"))
    combined_df = combined_df.union(df_read)

In [8]:
# Write to PostgreSQL
combined_df.write.format("jdbc")\
.mode("overwrite")\
.option("url", db_properties['url'])\
.option("dbtable", db_properties['table'])\
.option("user", db_properties['username'])\
.option("password", db_properties['password'])\
.option("Driver", db_properties['driver'])\
.save()

                                                                                

In [9]:
# Read from PostgreSQL to verify
df_from_postgres = spark.read \
    .format("jdbc") \
    .option("url", db_properties['url'])\
    .option("dbtable", db_properties['table'])\
    .option("user", db_properties['username'])\
    .option("password", db_properties['password'])\
    .option("Driver", db_properties['driver'])\
    .load()

In [10]:
df_from_postgres.columns

['record_id',
 'sofifa_id',
 'player_url',
 'short_name',
 'long_name',
 'player_positions',
 'overall',
 'potential',
 'value_eur',
 'wage_eur',
 'age',
 'dob',
 'height_cm',
 'weight_kg',
 'club_team_id',
 'club_name',
 'league_name',
 'league_level',
 'club_position',
 'club_jersey_number',
 'club_loaned_from',
 'club_joined',
 'club_contract_valid_until',
 'nationality_id',
 'nationality_name',
 'nation_team_id',
 'nation_position',
 'nation_jersey_number',
 'preferred_foot',
 'weak_foot',
 'skill_moves',
 'international_reputation',
 'work_rate',
 'body_type',
 'real_face',
 'release_clause_eur',
 'player_tags',
 'player_traits',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic',
 'attacking_crossing',
 'attacking_finishing',
 'attacking_heading_accuracy',
 'attacking_short_passing',
 'attacking_volleys',
 'skill_dribbling',
 'skill_curve',
 'skill_fk_accuracy',
 'skill_long_passing',
 'skill_ball_control',
 'movement_acceleration',
 'movement_sprint_speed',


In [11]:
df_from_postgres.count()

144323

## Task 2

In [12]:
def read_from_spark(spark, db_properties):
    df_from_postgres = spark.read \
        .format("jdbc") \
        .option("url", db_properties['url'])\
        .option("dbtable", db_properties['table'])\
        .option("user", db_properties['username'])\
        .option("password", db_properties['password'])\
        .option("Driver", db_properties['driver'])\
        .load()
    df = df_from_postgres.filter(df_from_postgres["gender"] == "Male")
    return df

In [13]:
def get_top_clubs_with_contracts_ending(spark, db_properties, X, Y, Z):
    df = read_from_spark(spark, db_properties)
    df_filtered = df.filter(col("year") == X)
    df_expiring = df_filtered.filter(col("club_contract_valid_until").cast("int") >= Z)
    result = df_expiring.groupBy("club_name") \
        .count() \
        .orderBy(col("count").desc()) \
        .limit(Y)
    return result.collect()

In [14]:
def find_clubs_by_average_age(spark, db_properties, X, Y, highest=True):
    if X <= 0:
        return "X must be a positive integer"
    if Y < 2015 or Y > 2022:
        return "Y must be a year between 2015 and 2022 inclusively"
    df = read_from_spark(spark, db_properties)
    
    # Filter data for specified year Y
    df_filtered = df.filter(col("year") == Y)
    avg_age_per_club = df_filtered.groupBy("club_name") \
        .agg(round(avg("age").cast("float"),2).alias("average_age"))
    if highest:
        sorted_clubs = avg_age_per_club.orderBy(desc("average_age"))
    else:
        sorted_clubs = avg_age_per_club.orderBy(asc("average_age"))

    top_clubs = sorted_clubs.limit(X)
    last_club = top_clubs.collect()[-1]
    threshold_age = last_club["average_age"]
    if highest:
        result_clubs = sorted_clubs.filter(col("average_age") >= threshold_age).collect()
    else:
        result_clubs = sorted_clubs.filter(col("average_age") <= threshold_age).collect()
    return result_clubs

In [15]:
def get_most_popular_nationality(spark, db_properties):
    df = read_from_spark(spark, db_properties)
    # df_filtered = df.filter((col("year") >= 2015) & (col("year") <= 2022))
    nationality_counts = df.groupBy("year", "nationality_name") \
        .agg(count("*").alias("count"))
    # Create a window partitioned by year and ordered by count descending
    window = Window.partitionBy("year").orderBy(desc("count"))
    
    # Add row number within each year partition
    ranked_nationalities = nationality_counts.withColumn("rank", row_number().over(window))
    # Filter for the top nationality for each year
    most_popular_nationalities = ranked_nationalities.filter(col("rank") == 1) \
        .select("year", "nationality_name", "count") \
        .orderBy("year")
    
    return most_popular_nationalities.collect()

<h1>Task-2.1</h1>

In [16]:
top_clubs = get_top_clubs_with_contracts_ending(spark=spark, db_properties=db_properties, X=2021, Y=10, Z=2023)

In [17]:
top_clubs

[Row(club_name='GwangJu FC', count=28),
 Row(club_name='Zamora Fútbol Club', count=27),
 Row(club_name='Club Plaza de Deportes Colonia', count=27),
 Row(club_name='SL Benfica', count=26),
 Row(club_name='Club Deportivo El Nacional', count=26),
 Row(club_name='Sociedad Deportiva Aucas', count=26),
 Row(club_name='Gangwon FC', count=26),
 Row(club_name='Club Atlético Nacional Potosí', count=26),
 Row(club_name='Busan IPark', count=26),
 Row(club_name='Club Sportivo Luqueño', count=25)]

<h1>Task-2.2</h1>

In [18]:
clubs_by_age = find_clubs_by_average_age(spark=spark, db_properties=db_properties, X=10, Y=2017, highest=False)

In [19]:
clubs_by_age

[Row(club_name='Sevilla Atlético', average_age=19.920000076293945),
 Row(club_name='Swindon Town', average_age=21.3700008392334),
 Row(club_name='CD Huachipato', average_age=21.40999984741211),
 Row(club_name='FC Nordsjælland', average_age=21.40999984741211),
 Row(club_name='FC Twente', average_age=21.59000015258789),
 Row(club_name='Envigado FC', average_age=21.610000610351562),
 Row(club_name='KRC Genk', average_age=21.6299991607666),
 Row(club_name='Crewe Alexandra', average_age=21.81999969482422),
 Row(club_name='Barnsley', average_age=21.8700008392334),
 Row(club_name='Ajax', average_age=21.969999313354492)]

<h1>Task-2.3</h1>

In [20]:
popular_nationalities = get_most_popular_nationality(spark=spark, db_properties=db_properties)

In [21]:
popular_nationalities

[Row(year=2015, nationality_name='England', count=1627),
 Row(year=2016, nationality_name='England', count=1519),
 Row(year=2017, nationality_name='England', count=1627),
 Row(year=2018, nationality_name='England', count=1633),
 Row(year=2019, nationality_name='England', count=1625),
 Row(year=2020, nationality_name='England', count=1670),
 Row(year=2021, nationality_name='England', count=1685),
 Row(year=2022, nationality_name='England', count=1719)]

<h1>Task-3</h1>

In [467]:
spark_df = read_from_spark(spark, db_properties)

In [468]:
# Define UDF for safe evaluation
def safe_eval(x):
    try:
        return float(eval(str(x)))
    except:
        return None

In [469]:
# Define UDF for trait assignment
def assign_traits(player_positions, *traits):
    traits = [float(t) if t is not None else None for t in traits]
    if player_positions and 'GK' in player_positions:
        return traits[6:12]
    else:
        return traits[:6]

In [470]:
def data_engineering_pipeline(spark_df):
    # Drop columns
    columns_to_drop = ['record_id', 'sofifa_id', 'player_url', 'short_name', 'long_name',
                       'dob', 'club_name', 'league_name', 'club_position', 'club_jersey_number', 'club_loaned_from',
                       'club_contract_valid_until', 'nationality_id', 'nationality_name', 'nation_team_id',
                       'nation_position', 'nation_jersey_number', 'real_face', 'player_tags', 'player_traits', 
                       'player_face_url', 'club_logo_url', 'club_flag_url', 'nation_logo_url', 'nation_flag_url',
                       'year', 'gender', 'club_joined']

    positions = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
             'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
             'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
             'rcb', 'rb', 'gk']
    attacking_positions = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
                           'lam', 'cam', 'ram']
    midfield_positions = ['lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
                          'ldm', 'cdm', 'rdm']
    defensive_positions = ['rwb', 'lb', 'lcb', 'cb', 'lwb', 'rcb', 'rb',
                           'ldm', 'cdm', 'rdm']
    gk_positions = ['gk']

    # Define main traits
    main_traits_to_merge = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 
                            'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
                            'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']

    # List of skill columns to average
    skill_columns = ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control']

    #Trait1 = pace/goalkeeping_diving
    #Trait2 = shooting/goalkeeping_handling
    #Trait3 = passing/goalkeeping_kicking
    #Trait4 = dribbling/goalkeeping_positioning
    #Trait5 = defending/goalkeeping_reflexes
    #Trait6 = physic/goalkeeping_speed
    
    safe_eval_udf = F.udf(safe_eval, FloatType())
    assign_traits_udf = udf(assign_traits, ArrayType(FloatType()))
    
    #Drop Columns
    spark_df = spark_df.drop(*columns_to_drop)
    
    # Apply safe_eval to position columns
    for pos in positions:
        spark_df = spark_df.withColumn(pos, safe_eval_udf(pos))

    #Add new columns 'average_val_attacking', 'average_val_midfield', 'average_val_defensive', 'average_val_gk'
    spark_df = spark_df.withColumn(
        'average_val_attacking', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in attacking_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in attacking_positions]))
        )
    )
    spark_df = spark_df.withColumn(
        'average_val_midfield', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in midfield_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in midfield_positions]))
        )
    )
    spark_df = spark_df.withColumn(
        'average_val_defensive', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in defensive_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in defensive_positions]))
        )
    )
    spark_df = spark_df.withColumn(
        'average_val_gk', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in gk_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in gk_positions]))
        )
    )

    # Apply trait assignment
    spark_df = spark_df.withColumn('traits', 
        assign_traits_udf(col('player_positions'), *[col(trait) for trait in main_traits_to_merge])
    )
    
    # Extract individual traits
    for i in range(1, 7):
        spark_df = spark_df.withColumn(f'trait{i}', col('traits').getItem(i-1))

    # Drop unnecessary columns
    spark_df = spark_df.drop(*positions, *main_traits_to_merge, 'player_positions', 'traits')
    
    # Drop rows with null values in specific columns
    spark_df = spark_df.na.drop(subset=['wage_eur', 'value_eur', 'trait6'])
    
    # Drop additional columns
    spark_df = spark_df.drop('release_clause_eur', 'league_level')
    
    # Fill null values in club_team_id with -1
    spark_df = spark_df.fillna({'club_team_id': -1})

    #Average out skills as highly correlated
    spark_df = spark_df.withColumn(
        'corr_av_skills', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in skill_columns]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in skill_columns]))
        )
    )
    #Drop the skills as added one new average skill column
    spark_df = spark_df.drop('skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control')

    #Drop movement_acceleration as highly correlated to movement_speed
    spark_df = spark_df.drop('movement_acceleration')

    #Drop movement_acceleration as highly correlated to movement_speed
    spark_df = spark_df.drop('preferred_foot')

    s = spark_df.toPandas()
    print(s)
    print(f'Shape {s.shape}')
    return s

In [471]:
s = data_engineering_pipeline(spark_df)

                                                                                

       overall potential    value_eur  wage_eur age height_cm weight_kg  \
0           93        93  103500000.0  560000.0  33       170        72   
1           92        92   63000000.0  220000.0  35       187        83   
2           91        91  111000000.0  240000.0  31       184        80   
3           91        91  132000000.0  270000.0  28       175        68   
4           91        91  129000000.0  370000.0  29       181        70   
...        ...       ...          ...       ...  ..       ...       ...   
140176      59        62     160000.0    3000.0  25       183        76   
140177      59        67     230000.0    2000.0  22       177        75   
140178      59        73     350000.0    2000.0  20       176        70   
140179      59        67     240000.0    3000.0  23       169        57   
140180      59        63     160000.0    3000.0  25       176        74   

       club_team_id weak_foot skill_moves  ... average_val_midfield  \
0             241.0         

From correlation analysis
average - skill columns
remove - movement_acceleration
Preffered foot is not correlated to overall. so drop