# 1. Imports

In [89]:
from src.configuration import Configuration
from src.fetch_data import DataFetcher
from src.dim import Dims
from src.facts import Facts
import pandas as pd
import time
import datetime
from pyspark.sql import functions as f 
from pyspark.sql import  SparkSession, Window




pd.set_option('display.max_columns', None)



# 2. Variables

In [83]:
config = Configuration('src/config.cfg')
fetcher = DataFetcher(config)
DIMS = Dims(config, fetcher)
FACTS = Facts(config, fetcher)
spark = SparkSession.builder.appName("MojaSesja").master("local").getOrCreate()



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/11 12:30:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# 3. Get last  results of an race per driver.

In [14]:
fact_session_results = FACTS.fact_session_results()
dim_session = DIMS.dim_sessions()
dim_driver_number = DIMS.dim_driver_number()

In [13]:
dim_session_race = dim_session[dim_session['session_name'] == "Race"]
dim_session_race.head()

Unnamed: 0,session_key,location,date_start,date_end,session_name,country_code,country_name,year,is_current_season,key
1,7953,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
3,7779,Jeddah,2023-03-19,2023-03-19,Race,KSA,Saudi Arabia,2023,0,Jeddah2023
5,7787,Melbourne,2023-04-02,2023-04-02,Race,AUS,Australia,2023,0,Melbourne2023
9,9070,Baku,2023-04-30,2023-04-30,Race,AZE,Azerbaijan,2023,0,Baku2023
11,9078,Miami,2023-05-07,2023-05-07,Race,USA,United States,2023,0,Miami2023


In [19]:
fact_session_race_results = fact_session_results.merge(
    dim_session_race,
    on="session_key",
    how="inner"
)

In [20]:
fact_session_race_results.head()

Unnamed: 0,position,driver_number,number_of_laps,dnf,dns,dsq,duration,gap_to_leader,meeting_key,session_key,points,location,date_start,date_end,session_name,country_code,country_name,year,is_current_season,key
0,1.0,1,57.0,False,False,False,5636.736,0.0,1141,7953,25.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
1,2.0,11,57.0,False,False,False,5648.723,11.987,1141,7953,18.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
2,3.0,14,57.0,False,False,False,5675.373,38.637,1141,7953,15.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
3,4.0,55,57.0,False,False,False,5684.788,48.052,1141,7953,12.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
4,5.0,44,57.0,False,False,False,5687.713,50.977,1141,7953,10.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023


In [36]:
fact_session_race_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1277 entries, 0 to 1276
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   position           1137 non-null   float64
 1   driver_number      1277 non-null   int64  
 2   number_of_laps     1272 non-null   float64
 3   dnf                1277 non-null   bool   
 4   dns                1277 non-null   bool   
 5   dsq                1277 non-null   bool   
 6   duration           838 non-null    object 
 7   gap_to_leader      1118 non-null   object 
 8   meeting_key        1277 non-null   int64  
 9   session_key        1277 non-null   int64  
 10  points             1277 non-null   float64
 11  location           1277 non-null   object 
 12  date_start         1277 non-null   object 
 13  date_end           1277 non-null   object 
 14  session_name       1277 non-null   object 
 15  country_code       1277 non-null   object 
 16  country_name       1277 

In [38]:
fact_session_race_results['position'] = fact_session_race_results['position'].fillna(value=21)

In [41]:
fact_session_race_results = fact_session_race_results.sort_values(['date_end','driver_number'], ascending=False)

In [42]:
fact_session_race_results

Unnamed: 0,position,driver_number,number_of_laps,dnf,dns,dsq,duration,gap_to_leader,meeting_key,session_key,points,location,date_start,date_end,session_name,country_code,country_name,year,is_current_season,key
1245,9.0,87,62.0,False,False,False,6115.894,93.527,1270,9896,2.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025
1240,4.0,81,62.0,False,False,False,6030.513,8.146,1270,9896,12.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025
1237,1.0,63,62.0,False,False,False,6022.367,0,1270,9896,25.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025
1246,10.0,55,61.0,False,False,False,,+1 LAP,1270,9896,1.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025
1244,8.0,44,62.0,False,False,False,6107.618,85.251,1270,9896,4.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2.0,11,57.0,False,False,False,5648.723,11.987,1141,7953,18.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
8,9.0,10,57.0,False,False,False,5710.489,73.753,1141,7953,2.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
16,17.0,4,55.0,False,False,False,,+2 LAPS,1141,7953,0.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023
11,12.0,2,56.0,False,False,False,,+1 LAP,1141,7953,0.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023


In [52]:
fact_session_race_results



Unnamed: 0,position,driver_number,number_of_laps,dnf,dns,dsq,duration,gap_to_leader,meeting_key,session_key,points,location,date_start,date_end,session_name,country_code,country_name,year,is_current_season,key,last_race_1,last_race_2,last_race_3,last_race_4,last_race_5
1245,9.0,87,62.0,False,False,False,6115.894,93.527,1270,9896,2.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
1240,4.0,81,62.0,False,False,False,6030.513,8.146,1270,9896,12.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
1237,1.0,63,62.0,False,False,False,6022.367,0,1270,9896,25.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
1246,10.0,55,61.0,False,False,False,,+1 LAP,1270,9896,1.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
1244,8.0,44,62.0,False,False,False,6107.618,85.251,1270,9896,4.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2.0,11,57.0,False,False,False,5648.723,11.987,1141,7953,18.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,
8,9.0,10,57.0,False,False,False,5710.489,73.753,1141,7953,2.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,
16,17.0,4,55.0,False,False,False,,+2 LAPS,1141,7953,0.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,
11,12.0,2,56.0,False,False,False,,+1 LAP,1141,7953,0.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,


In [53]:
fact_session_race_results.reset_index()

Unnamed: 0,index,position,driver_number,number_of_laps,dnf,dns,dsq,duration,gap_to_leader,meeting_key,session_key,points,location,date_start,date_end,session_name,country_code,country_name,year,is_current_season,key,last_race_1,last_race_2,last_race_3,last_race_4,last_race_5
0,1245,9.0,87,62.0,False,False,False,6115.894,93.527,1270,9896,2.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
1,1240,4.0,81,62.0,False,False,False,6030.513,8.146,1270,9896,12.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
2,1237,1.0,63,62.0,False,False,False,6022.367,0,1270,9896,25.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
3,1246,10.0,55,61.0,False,False,False,,+1 LAP,1270,9896,1.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
4,1244,8.0,44,62.0,False,False,False,6107.618,85.251,1270,9896,4.0,Marina Bay,2025-10-05,2025-10-05,Race,SGP,Singapore,2025,1,Marina Bay2025,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1272,1,2.0,11,57.0,False,False,False,5648.723,11.987,1141,7953,18.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,
1273,8,9.0,10,57.0,False,False,False,5710.489,73.753,1141,7953,2.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,
1274,16,17.0,4,55.0,False,False,False,,+2 LAPS,1141,7953,0.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,
1275,11,12.0,2,56.0,False,False,False,,+1 LAP,1141,7953,0.0,Sakhir,2023-03-05,2023-03-05,Race,BRN,Bahrain,2023,0,Sakhir2023,,,,,


In [85]:
fact_session_race_results[['driver_number','position','session_key','date_end']]



Unnamed: 0,driver_number,position,session_key,date_end
1245,87,9.0,9896,2025-10-05
1240,81,4.0,9896,2025-10-05
1237,63,1.0,9896,2025-10-05
1246,55,10.0,9896,2025-10-05
1244,44,8.0,9896,2025-10-05
...,...,...,...,...
1,11,2.0,7953,2023-03-05
8,10,9.0,7953,2023-03-05
16,4,17.0,7953,2023-03-05
11,2,12.0,7953,2023-03-05


In [87]:
spark_df = spark.createDataFrame(fact_session_race_results[['driver_number','position','session_key','date_end']])

In [88]:
spark_df.show()

                                                                                

+-------------+--------+-----------+----------+
|driver_number|position|session_key|  date_end|
+-------------+--------+-----------+----------+
|           87|     9.0|       9896|2025-10-05|
|           81|     4.0|       9896|2025-10-05|
|           63|     1.0|       9896|2025-10-05|
|           55|    10.0|       9896|2025-10-05|
|           44|     8.0|       9896|2025-10-05|
|           43|    16.0|       9896|2025-10-05|
|           31|    18.0|       9896|2025-10-05|
|           30|    15.0|       9896|2025-10-05|
|           27|    20.0|       9896|2025-10-05|
|           23|    14.0|       9896|2025-10-05|
|           22|    12.0|       9896|2025-10-05|
|           18|    13.0|       9896|2025-10-05|
|           16|     6.0|       9896|2025-10-05|
|           14|     7.0|       9896|2025-10-05|
|           12|     5.0|       9896|2025-10-05|
|           10|    19.0|       9896|2025-10-05|
|            6|    11.0|       9896|2025-10-05|
|            5|    17.0|       9896|2025

In [None]:
window = Window.partitionBy("driver_number").orderBy(f.col("date_end")).rowsBetween(-5, -1)
df_with_last5 = spark_df.withColumn(
    "last_5_positions",
    f.collect_list("position").over(window)
)

for i in range(5):
    df_with_last5 = df_with_last5.withColumn(
        f"last_pos_{i+1}",
        f.expr(f"element_at(last_5_positions, {i+1})")
    )

# Wyświetlenie wyniku
df_with_last5.show(truncate=False)

+-------------+--------+-----------+----------+-------------------------+----------+----------+----------+----------+----------+
|driver_number|position|session_key|date_end  |last_5_positions         |last_pos_1|last_pos_2|last_pos_3|last_pos_4|last_pos_5|
+-------------+--------+-----------+----------+-------------------------+----------+----------+----------+----------+----------+
|1            |1.0     |7953       |2023-03-05|[]                       |NULL      |NULL      |NULL      |NULL      |NULL      |
|1            |2.0     |7779       |2023-03-19|[1.0]                    |1.0       |NULL      |NULL      |NULL      |NULL      |
|1            |1.0     |7787       |2023-04-02|[1.0, 2.0]               |1.0       |2.0       |NULL      |NULL      |NULL      |
|1            |2.0     |9070       |2023-04-30|[1.0, 2.0, 1.0]          |1.0       |2.0       |1.0       |NULL      |NULL      |
|1            |1.0     |9078       |2023-05-07|[1.0, 2.0, 1.0, 2.0]     |1.0       |2.0       |1.

In [107]:
df_with_last5_pd = df_with_last5.toPandas()

In [108]:
df_with_last5_pd.sort_values('date_end',ascending=False)

Unnamed: 0,driver_number,position,session_key,date_end,last_5_positions,last_pos_1,last_pos_2,last_pos_3,last_pos_4,last_pos_5
1276,87,9.0,9896,2025-10-05,"[11.0, 21.0, 6.0, 12.0, 12.0]",11.0,21.0,6.0,12.0,12.0
1258,81,4.0,9896,2025-10-05,"[1.0, 2.0, 1.0, 3.0, 21.0]",1.0,2.0,1.0,3.0,21.0
486,16,6.0,9896,2025-10-05,"[3.0, 4.0, 21.0, 4.0, 9.0]",3.0,4.0,21.0,4.0,9.0
63,1,2.0,9896,2025-10-05,"[4.0, 9.0, 2.0, 1.0, 1.0]",4.0,9.0,2.0,1.0,1.0
730,23,14.0,9896,2025-10-05,"[6.0, 15.0, 5.0, 7.0, 13.0]",6.0,15.0,5.0,7.0,13.0
...,...,...,...,...,...,...,...,...,...,...
777,27,15.0,7953,2023-03-05,[],,,,,
731,24,16.0,7953,2023-03-05,[],,,,,
487,18,6.0,7953,2023-03-05,[],,,,,
1195,81,21.0,7953,2023-03-05,[],,,,,


In [109]:
df_with_last5_pd[df_with_last5_pd['driver_number']==87]

Unnamed: 0,driver_number,position,session_key,date_end,last_5_positions,last_pos_1,last_pos_2,last_pos_3,last_pos_4,last_pos_5
1259,87,14.0,9693,2025-03-16,[],,,,,
1260,87,8.0,9998,2025-03-23,[14.0],14.0,,,,
1261,87,10.0,10006,2025-04-06,"[14.0, 8.0]",14.0,8.0,,,
1262,87,10.0,10014,2025-04-13,"[14.0, 8.0, 10.0]",14.0,8.0,10.0,,
1263,87,13.0,10022,2025-04-20,"[14.0, 8.0, 10.0, 10.0]",14.0,8.0,10.0,10.0,
1264,87,21.0,10033,2025-05-04,"[14.0, 8.0, 10.0, 10.0, 13.0]",14.0,8.0,10.0,10.0,13.0
1265,87,17.0,9987,2025-05-18,"[8.0, 10.0, 10.0, 13.0, 21.0]",8.0,10.0,10.0,13.0,21.0
1266,87,12.0,9979,2025-05-25,"[10.0, 10.0, 13.0, 21.0, 17.0]",10.0,10.0,13.0,21.0,17.0
1267,87,17.0,9971,2025-06-01,"[10.0, 13.0, 21.0, 17.0, 12.0]",10.0,13.0,21.0,17.0,12.0
1268,87,11.0,9963,2025-06-15,"[13.0, 21.0, 17.0, 12.0, 17.0]",13.0,21.0,17.0,12.0,17.0
