In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Kshitij\\Downloads\\F1_CICD\\F1_ML_Ops_CI-CD\\research'

In [3]:
# Step up from the research/ folder to the project root so that the relative
# config and artifact paths used by later cells resolve.
# Guarded so that re-running this cell does not climb one more directory each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Kshitij\\Downloads\\F1_CICD\\F1_ML_Ops_CI-CD'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only settings consumed by the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Directory associated with this stage's artifacts.
    root_dir: Path
    # Location of the input data for this stage.
    data_path: Path

In [7]:
from F1_Stint_Prediction.constants import *
from F1_Stint_Prediction.utils.common import read_yaml,create_directories

In [12]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects.

    The config, params and schema files are each read with ``read_yaml`` and
    kept on the instance as ``config``, ``params`` and ``schema``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the top-level artifacts directory exists before any stage runs.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the main configuration.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        # The dataclass declares both fields as Path; convert the raw YAML
        # values so the objects match their annotations.
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
        )

In [13]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd
from F1_Stint_Prediction import logger
from sklearn.preprocessing import LabelEncoder

In [None]:
class DataTransformation:
    """Turns the stint-level CSV into train/test CSV splits for three targets.

    The three targets are the number of stints per driver-race, the encoded
    tyre compound per stint position, and the stint length per stint position.
    All outputs are written into ``config.root_dir``.
    """

    # Columns that identify one driver's race.
    RACE_KEYS = ['EventName', 'EventYear', 'Driver']
    # Stint positions that each get their own compound / stint-length dataset.
    STINT_NUMBERS = (1, 2, 3, 4)
    # Rows are kept only when num_stints != 1 and num_stints < MAX_STINTS_EXCLUSIVE.
    MAX_STINTS_EXCLUSIVE = 5
    # Rows are kept only when MIN_STINT_LEN_EXCLUSIVE < StintLen < MAX_STINT_LEN_EXCLUSIVE.
    MIN_STINT_LEN_EXCLUSIVE = 5
    MAX_STINT_LEN_EXCLUSIVE = 35

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (``data_path`` is read, ``root_dir`` is written to)."""
        self.config = config

    def get_transformed_data(self):
        """Read the input CSV, engineer features and write every train/test split.

        Writes, through ``train_test_spliting``:
          * one split for the stint-count target (one row per driver-race),
          * one split per stint position 1-4 for the compound target,
          * one split per stint position 1-4 for the stint-length target.

        Returns:
            None. The results are the CSV files written to ``config.root_dir``.
        """
        df = pd.read_csv(self.config.data_path)

        # Number each row within its driver-race group (1, 2, ...) in file order.
        df['stint_num'] = df.groupby(['EventName', 'RoundNumber', 'EventYear', 'Team', 'Driver']).cumcount() + 1

        # Total number of stint rows per driver-race, attached to every row.
        # groupby.size() replaces a groupby.apply that produced a pandas warning
        # and built list columns that nothing downstream read.
        stint_counts = df.groupby(self.RACE_KEYS).size().reset_index(name='num_stints')
        merged_data = pd.merge(df, stint_counts, on=self.RACE_KEYS)

        # Drop single-stint races. The explicit copy lets the column assignments
        # below act on an independent frame rather than on a filtered view.
        merged_data = merged_data[merged_data['num_stints'] != 1].copy()

        # Encode categorical variables. The encoders are fitted here, before the
        # remaining filters, so the integer codes cover every event/compound
        # still present at this point.
        # NOTE(review): the fitted encoders are not saved anywhere in this class;
        # confirm whether later stages need them to decode predictions.
        le_event = LabelEncoder()
        le_compound = LabelEncoder()
        merged_data['EventEncoded'] = le_event.fit_transform(merged_data['EventName'])
        merged_data['CompoundEncoded'] = le_compound.fit_transform(merged_data['Compound'])

        # Keep races with fewer than MAX_STINTS_EXCLUSIVE stints and stints whose
        # length lies strictly between the two length bounds.
        keep_rows = (
            (merged_data['num_stints'] < self.MAX_STINTS_EXCLUSIVE)
            & (merged_data['StintLen'] > self.MIN_STINT_LEN_EXCLUSIVE)
            & (merged_data['StintLen'] < self.MAX_STINT_LEN_EXCLUSIVE)
        )
        merged_data = merged_data[keep_rows].copy()

        # Temporal features. They are computed after the filters above, so the
        # "previous" stint is the previous *retained* row of the same driver-race,
        # and cumulative_laps includes the current row's StintLen.
        stint_len_by_race = merged_data.groupby(self.RACE_KEYS)['StintLen']
        prev_stint_length = stint_len_by_race.shift(1)
        cumulative_laps = stint_len_by_race.cumsum()
        merged_data['prev_stint_length'] = prev_stint_length
        merged_data['cumulative_laps'] = cumulative_laps
        # Replaces every NaN in the frame with 0 (including the missing
        # prev_stint_length of each group's first retained row).
        merged_data = merged_data.fillna(0)

        features_stint_num = ['CircuitLength', 'DesignedLaps','TrackTemp', 'AirTemp','EventEncoded']
        features_stint_compound = ['CircuitLength', 'cumulative_laps', 'TrackTemp', 'AirTemp','stint_num','EventEncoded','Humidity', 'Rainfall','SafetyCar']
        features_stint_length = ['CircuitLength', 'TrackTemp', 'AirTemp','prev_stint_length','EventEncoded','DegradationSlope', 'DegradationBias','DesignedLaps','Humidity', 'Rainfall','SafetyCar']

        target_total_stints = 'num_stints'
        target_compound = 'CompoundEncoded'
        target_stint_len = 'StintLen'

        # Stint-count dataset: the first retained row of each driver-race.
        race_rows = merged_data.drop_duplicates(subset=self.RACE_KEYS)
        # NOTE(review): "stitn_count" is misspelled but is part of the output file
        # names; left unchanged in case other code reads those files by name.
        self.train_test_spliting(
            race_rows[features_stint_num],
            race_rows[target_total_stints],
            "stitn_count",
            0,
        )

        # Per-stint datasets: all compound splits first, then all stint-length splits.
        per_stint_tasks = (
            ("compound", features_stint_compound, target_compound),
            ("stint_len", features_stint_length, target_stint_len),
        )
        for task_name, feature_cols, target_col in per_stint_tasks:
            for stint in self.STINT_NUMBERS:
                stint_rows = merged_data[merged_data['stint_num'] == stint]
                self.train_test_spliting(
                    stint_rows[feature_cols],
                    stint_rows[target_col],
                    task_name,
                    stint,
                )

    def train_test_spliting(self, X, y, name, num, test_size=0.2, random_state=42):
        """Split ``X``/``y`` into train and test parts and write them as CSV files.

        Files are written to ``config.root_dir`` as
        ``X_train_{name}_{num}.csv``, ``X_test_{name}_{num}.csv``,
        ``y_train_{name}_{num}.csv`` and ``y_test_{name}_{num}.csv``, without the
        index column. The shape of each part is logged and printed.

        Args:
            X: Feature frame.
            y: Target values aligned with ``X``.
            name: Dataset label used in the file names.
            num: Numeric suffix used in the file names.
            test_size: Fraction of rows placed in the test part.
            random_state: Seed passed to ``train_test_split``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Insertion order fixes both the write order and the reporting order.
        parts = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
        }
        for prefix, part in parts.items():
            part.to_csv(os.path.join(self.config.root_dir, f"{prefix}_{name}_{num}.csv"), index=False)

        logger.info("Splited data into training and test sets")
        for part in parts.values():
            logger.info(part.shape)

        for part in parts.values():
            print(part.shape)

In [20]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage settings, then write all train/test splits.
# No try/except here: the previous handler only re-raised the same exception,
# so errors propagate exactly as before with a shorter traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.get_transformed_data()

[2025-03-24 20:16:21,387: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-24 20:16:21,387: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-24 20:16:21,395: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-24 20:16:21,395: INFO: common: created directory at: artifacts]
[2025-03-24 20:16:21,404: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-24 20:16:21,805: INFO: 3571002178: Splited data into training and test sets]
[2025-03-24 20:16:21,805: INFO: 3571002178: (584, 5)]
[2025-03-24 20:16:21,805: INFO: 3571002178: (584,)]
[2025-03-24 20:16:21,805: INFO: 3571002178: (147, 5)]
[2025-03-24 20:16:21,805: INFO: 3571002178: (147,)]
(584, 5)
(584,)
(147, 5)
(147,)
[2025-03-24 20:16:21,852: INFO: 3571002178: Splited data into training and test sets]
[2025-03-24 20:16:21,852: INFO: 3571002178: (453, 9)]
[2025-03-24 20:16:21,860: INFO: 3571002178: (453,)]
[2025-03-24 20:16:21,860: INFO: 3571002178: (114, 9)]

  strategy_data = grouped.apply(lambda x: pd.Series({


[2025-03-24 20:16:21,948: INFO: 3571002178: (64, 9)]
[2025-03-24 20:16:21,952: INFO: 3571002178: (64,)]
[2025-03-24 20:16:21,952: INFO: 3571002178: (16, 9)]
[2025-03-24 20:16:21,954: INFO: 3571002178: (16,)]
(64, 9)
(64,)
(16, 9)
(16,)
[2025-03-24 20:16:21,993: INFO: 3571002178: Splited data into training and test sets]
[2025-03-24 20:16:22,001: INFO: 3571002178: (453, 11)]
[2025-03-24 20:16:22,001: INFO: 3571002178: (453,)]
[2025-03-24 20:16:22,007: INFO: 3571002178: (114, 11)]
[2025-03-24 20:16:22,010: INFO: 3571002178: (114,)]
(453, 11)
(453,)
(114, 11)
(114,)
[2025-03-24 20:16:22,042: INFO: 3571002178: Splited data into training and test sets]
[2025-03-24 20:16:22,043: INFO: 3571002178: (376, 11)]
[2025-03-24 20:16:22,043: INFO: 3571002178: (376,)]
[2025-03-24 20:16:22,048: INFO: 3571002178: (94, 11)]
[2025-03-24 20:16:22,052: INFO: 3571002178: (94,)]
(376, 11)
(376,)
(94, 11)
(94,)
[2025-03-24 20:16:22,102: INFO: 3571002178: Splited data into training and test sets]
[2025-03-24 20