In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)

In [144]:
df = pd.read_csv('../Data/train.csv')
df.columns = [x.title().strip() for x in df.columns]
df = df.dropna(subset=['S1'])

drop_cols = ['S1_Large', 'S2_Large', 'S3_Large']

In [145]:
df['Event'].value_counts()

for index, row in df.iterrows():
    number = str(row['Number'])
    location_number = row['Location'][-1:]
    if row['Event'] == 'Free Practice 1':
        event = 'FP1'
    elif row['Event'] == 'Free Practice 2':
        event = 'FP2'
    elif row['Event'] == 'Free Practice 3':
        event = 'FP3'
    elif row['Event'] == 'Qualifying Group 1':
        event = 'QG1'
    elif row['Event'] == 'Qualifying Group 2':
        event = 'QG2'
    elif row['Event'] == 'Qualifying Group 3':
        event = 'QG3'
    elif row['Event'] == 'Qualifying Group 4':
        event = 'QG4'
    df.at[index, 'Trial_ID'] = event + '-' + location_number + '-' + number

In [146]:
for x in df['Trial_ID'].unique():
    temp = df.loc[df['Trial_ID'] == x]
    
    lap_number_previous = 1
    trial_identifier = 1
    
    for index, row in temp.iterrows():
        if row['Lap_Number'] >= lap_number_previous:
            df.at[index, 'Trial_Number'] = trial_identifier
            lap_number_previous += 1
        elif row['Lap_Number'] < lap_number_previous:
            trial_identifier += 1
            df.at[index, 'Trial_Number'] = trial_identifier
            lap_number_previous = 1
            
df['Trial_ID_2'] = df['Trial_ID'] + '-' + df['Trial_Number'].astype(int).astype(str)

In [147]:
def TimeConversion(x):
    x = str(x)
    if x != 'nan':
        try:
            y = datetime.strptime(x, '%M:%S.%f').time()
        except ValueError:
            try:
                y = datetime.strptime(x, '%S.%f').time()
            except ValueError:
                try:
                    y = datetime.strptime(x, '%S').time()
                except ValueError:
                    y = datetime.strptime('0', '%S').time()
    if x == 'nan':
        y = datetime.strptime('0', '%S').time()
    z = timedelta(minutes=y.minute, seconds=y.second, microseconds=y.microsecond)
    return z

In [148]:
time_cols = [
    'S1',
    'S2',
    'S3',
    'Elapsed',
    'Hour',
    'S1_Large',
    'S2_Large',
    'S3_Large',
    'Pit_Time',
]

daytime_cols = ['Hour']

for x in time_cols:
    df[x] = df[x].apply(TimeConversion)

In [149]:
for x in df['Trial_ID_2']:
    temp = df.loc[df['Trial_ID_2'] == x]
    laps = len(temp)
    pit_time = timedelta(0)
    for index, row in temp.iterrows():
        if ~pd.isna(row['Pit_Time']):
            pit_amount = row['Pit_Time']
            pit_time += pit_amount
            
    df.loc[df['Trial_ID_2'] == x, 'Pit_Time'] = pit_time / laps

In [150]:
test = df[df['Trial_ID_2'] == 'FP2-2-10-1']
test

Unnamed: 0,Number,Driver_Number,Lap_Number,Lap_Time,Lap_Improvement,Crossing_Finish_Line_In_Pit,S1,S1_Improvement,S2,S2_Improvement,S3,S3_Improvement,Kph,Elapsed,Hour,S1_Large,S2_Large,S3_Large,Driver_Name,Pit_Time,Group,Team,Power,Location,Event,Trial_ID,Trial_Number,Trial_ID_2
0,10,1,1,92,0,,00:05:43.300000,0,00:00:35.427000,0,00:00:43.313000,0,28.8,00:07:02,00:22:02,00:05:43.300000,00:00:35.400000,00:00:43.300000,SB,00:01:44.900000,,JR,,Location 2,Free Practice 2,FP2-2-10,1.0,FP2-2-10-1
1,10,1,2,87,2,,00:00:25.674000,2,00:00:33.399000,2,00:00:41.922000,2,120.5,00:08:43,00:23:43,00:00:25.700000,00:00:33.400000,00:00:41.900000,SB,00:01:44.900000,,JR,,Location 2,Free Practice 2,FP2-2-10,1.0,FP2-2-10-1
2,10,1,3,73,0,B,00:00:28.129000,0,00:00:34.091000,0,00:00:57.248000,0,101.9,00:10:42.500000,00:25:42.500000,00:00:28.100000,00:00:34.100000,00:00:57.200000,SB,00:01:44.900000,,JR,,Location 2,Free Practice 2,FP2-2-10,1.0,FP2-2-10-1


In [151]:
df = df.drop(columns=[
    'S1_Large',
    'S2_Large',
    'S3_Large',
    'Trial_Number',
    'Trial_ID',
    'Number',
    'Driver_Number',
    'Crossing_Finish_Line_In_Pit'
])

In [152]:
def ConvertToSeconds(x):
    y = x.total_seconds()
    return y

df['S1'] = df['S1'].apply(ConvertToSeconds)
df['S2'] = df['S2'].apply(ConvertToSeconds)
df['S3'] = df['S3'].apply(ConvertToSeconds)
df['Pit_Time'] = df['Pit_Time'].apply(ConvertToSeconds)

df['Time_Minutes'] = [(x.total_seconds() / 60) for x in df['Hour']]

In [153]:
wdf_train = pd.read_csv('../Data/train_weather.csv')
wdf_test = pd.read_csv('../Data/test_weather.csv')
wdf_test = wdf_test.rename(columns={'EVENTS': 'EVENT'})
wdf_test['RAIN'] = wdf_test['RAIN'].astype(str)

wdf = pd.concat([wdf_train, wdf_test])

wdf['TIME_UTC_STR'] = pd.to_datetime(wdf['TIME_UTC_STR'], dayfirst=True)
wdf['TIME_UTC_MINUTE'] = [x.minute for x in wdf['TIME_UTC_STR']]

num_cols = ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']
wdf['RAIN'] = [x.replace(',', '.') for x in wdf['RAIN']]

for col in num_cols:
    wdf[col] = wdf[col].str.replace(',', '').replace('.', '').astype(float)

wdf.dtypes

for index, row in df.iterrows():
    location = row['Location']
    event = row['Event']
    time = row['Time_Minutes']
    
    try:
        weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event) & (wdf['TIME_UTC_MINUTE'] <= time) & (time < (wdf['TIME_UTC_MINUTE'] + 1))]
        weather = weather.iloc[0]
        df.at[index, 'Air_Temp'] = weather['AIR_TEMP']
        df.at[index, 'Track_Temp'] = weather['TRACK_TEMP']
        df.at[index, 'Humidity'] = weather['HUMIDITY']
        df.at[index, 'Pressure'] = weather['PRESSURE']
        df.at[index, 'Wind_Speed'] = weather['WIND_SPEED']
        df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION']
        df.at[index, 'Rain'] = weather['RAIN']
    except IndexError:
        weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event)]
        if not weather.empty:
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
            df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
            df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
            df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
        else:
            weather = wdf.loc[wdf['LOCATION'] == location]
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
            df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
            df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
            df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
    

df['Power'] = df['Power'].fillna(df['Power'].mode()[0])
df['Kph'] = df['Kph'].fillna(df['Kph'].mean())
df = df.drop(columns=['Group', 'Hour', 'Trial_ID_2', 'Time_Minutes'])

In [154]:
df

Unnamed: 0,Lap_Number,Lap_Time,Lap_Improvement,S1,S1_Improvement,S2,S2_Improvement,S3,S3_Improvement,Kph,Elapsed,Hour,Driver_Name,Pit_Time,Team,Power,Location,Event,Trial_ID_2,Time_Minutes,Air_Temp,Track_Temp,Humidity,Pressure,Wind_Speed,Wind_Direction,Rain
0,1,92,0,343.300,0,35.427,0,43.313,0,28.8,00:07:02,00:22:02,SB,104.900000,JR,250.0,Location 2,Free Practice 2,FP2-2-10-1,22.033333,15.0556,18.6,60.0,1018.25,3.18280,175.0,-1.0
1,2,87,2,25.674,2,33.399,2,41.922,2,120.5,00:08:43,00:23:43,SB,104.900000,JR,250.0,Location 2,Free Practice 2,FP2-2-10-1,23.716667,15.0556,18.7,60.0,1018.25,4.24374,161.0,-1.0
2,3,73,0,28.129,0,34.091,0,57.248,0,101.9,00:10:42.500000,00:25:42.500000,SB,104.900000,JR,250.0,Location 2,Free Practice 2,FP2-2-10-1,25.708333,15.0556,18.7,60.0,1018.22,3.18280,148.0,-1.0
3,1,73,0,65.000,0,38.416,0,56.833,0,75.9,00:02:40.200000,00:17:40.200000,LGRA,8.250000,AD,250.0,Location 2,Free Practice 2,FP2-2-11-1,17.670000,15.0556,18.5,60.0,1018.12,2.12187,157.0,-1.0
4,2,73,0,28.013,0,36.743,0,44.716,0,111.2,00:04:29.700000,00:19:29.700000,LGRA,8.250000,AD,250.0,Location 2,Free Practice 2,FP2-2-11-1,19.495000,15.1111,18.5,60.0,1018.15,3.18280,149.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10271,17,84,0,22.936,0,21.231,0,23.701,0,124.9,00:23:54.700000,00:23:54.700000,PWEHRL,12.580952,TAG,250.0,Location 8,Free Practice 1,FP1-8-99-2,23.911667,2973.0000,36.0,3847.0,101409.00,45.00000,171.0,0.0
10272,18,70,0,23.610,0,22.432,0,30.281,0,111.1,00:25:11,00:25:11,PWEHRL,12.580952,TAG,235.0,Location 8,Free Practice 1,FP1-8-99-2,25.183333,2986.0000,361.0,3834.0,101413.00,0.00000,202.0,0.0
10273,19,70,0,137.100,0,22.681,0,24.308,0,46.1,00:28:15.100000,00:28:15.100000,PWEHRL,12.580952,TAG,250.0,Location 8,Free Practice 1,FP1-8-99-2,28.251667,3005.0000,362.0,383.0,101419.00,1391.00000,203.0,0.0
10274,20,155,2,22.539,2,21.057,2,23.548,2,126.3,00:29:22.300000,00:29:22.300000,PWEHRL,12.580952,TAG,250.0,Location 8,Free Practice 1,FP1-8-99-2,29.371667,2983.0000,362.0,3859.0,101416.00,254.00000,273.0,0.0


## Function

In [15]:
def DataProcessing(csv_url):
    df = pd.read_csv(csv_url)
    df.columns = [x.title().strip() for x in df.columns]
    df = df.dropna(subset=['S1'])

    drop_cols = ['S1_Large', 'S2_Large', 'S3_Large']

    df['Event'].value_counts()

    for index, row in df.iterrows():
        number = str(row['Number'])
        location_number = row['Location'][-1:]
        if row['Event'] == 'Free Practice 1':
            event = 'FP1'
        elif row['Event'] == 'Free Practice 2':
            event = 'FP2'
        elif row['Event'] == 'Free Practice 3':
            event = 'FP3'
        elif row['Event'] == 'Qualifying Group 1':
            event = 'QG1'
        elif row['Event'] == 'Qualifying Group 2':
            event = 'QG2'
        elif row['Event'] == 'Qualifying Group 3':
            event = 'QG3'
        elif row['Event'] == 'Qualifying Group 4':
            event = 'QG4'
        df.at[index, 'Trial_ID'] = event + '-' + location_number + '-' + number

    for x in df['Trial_ID'].unique():
        temp = df.loc[df['Trial_ID'] == x]

        lap_number_previous = 1
        trial_identifier = 1

        for index, row in temp.iterrows():
            if row['Lap_Number'] >= lap_number_previous:
                df.at[index, 'Trial_Number'] = trial_identifier
                lap_number_previous += 1
            elif row['Lap_Number'] < lap_number_previous:
                trial_identifier += 1
                df.at[index, 'Trial_Number'] = trial_identifier
                lap_number_previous = 1

    df['Trial_ID_2'] = df['Trial_ID'] + '-' + df['Trial_Number'].astype(int).astype(str)

    def TimeConversion(x):
        x = str(x)
        if x != 'nan':
            try:
                y = datetime.strptime(x, '%M:%S.%f').time()
            except ValueError:
                try:
                    y = datetime.strptime(x, '%S.%f').time()
                except ValueError:
                    try:
                        y = datetime.strptime(x, '%S').time()
                    except ValueError:
                        y = datetime.strptime('0', '%S').time()
        if x == 'nan':
            y = datetime.strptime('0', '%S').time()
        z = timedelta(minutes=y.minute, seconds=y.second, microseconds=y.microsecond)
        return z

    time_cols = [
        'S1',
        'S2',
        'S3',
        'Elapsed',
        'Hour',
        'S1_Large',
        'S2_Large',
        'S3_Large',
        'Pit_Time',
    ]

    daytime_cols = ['Hour']

    for x in time_cols:
        df[x] = df[x].apply(TimeConversion)

    for x in df['Trial_ID_2']:
        temp = df.loc[df['Trial_ID_2'] == x]
        laps = len(temp)
        pit_time = timedelta(0)
        for index, row in temp.iterrows():
            if ~pd.isna(row['Pit_Time']):
                pit_amount = row['Pit_Time']
                pit_time += pit_amount

        df.loc[df['Trial_ID_2'] == x, 'Pit_Time'] = pit_time / laps
    
    df = df.drop(columns=[
    'S1_Large',
    'S2_Large',
    'S3_Large',
    'Trial_Number',
    'Trial_ID',
    'Number',
    'Driver_Number',
    'Crossing_Finish_Line_In_Pit'
    ])
    
    def ConvertToSeconds(x):
        y = x.total_seconds()
        return y

    df['S1'] = df['S1'].apply(ConvertToSeconds)
    df['S2'] = df['S2'].apply(ConvertToSeconds)
    df['S3'] = df['S3'].apply(ConvertToSeconds)
    df['Pit_Time'] = df['Pit_Time'].apply(ConvertToSeconds)
    df['Elapsed'] = df['Elapsed'].apply(ConvertToSeconds)
    
    df['Time_Minutes'] = [(x.total_seconds() / 60) for x in df['Hour']]
    
    wdf_train = pd.read_csv('../Data/train_weather.csv')
    wdf_test = pd.read_csv('../Data/test_weather.csv')
    wdf_test = wdf_test.rename(columns={'EVENTS': 'EVENT'})
    wdf_test['RAIN'] = wdf_test['RAIN'].astype(str)

    wdf = pd.concat([wdf_train, wdf_test])

    #split into location due to different number formats
    train_weather_l1 = wdf[wdf['LOCATION'].isin(['Location 1','Location 2','Location 3','Location 4'])]
    train_weather_l1['AIR_TEMP'] = train_weather_l1['AIR_TEMP'] .str.replace(',','.')
    train_weather_l1['AIR_TEMP'] = pd.to_numeric(train_weather_l1['AIR_TEMP'])
    train_weather_l1['TRACK_TEMP'] = train_weather_l1['TRACK_TEMP'] .str.replace(',','.')
    train_weather_l1['TRACK_TEMP'] = pd.to_numeric(train_weather_l1['TRACK_TEMP'])
    train_weather_l1['HUMIDITY'] = train_weather_l1['HUMIDITY'] .str.replace(',','.')
    train_weather_l1['HUMIDITY'] = pd.to_numeric(train_weather_l1['HUMIDITY'])
    train_weather_l1['PRESSURE'] = train_weather_l1['PRESSURE'] .str.replace(',','.')
    train_weather_l1['PRESSURE'] = pd.to_numeric(train_weather_l1['PRESSURE'])
    train_weather_l1['WIND_SPEED'] = train_weather_l1['WIND_SPEED'] .str.replace(',','.')
    train_weather_l1['WIND_SPEED'] = pd.to_numeric(train_weather_l1['WIND_SPEED'])
    train_weather_l1['RAIN'] = train_weather_l1['RAIN'].str.replace(',', '.')
    train_weather_l1['RAIN'] = pd.to_numeric(train_weather_l1['RAIN'])



    train_weather_l2 = wdf[wdf['LOCATION'].isin(['Location 5','Location 6','Location 7', 'Location 8'])]
    train_weather_l2['AIR_TEMP'] = train_weather_l2['AIR_TEMP'] .str.replace(',','')
    train_weather_l2['AIR_TEMP'] = pd.to_numeric(train_weather_l2['AIR_TEMP'], errors='coerce')
    conditions = [
        (train_weather_l2['AIR_TEMP'] > 100)  & (train_weather_l2['AIR_TEMP'] < 1000),
        (train_weather_l2['AIR_TEMP'] > 1000) & (train_weather_l2['AIR_TEMP'] < 10000),
        (train_weather_l2['AIR_TEMP'] > 10000) & (train_weather_l2['AIR_TEMP'] < 100000),
        (train_weather_l2['AIR_TEMP'] > 100000)]
    choices = [train_weather_l2['AIR_TEMP']/10,train_weather_l2['AIR_TEMP']/100,
               train_weather_l2['AIR_TEMP']/1000,train_weather_l2['AIR_TEMP']/10000]
    train_weather_l2['AIR_TEMP'] = np.select(conditions, choices, default=20)

    train_weather_l2['TRACK_TEMP'] = train_weather_l2['TRACK_TEMP'] .str.replace(',','.')
    train_weather_l2['TRACK_TEMP'] = pd.to_numeric(train_weather_l2['TRACK_TEMP'], errors='coerce')

    train_weather_l2['HUMIDITY'] = train_weather_l2['HUMIDITY'] .str.replace(',','.')
    train_weather_l2['HUMIDITY'] = pd.to_numeric(train_weather_l2['HUMIDITY'], errors='coerce')



    train_weather_l2['PRESSURE'] = train_weather_l2['PRESSURE'] .str.replace(',','')
    train_weather_l2['PRESSURE'] = pd.to_numeric(train_weather_l2['PRESSURE'], errors='coerce')
    conditions = [
        (train_weather_l2['PRESSURE'] > 10000) & (train_weather_l2['PRESSURE'] < 20000),
        (train_weather_l2['PRESSURE'] > 20000) & (train_weather_l2['PRESSURE'] < 200000),
        (train_weather_l2['PRESSURE'] > 200000)]

    choices = [train_weather_l2['PRESSURE']/10,
               train_weather_l2['PRESSURE']/100,
               train_weather_l2['PRESSURE']/1000]
    train_weather_l2['PRESSURE'] = np.select(conditions, choices, default=1000)


    train_weather_l2['WIND_SPEED'] = train_weather_l2['WIND_SPEED'] .str.replace(',','')
    train_weather_l2['WIND_SPEED'] = pd.to_numeric(train_weather_l2['WIND_SPEED'], errors='coerce')
    conditions = [
        (train_weather_l2['WIND_SPEED'] > 10) & (train_weather_l2['WIND_SPEED'] < 100),
        (train_weather_l2['WIND_SPEED'] > 100) & (train_weather_l2['WIND_SPEED'] < 1000),
        (train_weather_l2['WIND_SPEED'] > 1000) & (train_weather_l2['WIND_SPEED'] < 10000),
        (train_weather_l2['WIND_SPEED'] > 10000) & (train_weather_l2['WIND_SPEED'] < 100000),
        (train_weather_l2['WIND_SPEED'] > 100000)]

    choices = [train_weather_l2['WIND_SPEED']/10,
               train_weather_l2['WIND_SPEED']/100,
               train_weather_l2['WIND_SPEED']/1000,
               train_weather_l2['WIND_SPEED']/10000,
               train_weather_l2['WIND_SPEED']/100000,
               ]
    train_weather_l2['WIND_SPEED'] = np.select(conditions, choices, default=1)
    train_weather_l2['RAIN'] = train_weather_l2['RAIN'].str.replace(',', '.')
    train_weather_l2['RAIN'] = pd.to_numeric(train_weather_l2['RAIN'])

    wdf = pd.concat([train_weather_l1,train_weather_l2])

    wdf['TIME_UTC_STR'] = pd.to_datetime(wdf['TIME_UTC_STR'], dayfirst=True)
    wdf['TIME_UTC_MINUTE'] = [x.minute for x in wdf['TIME_UTC_STR']]

    num_cols = ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']

    for index, row in df.iterrows():
        location = row['Location']
        event = row['Event']
        time = row['Time_Minutes']

        try:
            weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event) & (wdf['TIME_UTC_MINUTE'] <= time) & (time < (wdf['TIME_UTC_MINUTE'] + 1))]
            weather = weather.iloc[0]
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP']
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP']
            df.at[index, 'Humidity'] = weather['HUMIDITY']
            df.at[index, 'Pressure'] = weather['PRESSURE']
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED']
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION']
            df.at[index, 'Rain'] = weather['RAIN']
        except IndexError:
            weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event)]
            if not weather.empty:
                df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
                df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
                df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
                df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
                df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
                df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
                df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
            else:
                weather = wdf.loc[wdf['LOCATION'] == location]
                df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
                df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
                df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
                df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
                df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
                df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
                df.at[index, 'Rain'] = weather['RAIN'].mode()[0]


    df['Power'] = df['Power'].fillna(df['Power'].mode()[0])
    df['Kph'] = df['Kph'].fillna(df['Kph'].mean())
    df = df.drop(columns=['Group', 'Hour', 'Trial_ID_2', 'Time_Minutes'])
    
    return df

In [10]:
wdf_train = pd.read_csv('../Data/train_weather.csv')
wdf_test = pd.read_csv('../Data/test_weather.csv')
wdf_test = wdf_test.rename(columns={'EVENTS': 'EVENT'})
wdf_test['RAIN'] = wdf_test['RAIN'].astype(str)

wdf = pd.concat([wdf_train, wdf_test])

#split into location due to different number formats
train_weather_l1 = wdf[wdf['LOCATION'].isin(['Location 1','Location 2','Location 3','Location 4'])]
train_weather_l1['AIR_TEMP'] = train_weather_l1['AIR_TEMP'] .str.replace(',','.')
train_weather_l1['AIR_TEMP'] = pd.to_numeric(train_weather_l1['AIR_TEMP'])
train_weather_l1['TRACK_TEMP'] = train_weather_l1['TRACK_TEMP'] .str.replace(',','.')
train_weather_l1['TRACK_TEMP'] = pd.to_numeric(train_weather_l1['TRACK_TEMP'])
train_weather_l1['HUMIDITY'] = train_weather_l1['HUMIDITY'] .str.replace(',','.')
train_weather_l1['HUMIDITY'] = pd.to_numeric(train_weather_l1['HUMIDITY'])
train_weather_l1['PRESSURE'] = train_weather_l1['PRESSURE'] .str.replace(',','.')
train_weather_l1['PRESSURE'] = pd.to_numeric(train_weather_l1['PRESSURE'])
train_weather_l1['WIND_SPEED'] = train_weather_l1['WIND_SPEED'] .str.replace(',','.')
train_weather_l1['WIND_SPEED'] = pd.to_numeric(train_weather_l1['WIND_SPEED'])
train_weather_l1['RAIN'] = train_weather_l1['RAIN'].str.replace(',', '.')
train_weather_l1['RAIN'] = pd.to_numeric(train_weather_l1['RAIN'])



train_weather_l2 = wdf[wdf['LOCATION'].isin(['Location 5','Location 6','Location 7', 'Location 8'])]
train_weather_l2['AIR_TEMP'] = train_weather_l2['AIR_TEMP'] .str.replace(',','')
train_weather_l2['AIR_TEMP'] = pd.to_numeric(train_weather_l2['AIR_TEMP'], errors='coerce')
conditions = [
    (train_weather_l2['AIR_TEMP'] > 100)  & (train_weather_l2['AIR_TEMP'] < 1000),
    (train_weather_l2['AIR_TEMP'] > 1000) & (train_weather_l2['AIR_TEMP'] < 10000),
    (train_weather_l2['AIR_TEMP'] > 10000) & (train_weather_l2['AIR_TEMP'] < 100000),
    (train_weather_l2['AIR_TEMP'] > 100000)]
choices = [train_weather_l2['AIR_TEMP']/10,train_weather_l2['AIR_TEMP']/100,
           train_weather_l2['AIR_TEMP']/1000,train_weather_l2['AIR_TEMP']/10000]
train_weather_l2['AIR_TEMP'] = np.select(conditions, choices, default=20)

train_weather_l2['TRACK_TEMP'] = train_weather_l2['TRACK_TEMP'] .str.replace(',','.')
train_weather_l2['TRACK_TEMP'] = pd.to_numeric(train_weather_l2['TRACK_TEMP'], errors='coerce')

train_weather_l2['HUMIDITY'] = train_weather_l2['HUMIDITY'] .str.replace(',','.')
train_weather_l2['HUMIDITY'] = pd.to_numeric(train_weather_l2['HUMIDITY'], errors='coerce')



train_weather_l2['PRESSURE'] = train_weather_l2['PRESSURE'] .str.replace(',','')
train_weather_l2['PRESSURE'] = pd.to_numeric(train_weather_l2['PRESSURE'], errors='coerce')
conditions = [
    (train_weather_l2['PRESSURE'] > 10000) & (train_weather_l2['PRESSURE'] < 20000),
    (train_weather_l2['PRESSURE'] > 20000) & (train_weather_l2['PRESSURE'] < 200000),
    (train_weather_l2['PRESSURE'] > 200000)]

choices = [train_weather_l2['PRESSURE']/10,
           train_weather_l2['PRESSURE']/100,
           train_weather_l2['PRESSURE']/1000]
train_weather_l2['PRESSURE'] = np.select(conditions, choices, default=1000)


train_weather_l2['WIND_SPEED'] = train_weather_l2['WIND_SPEED'] .str.replace(',','')
train_weather_l2['WIND_SPEED'] = pd.to_numeric(train_weather_l2['WIND_SPEED'], errors='coerce')
conditions = [
    (train_weather_l2['WIND_SPEED'] > 10) & (train_weather_l2['WIND_SPEED'] < 100),
    (train_weather_l2['WIND_SPEED'] > 100) & (train_weather_l2['WIND_SPEED'] < 1000),
    (train_weather_l2['WIND_SPEED'] > 1000) & (train_weather_l2['WIND_SPEED'] < 10000),
    (train_weather_l2['WIND_SPEED'] > 10000) & (train_weather_l2['WIND_SPEED'] < 100000),
    (train_weather_l2['WIND_SPEED'] > 100000)]

choices = [train_weather_l2['WIND_SPEED']/10,
           train_weather_l2['WIND_SPEED']/100,
           train_weather_l2['WIND_SPEED']/1000,
           train_weather_l2['WIND_SPEED']/10000,
           train_weather_l2['WIND_SPEED']/100000,
           ]
train_weather_l2['WIND_SPEED'] = np.select(conditions, choices, default=1)
train_weather_l2['RAIN'] = train_weather_l2['RAIN'].str.replace(',', '.')
train_weather_l2['RAIN'] = pd.to_numeric(train_weather_l2['RAIN'])

wdf = pd.concat([train_weather_l1,train_weather_l2])

wdf['TIME_UTC_STR'] = pd.to_datetime(wdf['TIME_UTC_STR'], dayfirst=True)
wdf['TIME_UTC_MINUTE'] = [x.minute for x in wdf['TIME_UTC_STR']]

num_cols = ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_weather_l1['AIR_TEMP'] = train_weather_l1['AIR_TEMP'] .str.replace(',','.')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_weather_l1['AIR_TEMP'] = pd.to_numeric(train_weather_l1['AIR_TEMP'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_weather_l1['TRACK_TEMP'] = train_weather_l

In [11]:
wdf['RAIN'].value_counts()

-1.00    795
 0.00    280
 0.01      3
 0.02      1
 0.03      1
 0.25      1
Name: RAIN, dtype: int64

In [16]:
DataProcessing('../Data/train.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_weather_l1['AIR_TEMP'] = train_weather_l1['AIR_TEMP'] .str.replace(',','.')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_weather_l1['AIR_TEMP'] = pd.to_numeric(train_weather_l1['AIR_TEMP'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_weather_l1['TRACK_TEMP'] = train_weather_l

Unnamed: 0,Lap_Number,Lap_Time,Lap_Improvement,S1,S1_Improvement,S2,S2_Improvement,S3,S3_Improvement,Kph,Elapsed,Driver_Name,Pit_Time,Team,Power,Location,Event,Air_Temp,Track_Temp,Humidity,Pressure,Wind_Speed,Wind_Direction,Rain
0,1,92,0,343.300,0,35.427,0,43.313,0,28.8,422.0,SB,104.900000,JR,250.0,Location 2,Free Practice 2,15.0556,18.6,60.00,1018.25,3.18280,175.0,-1.0
1,2,87,2,25.674,2,33.399,2,41.922,2,120.5,523.0,SB,104.900000,JR,250.0,Location 2,Free Practice 2,15.0556,18.7,60.00,1018.25,4.24374,161.0,-1.0
2,3,73,0,28.129,0,34.091,0,57.248,0,101.9,642.5,SB,104.900000,JR,250.0,Location 2,Free Practice 2,15.0556,18.7,60.00,1018.22,3.18280,148.0,-1.0
3,1,73,0,65.000,0,38.416,0,56.833,0,75.9,160.2,LGRA,8.250000,AD,250.0,Location 2,Free Practice 2,15.0556,18.5,60.00,1018.12,2.12187,157.0,-1.0
4,2,73,0,28.013,0,36.743,0,44.716,0,111.2,269.7,LGRA,8.250000,AD,250.0,Location 2,Free Practice 2,15.1111,18.5,60.00,1018.15,3.18280,149.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10271,17,84,0,22.936,0,21.231,0,23.701,0,124.9,1434.7,PWEHRL,12.580952,TAG,250.0,Location 8,Free Practice 1,29.7300,36.0,38.47,1014.09,4.50000,171.0,0.0
10272,18,70,0,23.610,0,22.432,0,30.281,0,111.1,1511.0,PWEHRL,12.580952,TAG,235.0,Location 8,Free Practice 1,29.8600,36.1,38.34,1014.13,1.00000,202.0,0.0
10273,19,70,0,137.100,0,22.681,0,24.308,0,46.1,1695.1,PWEHRL,12.580952,TAG,250.0,Location 8,Free Practice 1,30.0500,36.2,38.30,1014.19,1.39100,203.0,0.0
10274,20,155,2,22.539,2,21.057,2,23.548,2,126.3,1762.3,PWEHRL,12.580952,TAG,250.0,Location 8,Free Practice 1,29.8300,36.2,38.59,1014.16,2.54000,273.0,0.0
