In [None]:
import statistics
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
from sklearn.model_selection import train_test_split
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from ast import literal_eval
import os
import math
tqdm.pandas(desc="Progress!")
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_fms_data(fms_data_path):
    """Read the FMS signal table that maps signal codes to signal names.

    Only the ``Code`` and ``Name`` columns of the semicolon-separated file are
    loaded. ``Code`` is renamed to ``key`` and converted to ``str``.

    Parameters
    ----------
    fms_data_path : str
        Location of the semicolon-separated, UTF-8 encoded FMS file.

    Returns
    -------
    pandas.DataFrame
        Frame holding a string ``key`` column and the ``Name`` column.
    """
    signal_table = pd.read_csv(
        fms_data_path,
        sep=';',
        usecols=['Code', 'Name'],
        encoding='utf-8',
    )
    signal_table = signal_table.rename(columns={'Code': 'key'})
    return signal_table.assign(key=signal_table['key'].astype(str))

In [None]:
def _clean_zf_file(path, df_fms):
    """Turn one gzipped, pipe-separated raw signal file into a one-row-per-second frame.

    Parameters
    ----------
    path : str
        Gzipped raw file with the columns ``ts_msg_usec``, ``timedelta_usec``,
        ``key``, ``value2`` and ``value``.
    df_fms : pandas.DataFrame
        Lookup with ``key`` and ``Name`` columns used to name the signals.

    Returns
    -------
    pandas.DataFrame
        One row per second between the first and last timestamp of the file,
        with one column per signal name.
    """
    df_raw = pd.read_csv(path, compression='gzip', sep='|',
                         usecols=['ts_msg_usec', 'timedelta_usec', 'key', 'value2', 'value'])

    # The two microsecond columns are summed, truncated to whole seconds and
    # expressed in Europe/Berlin time.
    stamps = pd.to_datetime(df_raw['ts_msg_usec'] + df_raw['timedelta_usec'], unit='us')
    df_raw['Time_stamp'] = stamps.astype('datetime64[s]').dt.tz_localize('utc').dt.tz_convert(
        'Europe/Berlin')
    # Within one second keep only the last row of every key.
    df_raw = df_raw.drop_duplicates(subset=['Time_stamp', 'key'], keep='last')

    merge_df = df_raw.merge(df_fms, on='key', how='left')
    # Assign the filled columns back explicitly: a chained
    # ``merge_df.Name.fillna(..., inplace=True)`` does not update the frame
    # once pandas runs with Copy-on-Write.
    merge_df['Name'] = merge_df['Name'].fillna(merge_df['key'])
    merge_df['value2'] = merge_df['value2'].fillna(merge_df['value'])

    # Long -> wide: one column per signal name.
    wide = merge_df.pivot(index='Time_stamp', columns='Name', values='value2').reset_index()
    wide['lat'] = wide['lat'].astype(float)
    wide['lon'] = wide['lon'].astype(float)
    wide.columns.name = None

    # Build a gap-free 1 s grid and left-join the observations onto it.
    per_second = pd.DataFrame({
        'Time_stamp': pd.date_range(wide['Time_stamp'].iloc[0], wide['Time_stamp'].iloc[-1],
                                    freq='1s')
    })
    per_second = per_second.merge(wide, on='Time_stamp', how='left')
    per_second['lat'] = per_second['lat'].interpolate().ffill().bfill()
    per_second['lon'] = per_second['lon'].interpolate().ffill().bfill()
    # Seconds without a speed reading are filled with 0.
    per_second['WheelBasedVehicleSpeed'] = per_second['WheelBasedVehicleSpeed'].fillna(0)
    return per_second


def load_and_clean_zf_data(fms_data_path,zf_raw_data_path,zf_cleaned_data_path):
    """Clean every raw file below ``zf_raw_data_path`` and write one CSV per top-level entry.

    The raw directory is walked three levels deep
    (``<entry>/<folder>/<file>``). All files below one top-level entry are
    cleaned and concatenated in sorted order, then written to
    ``<zf_cleaned_data_path>/<last 7 characters of the entry path>.csv``.

    Parameters
    ----------
    fms_data_path : str
        Path handed to ``load_fms_data``.
    zf_raw_data_path : str
        Root directory of the raw data.
    zf_cleaned_data_path : str
        Existing directory that receives the cleaned CSV files.
    """
    df_fms = load_fms_data(fms_data_path)

    for file in glob(zf_raw_data_path + '/*'):
        # Collect the per-file frames and concatenate once instead of growing
        # a DataFrame inside the loop.
        cleaned_frames = []
        for folder in tqdm(sorted(glob(file + '/*'))):
            for path in sorted(glob(folder + '/*')):
                cleaned_frames.append(_clean_zf_file(path, df_fms))

        # An entry without any file still produces an (empty) CSV.
        final_df = pd.concat(cleaned_frames) if cleaned_frames else pd.DataFrame()
        final_df.to_csv(zf_cleaned_data_path + '/' + str(file[-7:]) + '.csv', sep=",", index=False)

In [None]:
def truck_position(df):
    """Label every row with the truck's location and mark inside/outside changes.

    Each row's (``lon``, ``lat``) point is tested against the plant 2 polygon
    first and the plant 1 polygon second. Two columns are added to ``df`` in
    place:

    * ``location`` - ``'2'`` or ``'1'`` while inside a plant polygon; outside
      both polygons it is ``'<last plant>-road'``, or ``''`` when no plant
      has been seen yet.
    * ``logic`` - ``1`` on a row whose inside/outside state differs from the
      previous row's, otherwise ``0``. The state before the first row counts
      as "inside".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``lon`` and ``lat`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    plant_1 = Polygon( [(9.489763, 47.660213), (9.491629, 47.661182), (9.492552, 47.660907), (9.494827, 47.660633),
              (9.497251, 47.658797), (9.490556, 47.655126), (9.487123, 47.653854), (9.483626, 47.655834),
              (9.485106, 47.657482), (9.48766, 47.659231), (9.489763, 47.660213)])
    plant_2 = Polygon([(9.466138, 47.667208), (9.46352, 47.667251), (9.462512, 47.661471), (9.464314, 47.658335),
              (9.473004, 47.658581), (9.473948, 47.662439), (9.471889, 47.664925), (9.466138, 47.667208)])

    # (label inside the plant, label on the road after leaving it, polygon);
    # order matters: plant 2 is tested before plant 1.
    plants = (
        ('2', '2-road', plant_2),
        ('1', '1-road', plant_1),
    )

    location = []
    logic = []
    outside = False          # the state before the first row counts as inside
    last_road_label = ''     # stays empty until a plant has been visited

    for record in tqdm(df.to_dict('records')):
        position = Point(record['lon'], record['lat'])
        hit = next((plant for plant in plants if plant[2].contains(position)), None)

        if hit is None:
            # Outside both plants: flag only the first row after leaving.
            location.append(last_road_label)
            logic.append(0 if outside else 1)
            outside = True
        else:
            # Inside a plant: flag only the first row after arriving.
            plant_label, road_label, _ = hit
            logic.append(1 if outside else 0)
            location.append(plant_label)
            last_road_label = road_label
            outside = False

    df['location'] = location
    df['logic'] = logic

    return df

In [None]:
def travel_time_less_3(new_df):
    """Drop (even, odd) row pairs whose timestamps are less than 3 minutes apart.

    Rows are paired by position: (0, 1), (2, 3), ... A trailing unpaired row
    is left untouched. For every pair where
    ``Time_stamp[odd] - Time_stamp[even] < 3 minutes`` both rows are removed.

    The previous implementation skipped the final pair whenever the frame had
    an even number of rows (guard ``index < len(new_df) - 1``), so a short
    last pair survived the filter. Every complete pair is now checked.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Frame with a ``Time_stamp`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``new_df`` object, with the short pairs removed and a fresh
        0..n-1 index.
    """
    # Work on positions so the result does not depend on the index labels.
    stamps = new_df['Time_stamp'].reset_index(drop=True)
    second_pos = np.arange(1, len(stamps), 2)
    first_pos = second_pos - 1

    second_stamps = stamps.iloc[second_pos].reset_index(drop=True)
    first_stamps = stamps.iloc[first_pos].reset_index(drop=True)
    too_short = ((second_stamps - first_stamps) < pd.Timedelta(minutes=3)).to_numpy(dtype=bool)

    drop_positions = np.concatenate([second_pos[too_short], first_pos[too_short]])
    new_df.drop(new_df.index[drop_positions], inplace=True)
    new_df.reset_index(drop=True, inplace=True)
    return new_df

In [None]:
def _pair_trips(locations, stamps, depart_label, arrive_label):
    """Pair each ``depart_label`` row with the next ``arrive_label`` row.

    While waiting for a departure every other row is ignored; after a
    departure every row is ignored until the arrival label shows up.
    A trailing departure that never gets an arrival is discarded so both
    returned lists always have the same length.
    """
    starts, ends = [], []
    travelling = False
    for place, stamp in zip(locations, stamps):
        if not travelling:
            if place == depart_label:
                starts.append(stamp)
                travelling = True
        elif place == arrive_label:
            ends.append(stamp)
            travelling = False
    return starts[:len(ends)], ends


def travel_time_information(new_df):
    """Build the trip tables for both directions from the transition rows.

    A 1->2 trip starts at a row with location ``'1-road'`` and ends at the next
    row with location ``'2'``; a 2->1 trip starts at ``'2-road'`` and ends at
    the next ``'1'``.

    Previously a departure without a later arrival (for example data ending
    mid-trip) left the start list one element longer than the end list, and
    building the frame raised ``ValueError``. Such incomplete trips are now
    ignored.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Frame with ``location`` and ``Time_stamp`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(final_df1, final_df2)`` where ``final_df1`` has the columns
        ``start_plant1``, ``end_plant2``, ``travel_time(1-2)`` and
        ``final_df2`` has ``start_plant2``, ``end_plant1``,
        ``travel_time(2-1)``. Travel times are in minutes.
    """
    locations = new_df['location'].tolist()
    stamps = list(new_df['Time_stamp'])

    start2, end1 = _pair_trips(locations, stamps, '2-road', '1')
    start1, end2 = _pair_trips(locations, stamps, '1-road', '2')

    final_df1 = pd.DataFrame()
    final_df2 = pd.DataFrame()

    final_df1['start_plant1'] = start1
    final_df1['end_plant2'] = end2

    final_df2['start_plant2'] = start2
    final_df2['end_plant1'] = end1

    final_df1['travel_time(1-2)'] = final_df1['end_plant2'] - final_df1['start_plant1']
    final_df2['travel_time(2-1)'] = final_df2['end_plant1'] - final_df2['start_plant2']

    # Timedelta -> minutes as float.
    final_df1['travel_time(1-2)'] = final_df1['travel_time(1-2)'].apply(lambda x: x.total_seconds() / 60)
    final_df2['travel_time(2-1)'] = final_df2['travel_time(2-1)'].apply(lambda x: x.total_seconds() / 60)

    return final_df1,final_df2

In [None]:
def _object_column(items):
    """Pack arbitrary objects into a 1-D object array, one object per cell.

    Filling element by element keeps equally long arrays from being stacked
    into a 2-D block.
    """
    column = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        column[position] = item
    return column


def _attach_trip_signals(trips, df, start_col, end_col, lat_col, lon_col, speed_col, speed_threshold):
    """Add the GPS trace, speed trace and slow-share column of every trip to ``trips``."""
    lat_traces, lon_traces, speed_traces, slow_share = [], [], [], []
    for start, end in zip(trips[start_col], trips[end_col]):
        # One mask per trip, shared by all three signals.
        window = df[df['Time_stamp'].between(start, end)]
        speeds = window['WheelBasedVehicleSpeed'].values
        lat_traces.append(window['lat'].values)
        lon_traces.append(window['lon'].values)
        speed_traces.append(speeds)
        # Share of samples whose speed is below the threshold.
        slow_share.append(statistics.mean([1 if float(i) < speed_threshold else 0 for i in speeds]))

    trips[lat_col] = _object_column(lat_traces)
    trips[lon_col] = _object_column(lon_traces)
    trips[speed_col] = _object_column(speed_traces)
    trips['speed_threshold'] = slow_share
    return trips


def fetch_gps_and_speed_infromation(final_df1,final_df2,df,speed_threshold):
    """Attach the per-second GPS and speed samples of every trip to the trip tables.

    For each trip all rows of ``df`` whose ``Time_stamp`` lies between the
    trip's start and end (both inclusive) are selected. Their ``lat``, ``lon``
    and ``WheelBasedVehicleSpeed`` values are stored as arrays in the trip
    row, together with ``speed_threshold``: the fraction of those samples
    with a speed below the ``speed_threshold`` argument.

    The selection mask is computed once per trip (it used to be rebuilt for
    every signal), and an empty trip table no longer fails on column
    assignment.

    Parameters
    ----------
    final_df1 : pandas.DataFrame
        Trips with ``start_plant1`` / ``end_plant2``; gains ``GPS_1_2_lat``,
        ``GPS_1_2_lon``, ``speed_1_2`` and ``speed_threshold``.
    final_df2 : pandas.DataFrame
        Trips with ``start_plant2`` / ``end_plant1``; gains ``GPS_2_1_lat``,
        ``GPS_2_1_lon``, ``speed_2_1`` and ``speed_threshold``.
    df : pandas.DataFrame
        Frame with ``Time_stamp``, ``lat``, ``lon`` and
        ``WheelBasedVehicleSpeed`` columns.
    speed_threshold : float
        Samples strictly below this speed count as slow.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``(final_df1, final_df2)`` objects, modified in place.

    Raises
    ------
    statistics.StatisticsError
        If a trip window contains no row of ``df``.
    """
    _attach_trip_signals(final_df2, df, 'start_plant2', 'end_plant1',
                         'GPS_2_1_lat', 'GPS_2_1_lon', 'speed_2_1', speed_threshold)
    _attach_trip_signals(final_df1, df, 'start_plant1', 'end_plant2',
                         'GPS_1_2_lat', 'GPS_1_2_lon', 'speed_1_2', speed_threshold)

    return final_df1,final_df2

In [None]:
def _first_route_hit(lons, lats, ordered_routes):
    """Return the id of the first route polygon a GPS trace enters, or NaN.

    The trace is walked point by point; for every point the polygons are
    tested in the order given by ``ordered_routes`` and the first match wins.
    """
    for lon, lat in zip(lons, lats):
        position = Point(lon, lat)
        for route_id, polygon in ordered_routes:
            if polygon.contains(position):
                return route_id
    return np.nan


def fetch_route_information(final_df1,final_df2,name,zf_preprocessed_data1_path,zf_preprocessed_data2_path):
    """Assign a route id to every trip and write both trip tables to CSV.

    Each trip receives the id (1-4) of the first route polygon its GPS trace
    enters; per point the polygons are tested in the order 4, 3, 2, 1.

    Previously a trip that never entered any polygon added nothing to the
    route list, so the list became shorter than the table and the column
    assignment raised ``ValueError``. Such trips now get ``NaN``.

    Parameters
    ----------
    final_df1 : pandas.DataFrame
        1->2 trips with ``GPS_1_2_lon`` / ``GPS_1_2_lat`` array columns; gains
        ``route_1_2`` in place.
    final_df2 : pandas.DataFrame
        2->1 trips with ``GPS_2_1_lon`` / ``GPS_2_1_lat`` array columns; gains
        ``route_2_1`` in place.
    name : str
        Stem used for the output file names.
    zf_preprocessed_data1_path : str
        Directory receiving ``<name>(1-2).csv``.
    zf_preprocessed_data2_path : str
        Directory receiving ``<name>(2-1).csv``.
    """
    route_1= Polygon([(9.484763, 47.658797), (9.481587, 47.660994), (9.478283, 47.663277), (9.482145, 47.664867),
              (9.487467, 47.661168), (9.484763, 47.658797)])
    route_2= Polygon([(9.474764, 47.658277), (9.4806, 47.658971), (9.481115, 47.657815), (9.475107, 47.656832),
              (9.474764, 47.658277)])
    route_3= Polygon([(9.485664, 47.665792), (9.490042, 47.668624), (9.496651, 47.665503), (9.497509, 47.661977),
              (9.494247, 47.660763), (9.485664, 47.665792)])
    route_4= Polygon([(9.475193, 47.669665), (9.476137, 47.665908), (9.48266, 47.666312), (9.482231, 47.669896),
              (9.475193, 47.669665)])

    # Test order per GPS point; the first polygon containing the point wins.
    ordered_routes = ((4, route_4), (3, route_3), (2, route_2), (1, route_1))

    final_df2['route_2_1'] = [
        _first_route_hit(lons, lats, ordered_routes)
        for lons, lats in tqdm(zip(final_df2['GPS_2_1_lon'], final_df2['GPS_2_1_lat']))
    ]
    final_df1['route_1_2'] = [
        _first_route_hit(lons, lats, ordered_routes)
        for lons, lats in tqdm(zip(final_df1['GPS_1_2_lon'], final_df1['GPS_1_2_lat']))
    ]

    # The raw GPS / speed arrays are not written out.
    final_df1 = final_df1[['start_plant1','end_plant2','travel_time(1-2)','route_1_2','speed_threshold']]
    final_df2 = final_df2[['start_plant2','end_plant1','travel_time(2-1)','route_2_1','speed_threshold']]

    final_df1.to_csv(zf_preprocessed_data1_path + '/' + str(name) + '(1-2).csv', sep=",", index=False)
    final_df2.to_csv(zf_preprocessed_data2_path + '/' + str(name) + '(2-1).csv', sep=",", index=False)


In [None]:
def preprocess_zf_data(zf_cleaned_data_path,zf_preprocessed_data1_path,zf_preprocessed_data2_path,speed_threshold):
    """Run the trip-extraction pipeline on every cleaned CSV file.

    For each file in ``zf_cleaned_data_path`` (sorted by name) the steps are:
    locate the truck, keep the transition rows (``logic == 1``), drop short
    pairs, build the trip tables, attach GPS / speed samples and finally
    classify and store the routes.

    Parameters
    ----------
    zf_cleaned_data_path : str
        Directory with the cleaned CSV files.
    zf_preprocessed_data1_path : str
        Output directory passed on for the 1->2 trips.
    zf_preprocessed_data2_path : str
        Output directory passed on for the 2->1 trips.
    speed_threshold : float
        Threshold forwarded to ``fetch_gps_and_speed_infromation``.
    """
    for path in sorted(glob(zf_cleaned_data_path + '/*')):

        df = pd.read_csv(path,sep=",", usecols=['Time_stamp','lat','lon','WheelBasedVehicleSpeed'],encoding='utf-8')
        # ``infer_datetime_format`` is deprecated since pandas 2.0 and has no
        # effect there, so it is no longer passed.
        # NOTE(review): if a file mixes UTC offsets, confirm the parsed column
        # is still datetime-like.
        df['Time_stamp'] = pd.to_datetime(df['Time_stamp'])

        print('-->Finding the position of the truck')
        df = truck_position(df)
        # Chained reset_index returns an independent frame, so the in-place
        # edits made downstream do not act on a slice of ``df``.
        new_df = df[df['logic'] == 1].reset_index(drop=True)

        print('-->Removing records having travel time less that 3 minutes')
        new_df = travel_time_less_3(new_df)

        print('-->Finding the Travel time information')
        final_df1,final_df2 = travel_time_information(new_df)

        print('-->Fetching Gps inforamtion and speed information')
        final_df1,final_df2 = fetch_gps_and_speed_infromation(final_df1,final_df2,df,speed_threshold)

        print('-->route information')
        # File stem without directory and extension; equals ``path[-11:-4]``
        # for seven-character stems.
        name = os.path.splitext(os.path.basename(path))[0]
        fetch_route_information(final_df1,final_df2,name,zf_preprocessed_data1_path,zf_preprocessed_data2_path)

In [None]:
def merge_zf_data(zf_dataset_csv,zf_preprocessed_data2_path):
    """Merge the 2->1 trip files into one dataset with calendar features.

    All CSV files in ``zf_preprocessed_data2_path`` are concatenated, reduced
    to trips with ``route_2_1 == 1`` and ``travel_time(2-1) < 20``, enriched
    with ``Week_Day``, ``Week_Day_Name``, ``Week``, ``time``, ``Hour``,
    ``Minutes`` and ``Seconds`` derived from ``start_plant2`` and written to
    ``zf_dataset_csv``.

    Parameters
    ----------
    zf_dataset_csv : str
        Output CSV path.
    zf_preprocessed_data2_path : str
        Directory holding the per-file 2->1 trip CSVs.

    Raises
    ------
    FileNotFoundError
        If the input directory contains no file.
    """
    frames = [pd.read_csv(path) for path in sorted(glob(zf_preprocessed_data2_path+'/*'))]
    if not frames:
        raise FileNotFoundError('No preprocessed files found in ' + str(zf_preprocessed_data2_path))

    # Concatenate once. The reversed order (last file first) reproduces the
    # row order of the earlier prepend-in-a-loop, so order-dependent consumers
    # of the CSV see the same rows in the same order.
    df_zf = pd.concat(frames[::-1])

    keep = (df_zf['route_2_1'] == 1) & (df_zf['travel_time(2-1)'] < 20)
    # Copy so the column assignments below act on an independent frame.
    df_zf = df_zf[keep].copy()

    # ``infer_datetime_format`` is deprecated since pandas 2.0 and has no
    # effect there, so it is no longer passed.
    # NOTE(review): timestamps with mixed UTC offsets may not parse to a
    # datetime dtype, in which case the ``.dt`` accessors below fail - confirm.
    start = pd.to_datetime(df_zf['start_plant2'])
    df_zf['start_plant2'] = start
    df_zf['Week_Day'] = start.dt.weekday
    df_zf['Week_Day_Name'] = start.dt.day_name()
    df_zf['Week'] = start.dt.isocalendar().week
    df_zf['time'] = start.dt.time
    df_zf['Hour'] = start.dt.hour
    df_zf['Minutes'] = start.dt.minute
    df_zf['Seconds'] = start.dt.second
    print(df_zf.head())
    df_zf.to_csv(zf_dataset_csv,index=False)

In [None]:
def merge_and_split_data(zf_dataset_csv,weather_data_path,test_data_path,train_data_path,split_ratio):
    """Join trips with weather on the hour and write a train / test split.

    Both inputs get a ``time`` key formatted as ``"%Y-%m-%d %H"`` (from
    ``start_plant2`` for the trips, from ``Timestamp`` for the weather) and
    are inner-joined on it. The feature columns plus ``travel_time(2-1)`` are
    kept and split with ``train_test_split`` using ``random_state=42``.

    Parameters
    ----------
    zf_dataset_csv : str
        CSV with the trip dataset.
    weather_data_path : str
        CSV with the weather dataset.
    test_data_path : str
        Output path of the test split.
    train_data_path : str
        Output path of the train split.
    split_ratio : float
        Passed to ``train_test_split`` as ``test_size``.
    """
    hour_key_format = "%Y-%m-%d %H"
    model_columns = [
        'Week_Day', 'Week', 'Hour', 'Minutes', 'Seconds', 'speed_threshold',
        'Clouds', 'Temp', 'Wind_deg', 'Wind_speed',
        'Rain_1h', 'Rain_3h', 'Snow_1h', 'Snow_3h',
        'travel_time(2-1)',
    ]

    trips = pd.read_csv(zf_dataset_csv)
    weather = pd.read_csv(weather_data_path)

    weather['Timestamp'] = pd.to_datetime(weather['Timestamp'])
    trips['start_plant2'] = pd.to_datetime(trips['start_plant2'])

    # Hour-resolution join key on both sides.
    weather['time'] = weather['Timestamp'].apply(lambda stamp: stamp.strftime(hour_key_format))
    trips['time'] = trips['start_plant2'].apply(lambda stamp: stamp.strftime(hour_key_format))

    dataset = pd.merge(trips, weather, on='time')[model_columns]

    train, test = train_test_split(dataset, test_size=split_ratio, random_state=42)

    train.to_csv(train_data_path, sep=",", index=False, encoding="utf-8")
    test.to_csv(test_data_path, sep=",", index=False, encoding="utf-8")

In [None]:
# Driver cell: defines the input / output locations as module-level names and
# runs the four ZF pipeline stages in order.

# Semicolon-separated FMS signal table read by load_fms_data.
fms_data_path = '/content/drive/MyDrive/Data/zf_data/FMS_signals_raw.csv'

# Raw input, cleaned per-entry CSVs, per-direction trip CSVs and merged dataset.
zf_raw_data_path = '/content/drive/MyDrive/Data/zf_data/raw_data'
zf_cleaned_data_path = '/content/drive/MyDrive/Data/zf_data/cleaned_data'
zf_preprocessed_data1_path = '/content/drive/MyDrive/Data/zf_data/preprocessed_data/plant1-plant2'
zf_preprocessed_data2_path = '/content/drive/MyDrive/Data/zf_data/preprocessed_data/plant2-plant1'
zf_dataset_csv = '/content/drive/MyDrive/Data/zf_data/merge_data/zf_data.csv'

# Outputs of the train / test split and the weather CSV it is joined with.
# NOTE(review): weather_data_path is the same string as weather_dataset_csv,
# which a later cell writes via preprocess_weather_data - that file presumably
# has to exist before this cell runs; confirm the intended cell order.
test_data_path = '/content/drive/MyDrive/Data/merged_data/test.csv'
train_data_path = '/content/drive/MyDrive/Data/merged_data/train.csv'
weather_data_path = '/content/drive/MyDrive/Data/weather_data/weather.csv'

# split_ratio is passed as test_size to train_test_split; speed_threshold is
# the speed below which a sample counts towards the 'speed_threshold' feature.
split_ratio = 0.2
speed_threshold = 5

print('Load and Clean ZF Data:')
load_and_clean_zf_data(fms_data_path,zf_raw_data_path,zf_cleaned_data_path)

print('Preprocess ZF Data:')
preprocess_zf_data(zf_cleaned_data_path,zf_preprocessed_data1_path,zf_preprocessed_data2_path,speed_threshold)

# Only the plant2 -> plant1 trips are merged into the final dataset.
print('Merge ZF Data:')
merge_zf_data(zf_dataset_csv,zf_preprocessed_data2_path)

print('Merge and Split Complete Data:')
merge_and_split_data(zf_dataset_csv,weather_data_path,test_data_path,train_data_path,split_ratio)

In [None]:
def store_weather_data(start,end,output_path=None,api_key=None):
    """Download hourly weather history for Friedrichshafen and store it as CSV.

    The interval ``[start, end]`` is covered by consecutive requests of 168
    hourly records each; the request start advances by 167 hours per call.

    Parameters
    ----------
    start : int
        Start of the interval as a Unix timestamp in seconds.
    end : int
        End of the interval as a Unix timestamp in seconds.
    output_path : str, optional
        Destination CSV. Defaults to the module-level ``weather_raw_data_path``
        (the location that used to be read implicitly).
    api_key : str, optional
        OpenWeatherMap API key. Defaults to the ``OPENWEATHERMAP_API_KEY``
        environment variable; the key is no longer embedded in the source.

    Raises
    ------
    ValueError
        If no API key is supplied and the environment variable is unset.
    """
    if api_key is None:
        api_key = os.environ.get('OPENWEATHERMAP_API_KEY')
    if not api_key:
        raise ValueError('No OpenWeatherMap API key: pass api_key or set OPENWEATHERMAP_API_KEY')
    if output_path is None:
        output_path = weather_raw_data_path

    # NOTE(review): each request asks for 168 records but the window advances
    # by 167 hours, so consecutive responses presumably overlap by one hour -
    # confirm whether the duplicate record is intended.
    step_seconds = 167 * 3600
    # https keeps the key out of clear-text traffic.
    url_template = ('https://history.openweathermap.org/data/2.5/history/city'
                    '?q=Friedrichshafen,DE&type=hour&start={0}&cnt=168&appid={1}')

    pages = []
    for _ in tqdm(range(math.ceil((end-start)/step_seconds))):
        pages.append(pd.read_json(url_template.format(start, api_key)))
        start = start + step_seconds

    # Concatenate once; with no request at all an empty CSV is still written.
    df_weather = pd.concat(pages) if pages else pd.DataFrame()
    df_weather.to_csv(output_path,index=False)

In [None]:
def clean_weather_data(weather_cleaned_data_path,weather_raw_data_path):
    """Flatten the raw weather CSV into one row of scalar fields per record.

    Every value of the raw ``list`` column is the string form of a dict; it
    is parsed with ``literal_eval`` and reduced to the columns ``Timestamp``
    (Europe/Berlin), ``Clouds``, ``Temp``, ``Weather``, ``Wind_deg``,
    ``Wind_speed``, ``Rain_1h``, ``Rain_3h``, ``Snow_1h`` and ``Snow_3h``.
    Missing rain / snow / wind values become NaN.

    ``np.nan`` replaces the ``np.NaN`` alias (removed in NumPy 2.0), and a
    ``wind`` entry lacking ``deg`` or ``speed`` now yields NaN instead of
    raising ``KeyError``.

    Parameters
    ----------
    weather_cleaned_data_path : str
        Output CSV path.
    weather_raw_data_path : str
        Raw CSV path; must contain a ``list`` column.
    """
    # Wed Sep 01 2021 00:00:01 GMT+0200 (Central European Summer Time)
    # Tue Mar 01 2022 00:00:01 GMT+0100 (Central European Standard Time)
    # store_weather_data(1630447201,1646089201)

    columns = ['Timestamp', 'Clouds', 'Temp', 'Weather', 'Wind_deg', 'Wind_speed', 'Rain_1h', 'Rain_3h', 'Snow_1h',
               'Snow_3h']

    df_weather = pd.read_csv(weather_raw_data_path)

    records = []
    for raw_entry in df_weather['list']:
        entry = literal_eval(raw_entry)
        # Optional sections default to an empty dict so every lookup below
        # falls back to NaN.
        rain = entry.get('rain', {})
        snow = entry.get('snow', {})
        wind = entry.get('wind', {})
        records.append({
            # 'dt' is read as seconds (unit='s'), localized to UTC and
            # converted to Europe/Berlin.
            'Timestamp': pd.to_datetime(entry['dt'], unit='s').tz_localize('utc').tz_convert('Europe/Berlin'),
            'Clouds': entry['clouds']['all'],
            'Temp': entry['main']['temp'],
            'Weather': entry['weather'][0]['description'],
            'Wind_deg': wind.get('deg', np.nan),
            'Wind_speed': wind.get('speed', np.nan),
            'Rain_1h': rain.get('1h', np.nan),
            'Rain_3h': rain.get('3h', np.nan),
            'Snow_1h': snow.get('1h', np.nan),
            'Snow_3h': snow.get('3h', np.nan),
        })

    final_weather = pd.DataFrame(records, columns=columns)
    final_weather.to_csv(weather_cleaned_data_path,index=False)


In [None]:
def preprocess_weather_data(weather_cleaned_data_path,weather_raw_data_path,weather_dataset_csv):
    """Write the final weather dataset: the cleaned CSV with missing values set to 0.

    Parameters
    ----------
    weather_cleaned_data_path : str
        CSV that is read.
    weather_raw_data_path : str
        Not read by this function.
    weather_dataset_csv : str
        Output CSV path.
    """
    cleaned_weather = pd.read_csv(weather_cleaned_data_path)
    weather_dataset = cleaned_weather.fillna(0)
    weather_dataset.to_csv(weather_dataset_csv, index=False)

In [None]:
# Weather driver cell: defines the weather file locations as module-level
# names and builds the final weather dataset from the cleaned CSV.
# weather_raw_data_path is also the name store_weather_data reads as a global.
weather_raw_data_path = '/content/drive/MyDrive/Data/weather_data/raw_data/weather.csv'
weather_cleaned_data_path = '/content/drive/MyDrive/Data/weather_data/cleaned_data/cleaned_weather.csv'
# Same path string as weather_data_path, which merge_and_split_data reads.
weather_dataset_csv = '/content/drive/MyDrive/Data/weather_data/weather.csv'
preprocess_weather_data(weather_cleaned_data_path,weather_raw_data_path,weather_dataset_csv)