## Подготовка данных и разведочный анализ

In [1]:
# imports
import pandas as pd
# import pandas_profiling
from functools import reduce

#### Константы
- пути к данным
- признаки, описывающие данные
- типы "столбцов", или "подпризнаков"
- ID ТС

In [2]:
# Directory the raw per-vehicle CSV files are read from.
DATA_PATH = 'data/vehicle_dataset_public/'
# Directory the combined per-vehicle CSV files are written to and read back from.
DATA_PROC_PATH = 'data/vehicle_proccessed/'

# Tags are embedded in the raw CSV file names, so their spelling must be
# kept exactly as is.
TAGS_FEATURES = [
    'fuelLevel',
    'ingection',
    'speedAndHeight',
    'tachometer',
]
TAGS_TARGET = ['refueling2']
TAGS = [*TAGS_FEATURES, *TAGS_TARGET]

# Columns parsed to datetimes when present in a loaded file.
TIME_COLUMNS = ['DTIME', 'STARTDATE', 'ENDDATE']
# Columns converted from comma-decimal text to float when present.
FLOAT_COLUMNS = ['BEVALUE', 'SPEED', 'HEIGHT']

# IDs of the vehicles to process.
VEHICLE_IDS = [1, 3, 5, 19, 28]

In [3]:
# Dictionary of DataFrames, one per tag, for a single vehicle
def dict_df_vehicle_i(i):
    """Load every per-tag CSV file of vehicle ``i`` into a dict of DataFrames.

    For each tag in ``TAGS`` the ``;``-separated file
    ``vehicle{i}_{tag}_public.csv`` is read from ``DATA_PATH``. Columns listed
    in ``FLOAT_COLUMNS`` that were read as text are converted from
    comma-decimal strings to float, the ``BEVALUE`` column is renamed to the
    tag name, and columns listed in ``TIME_COLUMNS`` are parsed to datetimes.

    Args:
        i: Vehicle ID used to build the file names.

    Returns:
        dict mapping each tag name to the DataFrame loaded for that tag.
    """
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype

    dict_df_vehicle = {}
    for tag in TAGS:
        df = pd.read_csv(f'{DATA_PATH}vehicle{i}_{tag}_public.csv', sep=';')

        for col in FLOAT_COLUMNS:
            # Text columns hold numbers with a decimal comma. Accept both the
            # legacy object dtype and pandas' dedicated string dtypes: an
            # ``== 'object'`` comparison alone misses the latter and would
            # silently leave the values as text.
            if col in df.columns and (is_object(df[col]) or is_string(df[col])):
                df[col] = (df[col].str.replace(',', '.', regex=False)
                           .astype(float))

        # The value column is named after the tag it carries.
        df = df.rename(columns={'BEVALUE': tag})

        for time_col in TIME_COLUMNS:
            if time_col in df.columns:
                df[time_col] = pd.to_datetime(df[time_col])

        dict_df_vehicle[tag] = df

    return dict_df_vehicle

In [4]:
# Merge all "source" feature DataFrames from the dict
# into a single DataFrame
def df_summary(dict_df):
    """Outer-join the feature DataFrames on ``DTIME``.

    Args:
        dict_df: Mapping from tag name to DataFrame; must contain every tag
            listed in ``TAGS_FEATURES``.

    Returns:
        The merged DataFrame with all column names lower-cased.
    """
    def _outer_join(acc, frame):
        return pd.merge(acc, frame, on='DTIME', how='outer')

    feature_frames = [dict_df[tag] for tag in TAGS_FEATURES]
    merged = reduce(_outer_join, feature_frames)

    merged.columns = merged.columns.str.lower()
    return merged

In [5]:
def combine_data():
    """Build and save one combined CSV file per vehicle.

    For every ID in ``VEHICLE_IDS`` the per-tag frames are loaded, the feature
    frames are merged, and a ``refuel`` column is added: 1 when the row's
    ``dtime`` falls inside any refuelling interval
    (``STARTDATE < dtime <= ENDDATE``), otherwise 0. The result is written to
    ``DATA_PROC_PATH`` as ``vehicle{i}.csv`` with the index labelled ``i``.
    """
    for i in VEHICLE_IDS:
        dict_df = dict_df_vehicle_i(i)

        df = df_summary(dict_df)
        df_refuel = dict_df['refueling2']

        # Iterate over the refuelling intervals and compare the whole
        # timestamp column at once, instead of scanning every interval from
        # Python for each individual row.
        in_refuel = pd.Series(False, index=df.index)
        for start, end in zip(df_refuel['STARTDATE'], df_refuel['ENDDATE']):
            in_refuel |= (df['dtime'] > start) & (df['dtime'] <= end)
        df['refuel'] = in_refuel.astype('int64')

        name = f'vehicle{i}'
        df.to_csv(DATA_PROC_PATH + name + '.csv', index_label='i')

In [6]:
# Generate the combined per-vehicle CSV files under DATA_PROC_PATH.
combine_data()

In [7]:
def df_for_all_vehicles():
    """Load the processed CSV of every vehicle and stack them into one frame.

    Each file ``vehicle{i}.csv`` in ``DATA_PROC_PATH`` is read and given a
    ``TSID`` column holding the vehicle ID as a string.

    Returns:
        A single DataFrame with the rows of all vehicles and a fresh index.
    """
    def _load(vehicle_id):
        frame = pd.read_csv(f'{DATA_PROC_PATH}vehicle{vehicle_id}.csv')
        return frame.assign(TSID=str(vehicle_id))

    per_vehicle = [_load(vehicle_id) for vehicle_id in VEHICLE_IDS]
    return pd.concat(per_vehicle, ignore_index=True)

In [8]:
# Load the processed per-vehicle CSV files into one DataFrame tagged by TSID.
df_vehicles_all = df_for_all_vehicles()

# Bare expression: shows the frame as the cell output (notebook-style export).
df_vehicles_all



Unnamed: 0,i,dtime,fuellevel,ingection,speed,height,tachometer,refuel,TSID
0,0,2020-01-08 21:16:02,0.0,0,0,-22.9,0,0,1
1,1,2020-01-08 21:17:04,0.0,0,0,-22.9,0,0,1
2,2,2020-01-08 21:18:04,0.0,0,0,-22.9,0,0,1
3,3,2020-01-09 10:05:26,49.7,1,0,-22.9,1248,0,1
4,4,2020-01-09 10:06:27,49.9,1,0,-22.9,1056,0,1
...,...,...,...,...,...,...,...,...,...
290141,93186,2020-06-30 18:13:27,,1,16,255.4,1792,1,28
290142,93187,2020-06-30 18:13:52,,1,39,250.6,2368,1,28
290143,93188,2020-06-30 18:14:25,,1,12,244.4,2400,1,28
290144,93189,2020-06-30 18:14:33,,1,5,243.4,1888,1,28
