# S6. App Structure

## Алгоритм

1. На входе данные за 1 час - показания по экологии и деперсонализированные события по авто
2. Каждый автомобиль получает или теряет баллы в зависимости от показаний экодатчиков в момент проезда
3. Математика начисления баллов (и выставления порогов) модернизируется с каждой итерацией
4. Авто, чьи баллы превысили порог, передаются на обработку экополиции, те возвращаются к нам с информацией о том, какая достигнута точность
5. Повторяем пока не получим нужную точность

## 1. Получение входных данных

1. Отсортировать данные по времени
2. Распределить по локациям
3. Сформировать временные фреймы
4. Упаковать по временным фреймам данные по авто и экологии


In [1]:
import pandas as pd

eco_df = pd.read_csv("ecodata2.csv")
car_df = pd.read_csv("cardata2.csv")

eco_df.sort_values(by=['time'], inplace=True)
car_df.sort_values(by=['time'], inplace=True)

In [2]:
eco_df.head(3)

Unnamed: 0,id,city_id,location_id,camera_id,co,no2,so2,o3,pm25,pm10,temp,hum,time,ver,lat,lon,created
4999,18,2,741,4553,4.0,0.68,0,0.81,40,45,18.24,21.28,12/10/22 17:41,-1,43.237604,76.934758,12/10/22 11:41
4858,15,2,2,420,8.8,0.0,0,0.87,35,38,19.2,19.29,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4857,14,2,337,1724,12.8,5.23,0,0.0,63,68,19.28,21.45,12/10/22 17:41,-1,43.24417,76.915991,12/10/22 11:41


In [21]:
time_from = pd.to_datetime('12/10/22 17:42')
time_to = pd.to_datetime('12/10/22 17:43')

eco_df_part = eco_df.loc[(pd.to_datetime(eco_df['time']) >= time_from) & (pd.to_datetime(eco_df['time']) < time_to) & (eco_df['camera_id'] == 420)]
car_df_part = car_df.loc[(pd.to_datetime(eco_df['time']) >= time_from) & (pd.to_datetime(eco_df['time']) < time_to) & (car_df['camera_id'] == 420)]

In [22]:
timeframes = {
    '12/10/22 17:42': [eco_df_part, car_df_part]
}

eco_df_part.head(10)

Unnamed: 0,id,city_id,location_id,camera_id,co,no2,so2,o3,pm25,pm10,temp,hum,time,ver,lat,lon,created
4610,15,2,2,420,9.3,0.0,0,0.93,34,36,19.02,19.22,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4601,15,2,2,420,9.3,0.0,0,0.93,34,36,19.02,19.17,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4593,15,2,2,420,9.3,0.0,0,0.93,34,36,19.02,19.13,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4584,15,2,2,420,9.3,0.0,0,0.93,34,36,19.02,19.1,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4637,15,2,2,420,9.3,0.0,0,0.93,39,40,18.99,18.99,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4628,15,2,2,420,9.3,0.0,0,0.93,39,40,19.01,19.16,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4619,15,2,2,420,9.3,0.0,0,0.93,34,36,19.02,19.21,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4540,15,2,2,420,9.3,0.0,0,0.93,34,36,19.04,19.02,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4532,15,2,2,420,9.3,0.0,0,0.93,34,36,19.03,18.98,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42
4646,15,2,2,420,9.3,0.0,0,0.93,39,40,18.98,18.85,12/10/22 17:42,-1,43.238362,76.889989,12/10/22 11:42


In [23]:
car_df_part.head(10)

Unnamed: 0,id,city_id,location_id,camera_id,time,lat,lon,lane,speed
4610,0af36951-1559-cd4f-d9b4-efa3242c9bc2,2,2,420,12/10/2022 17:42,43.238362,76.889989,2,32
4601,11996e85-02d8-1532-fd7b-16fdcbb59561,2,2,420,12/10/2022 17:42,43.238362,76.889989,2,31
4593,55009bd0-f25c-2063-304d-303e182b29f9,2,2,420,12/10/2022 17:42,43.238362,76.889989,2,22
4584,dcf8c952-7cf7-a5ac-0c87-89ccaf367a02,2,2,420,12/10/2022 17:42,43.238362,76.889989,3,40
4637,8621080c-44f6-04e1-391e-a15d938c5b03,2,2,420,12/10/2022 17:42,43.238362,76.889989,1,5
4628,cd6b530d-d959-cf20-429a-b908cb7629e8,2,2,420,12/10/2022 17:42,43.238362,76.889989,2,19
4619,b6901ea4-4fe7-1193-be66-7d0e640c2f44,2,2,420,12/10/2022 17:42,43.238362,76.889989,3,17
4540,f77bf3e5-6115-6f46-6b47-4bfab63ba777,2,2,420,12/10/2022 17:42,43.238362,76.889989,1,16
4532,d8b81f31-ba00-aa4b-7798-b4ef24d4c47f,2,2,420,12/10/2022 17:42,43.238362,76.889989,3,39
4646,647c5fc7-68f0-ce07-c73c-0a6469bf8716,2,2,420,12/10/2022 17:42,43.238362,76.889989,3,21


In [24]:
eco_df['time'].unique()

array(['12/10/22 17:41', '12/10/22 17:42', '12/10/22 17:43',
       '12/10/22 17:44', '12/10/22 17:45', '12/10/22 17:46',
       '12/10/22 17:47', '12/10/22 17:48', '12/10/22 17:49',
       '12/10/22 17:50', '12/10/22 17:51', '12/10/22 17:52',
       '12/10/22 17:53', '12/10/22 17:54', '12/10/22 17:55',
       '12/10/22 17:56', '12/10/22 17:57', '12/10/22 17:58',
       '12/10/22 17:59', '12/10/22 18:00'], dtype=object)

In [8]:
import pandas as pd

# read ecodata2.csv and cardata2.csv into Pandas dataframes
eco_df = pd.read_csv("ecodata2.csv")
car_df = pd.read_csv("cardata2.csv")

# sort dataframes by time
eco_df.sort_values(by=['time'], inplace=True)
car_df.sort_values(by=['time'], inplace=True)

# set the start and end for the timeframes
time_from = pd.to_datetime('12/10/22 17:41')
time_to = pd.to_datetime('12/10/22 18:00')

#create function to get the data for 420
def createTimeframe(time_from, time_to, camera_id):
    eco_df_part = eco_df.loc[(pd.to_datetime(eco_df['time']) >= time_from) & (pd.to_datetime(eco_df['time']) < time_to) 
                             & (eco_df['camera_id'] == camera_id)]
    car_df_part = car_df.loc[(pd.to_datetime(car_df['time']) >= time_from) & (pd.to_datetime(car_df['time']) < time_to) 
                             & (car_df['camera_id'] == camera_id)]
    return [eco_df_part, car_df_part] #return both dataframes as a list


timeframes420 = {} #create empty dictionary to hold timeframes for 420

#get a list of all unique minutes in ecodata df
minutes = list(eco_df['time'].unique())

# loop through each minute in the list of minutes
for minute in minutes:
    time_from = pd.to_datetime(minute) #set the start time for this timeframe to the current minute
    time_to = time_from + pd.Timedelta(minutes=1) #set the end time for this timeframe to 1 minute after the current minute
    timeframes420[minute] = createTimeframe(time_from, time_to, 420) #call function with the current time range and camera 420
    #add the resulting data to timeframes420 dictionary under the key of the current minute
    
df_420 = car_df.loc[car_df['camera_id'] == 420]
df_420.head(10)


Unnamed: 0,id,city_id,location_id,camera_id,time,lat,lon,lane,speed
4858,1162cbfa-b8a3-c2e9-de60-94f9168192dc,2,2,420,12/10/2022 17:41,43.238362,76.889989,3,10
4849,1ecfb7c6-c5a9-f439-f506-36dfdf97f3c1,2,2,420,12/10/2022 17:41,43.238362,76.889989,2,6
4840,3901d7e0-576b-a587-8815-693deb731b32,2,2,420,12/10/2022 17:41,43.238362,76.889989,3,4
4883,f1652ccf-e045-bdbc-b7be-b20325afa6d2,2,2,420,12/10/2022 17:41,43.238362,76.889989,1,37
4868,7336fbd7-6f12-237f-660a-5de61420999e,2,2,420,12/10/2022 17:41,43.238362,76.889989,1,32
4832,e0962049-202b-c4ef-07b9-3ecfdb0ea21e,2,2,420,12/10/2022 17:41,43.238362,76.889989,1,16
4797,fb487b7b-1086-7dc4-92bd-765906cc150b,2,2,420,12/10/2022 17:41,43.238362,76.889989,3,32
4789,193fd082-5ef6-b5a6-3dd4-ba2a4bb58c47,2,2,420,12/10/2022 17:41,43.238362,76.889989,2,15
4806,f868bb55-a04e-d680-64da-8da2f494c851,2,2,420,12/10/2022 17:41,43.238362,76.889989,1,33
4823,0bd424bb-66b0-0e80-ab3d-a4f4ebaa54dd,2,2,420,12/10/2022 17:41,43.238362,76.889989,2,6


In [26]:
df_420_eco = eco_df.loc[eco_df['camera_id'] == 420]
df_420_eco.head(10)

Unnamed: 0,id,city_id,location_id,camera_id,co,no2,so2,o3,pm25,pm10,temp,hum,time,ver,lat,lon,created
4858,15,2,2,420,8.8,0.0,0,0.87,35,38,19.2,19.29,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4849,15,2,2,420,8.8,0.0,0,0.88,35,38,19.19,19.22,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4840,15,2,2,420,8.89,0.0,0,0.88,35,38,19.19,19.18,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4883,15,2,2,420,8.69,0.0,0,0.86,35,38,19.18,19.37,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4868,15,2,2,420,8.69,0.0,0,0.87,35,38,19.19,19.33,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4832,15,2,2,420,8.89,0.0,0,0.89,35,38,19.19,19.12,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4797,15,2,2,420,9.0,0.0,0,0.91,35,38,19.13,18.96,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4789,15,2,2,420,9.0,0.0,0,0.91,39,40,19.13,18.92,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4806,15,2,2,420,9.0,0.0,0,0.9,35,38,19.15,18.97,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41
4823,15,2,2,420,8.89,0.0,0,0.89,35,38,19.18,19.04,12/10/22 17:41,-1,43.238362,76.889989,12/10/22 11:41


In [9]:
import pandas as pd

# read ecodata2.csv and cardata2.csv into Pandas dataframes
eco_df = pd.read_csv("ecodata2.csv")
car_df = pd.read_csv("cardata2.csv")

# convert time columns to datetime
eco_df['time'] = pd.to_datetime(eco_df['time'])
car_df['time'] = pd.to_datetime(car_df['time'])

# sort dataframes by time
eco_df.sort_values(by=['time'], inplace=True)
car_df.sort_values(by=['time'], inplace=True)

# define time range
time_from = pd.to_datetime('12/10/22 17:41')
time_to = pd.to_datetime('12/10/22 18:00')

# define a function to create a timeframe for a given camera ID
def createTimeframe(time_from, time_to, camera_id):
    # get relevant rows 
    eco_df_part = eco_df.loc[(eco_df['time'] >= time_from) & (eco_df['time'] < time_to) & (eco_df['camera_id'] == camera_id)]
    car_df_part = car_df.loc[(car_df['time'] >= time_from) & (car_df['time'] < time_to) & (car_df['camera_id'] == camera_id)]
    # merge dataframes on camera_id and time columns
    df = pd.merge(eco_df_part, car_df_part, on=['camera_id', 'time'])
    # select columns
    cols = ['id_y', 'city_id_y', 'location_id_y', 'co', 'no2', 'so2', 'o3', 'pm25', 'pm10', 'temp', 'hum']
    df = df[cols]
    # rename the columns
    df = df.rename(columns={'id_y': 'id', 'city_id_y': 'city_id', 'location_id_y': 'location_id'})
    return df

# create dictionary to store dataframes
timeframes = {}

# get list of unique camera IDs
camera_ids = car_df['camera_id'].unique()
print(camera_ids)

# loop through each camera ID
for camera_id in camera_ids:
    camera_dict = {} #dictionary to store the dataframes for camera
    minutes = list(car_df.loc[car_df['camera_id'] == camera_id, 'time'].dt.floor('Min').unique()) #list of unique minutes for camera
    # loop through each minute
    for minute in minutes:
        time_from = minute #start and end times for this timeframe
        time_to = time_from + pd.Timedelta(minutes=1)
        df = createTimeframe(time_from, time_to, camera_id) #dataframe for this timeframe and camera
        camera_dict[time_from] = df #add this dataframe to the camera dictionary
        
    # add this camera's dictionary to the overall dictionary
    timeframes[camera_id] = camera_dict
    
    #timeframe = {from, to , df eco, df car}

# function to print dataframe for a given camera ID and minute
def printDataFrame(camera_id, minute):
    time_from = minute # start and end times for this timeframe
    time_to = time_from + pd.Timedelta(minutes=1)
    df = timeframes[camera_id][time_from] #get dataframe for this timeframe and camera
    print(df)



[4553  420 1724 4571 5779  450 4429 1378 1695 3022]


In [15]:
import pandas as pd

# read ecodata2.csv and cardata2.csv into Pandas dataframes
eco_df = pd.read_csv("ecodata2.csv")
car_df = pd.read_csv("cardata2.csv")

# convert time columns to datetime
eco_df['time'] = pd.to_datetime(eco_df['time'])
car_df['time'] = pd.to_datetime(car_df['time'])

# sort dataframes by time
eco_df.sort_values(by=['time'], inplace=True)
car_df.sort_values(by=['time'], inplace=True)

# define time range
time_from = pd.to_datetime('12/10/22 17:41')
time_to = pd.to_datetime('12/10/22 18:00')
times = car_df['time'].unique()
print("All times:", times)


# get list of unique camera IDs
camera_ids = car_df['camera_id'].unique()
print("All cameras:", camera_ids)

# create dictionary to store dataframes
timeframes = {}

# loop over unique camera IDs
for camera_id in camera_ids:
    # get relevant rows 
    timeframes[camera_id] = []
    for timestart in times:
        eco_df_part = eco_df.loc[(eco_df['time'] == timestart) & (eco_df['camera_id'] == camera_id)]
        car_df_part = car_df.loc[(car_df['time'] == timestart) & (car_df['camera_id'] == camera_id)]
        
        timeframes[camera_id].append([eco_df_part, car_df_part, camera_id])



All times: ['2022-12-10T17:41:00.000000000' '2022-12-10T17:42:00.000000000'
 '2022-12-10T17:43:00.000000000' '2022-12-10T17:44:00.000000000'
 '2022-12-10T17:45:00.000000000' '2022-12-10T17:46:00.000000000'
 '2022-12-10T17:47:00.000000000' '2022-12-10T17:48:00.000000000'
 '2022-12-10T17:49:00.000000000' '2022-12-10T17:50:00.000000000'
 '2022-12-10T17:51:00.000000000' '2022-12-10T17:52:00.000000000'
 '2022-12-10T17:53:00.000000000' '2022-12-10T17:54:00.000000000'
 '2022-12-10T17:55:00.000000000' '2022-12-10T17:56:00.000000000'
 '2022-12-10T17:57:00.000000000' '2022-12-10T17:58:00.000000000'
 '2022-12-10T17:59:00.000000000' '2022-12-10T18:00:00.000000000']
All cameras: [4553  420 1724 4571 5779  450 4429 1378 1695 3022]


## 2. Начисление баллов

1. Сформировать хранилище для баллов
2. Подготовить функцию для начисления баллов
3. Прогнать входные данные через функцию начисления баллов по каждой локации фрейм за фреймом

In [16]:
from functools import reduce
import math

#define a dictionary to store expected pm25 values for each camera
pm_expect = {}

#loop over unique camera IDs
for camera_id in car_df['camera_id'].unique():
    #get pm25 values for the current camera
    pm = eco_df.loc[eco_df['camera_id'] == camera_id, 'pm25']
    #calculate expected pm25 value for the current camera
    pm_avg = pm.mean()
    pm_std = pm.std()
    pm_expect[camera_id] = pm_avg + pm_std
    
pm_expect

{4553: 52.71492775046045,
 420: 64.9315612463409,
 1724: 74.458675232306,
 4571: 74.93811224123839,
 5779: 64.79800805813147,
 450: 67.97424439528069,
 4429: 75.27224327387746,
 1378: 34.7652209765218,
 1695: 46.163876225999786,
 3022: 166.67982542224166}

In [22]:
#define function to calculate score 
def calculate_score(eco_part_df, car_part_df, pm_exp):
    pm = eco_part_df['pm25']
    cars = car_part_df['id'].unique()
    pm_max = eco_part_df['pm25'].max()
    
    for car_id in cars:
        #car_pm = car_part_df.loc[car_part_df['id'] == car_id, 'pm25'].values[0]
        if pm > pm_max:
            score[car_id] = score.get(car_id, 0) + 1
        else:
            score[car_id] = score.get(car_id, 0) - 1
                        
#initialize score dictionary
score = {'default': 0}

#loop over timeframes
for (camera_id, by_camera) in timeframes.items():
    for by_time in by_camera:
        print(by_time)
        #pm_max = pm_expect[int(camera_id)]
        calculate_score(by_time[0], by_time[1], pm_expect[int(camera_id)])
        
score    

[      id  city_id  location_id  camera_id   co   no2  so2    o3  pm25  pm10  \
4999  18        2          741       4553  4.0  0.68    0  0.81    40    45   
4850  18        2          741       4553  4.0  0.73    0  0.80    32    36   
4841  18        2          741       4553  4.0  0.73    0  0.80    32    36   
4860  18        2          741       4553  4.0  0.72    0  0.80    32    36   
4885  18        2          741       4553  4.0  0.70    0  0.80    32    36   
4876  18        2          741       4553  4.0  0.70    0  0.09    32    36   
4798  18        2          741       4553  3.9  0.76    0  0.80    32    36   
4790  18        2          741       4553  3.9  0.76    0  0.80    36    40   
4807  18        2          741       4553  4.0  0.75    0  0.80    32    36   
4824  18        2          741       4553  4.0  0.74    0  0.80    32    36   
4833  18        2          741       4553  4.0  0.73    0  0.80    32    36   
4815  18        2          741       4553  4.0  0.7

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## 3. Сохранение результата

Сохранить итоговое значение score в файл формата .csv

In [18]:
# saving the dataframe
df.to_csv('file1.csv')