In [1]:
import pandas as pd

# Load the data
train_data = pd.read_csv('loop_sensor_train.csv')
test_data = pd.read_csv('loop_sensor_test_x.csv')
metadata = pd.read_csv('geo_reference.csv', delimiter=';')

print(train_data.shape, test_data.shape, metadata.shape)

(23132425, 4) (439298, 4) (3739, 10)


In [17]:
baseline = pd.read_csv('loop_sensor_test_baseline.csv')
print(baseline.shape)
baseline.head()

(439298, 2)


Unnamed: 0,id,estimate_q
0,1,1184.0
1,2,273.0
2,3,1495.0
3,4,1612.0
4,5,1948.0


In [2]:
train_data.head()

Unnamed: 0,iu_ac,t_1h,etat_barre,q
0,1655,2022-01-12 12:00:00,3,957.0
1,1655,2022-01-12 13:00:00,3,957.0
2,1655,2022-01-12 14:00:00,3,1014.0
3,1655,2022-01-12 15:00:00,3,939.0
4,1655,2022-01-12 16:00:00,3,927.0


In [18]:
print(test_data.shape)
test_data.head()

(439298, 4)


Unnamed: 0,id,iu_ac,t_1h,etat_barre
0,1,5,2023-01-02 00:00:00,3
1,2,5,2023-01-03 05:00:00,3
2,3,5,2023-01-04 10:00:00,3
3,4,5,2023-01-05 15:00:00,3
4,5,5,2023-01-06 20:00:00,3


In [12]:
test_data = test_data[test_data['etat_barre'] != '2']
test_data.shape

(439298, 4)

In [19]:
print(metadata.shape)
metadata.head()

(3739, 10)


Unnamed: 0,iu_ac,date_debut,date_fin,libelle,iu_nd_aval,libelle_nd_aval,iu_nd_amont,libelle_nd_amont,geo_point_2d,geo_shape
0,1223,1996-10-03T02:00:00+00:00,2023-01-01T01:00:00+00:00,Av_Pte_de_Sevres,618,Pte_Sevres-Armee_Air,619,Av_Pte_de_Sevres_PI,"48.83458548908511, 2.2775887111084794","{""coordinates"": [[2.2773736802582274, 48.83415..."
1,1139,1996-10-03T02:00:00+00:00,2023-01-01T01:00:00+00:00,Rond_Pt_Pte_Plaine,627,Pte_Plaine-Av_Pte_Plaine,628,Pte_Plaine-Gal_Guillaumat,"48.8278127813947, 2.292632476946377","{""coordinates"": [[2.2928564154951085, 48.82776..."
2,5266,1996-10-28T01:00:00+00:00,2023-01-01T01:00:00+00:00,PE_Poterne,478,SE_Italie,457,AE_Gentilly,"48.817370106973385, 2.350550919738548","{""coordinates"": [[2.3464715081625283, 48.81630..."
3,5450,1996-11-06T01:00:00+00:00,2023-01-01T01:00:00+00:00,PI_St_Cloud,2488,AI_St_Cloud,2723,AI_Georges_Lafont,"48.839259648175734, 2.254289772364343","{""coordinates"": [[2.2546614296019127, 48.83818..."
4,5417,1996-11-05T01:00:00+00:00,2023-01-01T01:00:00+00:00,PI_Courcelles,2270,SI_Asnieres,2736,AI_Champerret,"48.88926481478913, 2.295812006937022","{""coordinates"": [[2.292386753449537, 48.888225..."


In [None]:
# Drop rows where 'q' is NULL
train_data = train_data[train_data['q'].notnull()]
print(train_data.shape)
# Optionally, check for missing values in other columns
train_data = train_data.dropna(subset=['t_1h', 'etat_barre'])  # Assuming these are also critical
print(train_data.shape)
# Filtering data to include only those IDs present in the metadata
train_data = train_data[train_data['iu_ac'].isin(metadata['iu_ac'])]
print(train_data.shape)
 

In [6]:
# Convert 't_1h' to datetime format if not already
train_data['t_1h'] = pd.to_datetime(train_data['t_1h'])
# Extract time features
train_data['hour'] = train_data['t_1h'].dt.hour
train_data['weekday'] = train_data['t_1h'].dt.weekday
train_data.head()

Unnamed: 0,iu_ac,t_1h,etat_barre,q,hour,weekday
0,1655,2022-01-12 12:00:00,3,957.0,12,2
1,1655,2022-01-12 13:00:00,3,957.0,13,2
2,1655,2022-01-12 14:00:00,3,1014.0,14,2
3,1655,2022-01-12 15:00:00,3,939.0,15,2
4,1655,2022-01-12 16:00:00,3,927.0,16,2


In [13]:
# One-hot encoding
train_data = pd.get_dummies(train_data, columns=['etat_barre'])
train_data.head()
# # Alternatively, for label encoding
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# train_data['etat_barre'] = le.fit_transform(train_data['etat_barre'])


Unnamed: 0,iu_ac,t_1h,q,hour,weekday,etat_barre_1,etat_barre_2,etat_barre_3
0,1655,2022-01-12 12:00:00,957.0,12,2,0,0,1
1,1655,2022-01-12 13:00:00,957.0,13,2,0,0,1
2,1655,2022-01-12 14:00:00,1014.0,14,2,0,0,1
3,1655,2022-01-12 15:00:00,939.0,15,2,0,0,1
4,1655,2022-01-12 16:00:00,927.0,16,2,0,0,1


In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Assuming 'q' is the only continuous variable needing scaling
train_data['q_scaled'] = scaler.fit_transform(train_data[['q']])


In [None]:
# Review the dataset
train_data.head()
