# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Data loading

In [2]:
# Load the training features and preview the first five rows.
X_train = pd.read_csv('x_train_sncf.csv')
X_train.head(5)

Unnamed: 0,date,station,job,ferie,vacances
0,2015-01-01,1J7,1,1,1
1,2015-01-01,O2O,1,1,1
2,2015-01-01,8QR,1,1,1
3,2015-01-01,UMC,1,1,1
4,2015-01-01,FK3,1,1,1


In [3]:
# Load the training targets and preview the first five rows.
y_train = pd.read_csv("y_train_sncf.csv")
y_train.head(5)

Unnamed: 0,index,y
0,2015-01-01_1J7,7
1,2015-01-01_O2O,0
2,2015-01-01_8QR,9
3,2015-01-01_UMC,9
4,2015-01-01_FK3,28


# Data exploration

In [4]:
# Summarize the columns of X_train: dtypes, non-null counts and memory usage.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1229863 entries, 0 to 1229862
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   date      1229863 non-null  object
 1   station   1229863 non-null  object
 2   job       1229863 non-null  int64 
 3   ferie     1229863 non-null  int64 
 4   vacances  1229863 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 46.9+ MB


In [5]:
# Count the missing values in each column of X_train.
X_train.isna().sum()

date        0
station     0
job         0
ferie       0
vacances    0
dtype: int64

In [6]:
# (rows, columns) of X_train.
X_train.shape

(1229863, 5)

In [7]:
# (rows, columns) of y_train; compare the row count with X_train.shape.
y_train.shape

(1229863, 2)

In [8]:
# Count distinct stations. The column is selected by label rather than by
# position so the count stays correct if the column order ever changes
# (for example, the test file carries an extra leading column).
num_stations = X_train['station'].nunique()
print("Number of different stations:", num_stations)

Number of different stations: 439


In [9]:
# Count distinct dates. The column is selected by label rather than by
# position so the count stays correct if the column order ever changes.
num_days = X_train['date'].nunique()
print("Number of different days:", num_days)

Number of different days: 2922


In [10]:
# Load the test features and preview the first five rows.
X_test = pd.read_csv("x_test_sncf.csv")
X_test.head(5)

Unnamed: 0,index,date,station,job,ferie,vacances
0,2023-01-01_1J7,2023-01-01,1J7,0,1,1
1,2023-01-01_O2O,2023-01-01,O2O,0,1,1
2,2023-01-01_8QR,2023-01-01,8QR,0,1,1
3,2023-01-01_L58,2023-01-01,L58,0,1,1
4,2023-01-01_UMC,2023-01-01,UMC,0,1,1


In [12]:
# Give every station code an integer id, numbered in order of first
# appearance in X_train, and store it as a new `station_id` column.
unique_stations = X_train['station'].unique()
station_mapping = dict(zip(unique_stations, range(len(unique_stations))))
X_train['station_id'] = X_train['station'].map(station_mapping)
X_train

Unnamed: 0,date,station,job,ferie,vacances,station_id
0,2015-01-01,1J7,1,1,1,0
1,2015-01-01,O2O,1,1,1,1
2,2015-01-01,8QR,1,1,1,2
3,2015-01-01,UMC,1,1,1,3
4,2015-01-01,FK3,1,1,1,4
...,...,...,...,...,...,...
1229858,2022-12-31,V2P,0,0,1,431
1229859,2022-12-31,N9K,0,0,1,432
1229860,2022-12-31,P6E,0,0,1,434
1229861,2022-12-31,BDC,0,0,1,435


In [15]:
# Inspect every row recorded for station '1J7'.
# NOTE(review): the saved output of this cell already shows `days_since`,
# which is only created by the cell that follows, so this cell was executed
# out of order. On a fresh top-to-bottom run the preview will not contain
# that column; consider moving this cell below the `days_since` cell.
is_station_1j7 = X_train['station'].eq('1J7')
selected_rows = X_train[is_station_1j7]
selected_rows


Unnamed: 0,date,station,job,ferie,vacances,station_id,days_since
0,2015-01-01,1J7,1,1,1,0,0
423,2016-01-01,1J7,1,1,1,0,365
848,2017-01-01,1J7,0,1,1,0,731
1272,2018-01-01,1J7,1,1,1,0,1096
1698,2019-01-01,1J7,1,1,1,0,1461
...,...,...,...,...,...,...,...
1227369,2017-12-31,1J7,0,0,1,0,1095
1227796,2018-12-31,1J7,1,0,1,0,1460
1228592,2020-12-31,1J7,1,0,1,0,2191
1229006,2021-12-31,1J7,1,0,1,0,2556


In [14]:
# Parse the date strings, then express each date as a whole number of days
# elapsed since 2015-01-01 (the date of the first rows in X_train).
reference_date = pd.Timestamp('2015-01-01')
X_train['date'] = pd.to_datetime(X_train['date'])
X_train['days_since'] = X_train['date'].sub(reference_date).dt.days
X_train



Unnamed: 0,date,station,job,ferie,vacances,station_id,days_since
0,2015-01-01,1J7,1,1,1,0,0
1,2015-01-01,O2O,1,1,1,1,0
2,2015-01-01,8QR,1,1,1,2,0
3,2015-01-01,UMC,1,1,1,3,0
4,2015-01-01,FK3,1,1,1,4,0
...,...,...,...,...,...,...,...
1229858,2022-12-31,V2P,0,0,1,431,2921
1229859,2022-12-31,N9K,0,0,1,432,2921
1229860,2022-12-31,P6E,0,0,1,434,2921
1229861,2022-12-31,BDC,0,0,1,435,2921


In [None]:
# Build x_test

# Re-read the raw test features and form one "<date>_<station>" key per row,
# then copy the keys into a NumPy array.
x_test_submit = pd.read_csv("x_test_sncf.csv")
date_text = x_test_submit['date'].astype(str)
data_station = date_text + '_' + x_test_submit['station']
data_station_array = data_station.to_numpy(copy=True)