In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost 

from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_percentage_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

from time import time
from google.colab import drive
from time import perf_counter
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Training observations with the NaN rows deleted (first CSV column is used as the index).
df_clean =  pd.read_csv('/gdrive/MyDrive/X_station_train_clean.csv', index_col=0)

# Training observations with the NaN values imputed (first CSV column is used as the index).
df_imputation = pd.read_csv('/gdrive/MyDrive/X_station_Train_imputation.csv', index_col=0)

# Original training observations, still containing NaN.
df_full_dataset = pd.read_csv('/gdrive/MyDrive/X_station_train.csv')

# Paths only; nothing is read from them in this cell.
path_station_coordinate='/gdrive/MyDrive/stations_coordinates.csv'
path_df_imputation="/gdrive/MyDrive/X_station_Train_imputation_Richard.csv"

  mask |= (ar1 == a)


In [None]:
def preprocessing_X_station (X_station_path, stations_coordinates_path, L_labels):
    """Load a station CSV, attach station coordinates and return a tidy frame.

    The kind of file is inferred from its path: a path containing
    'X_station_train' has its 'date' column split into year/month/day/hour,
    a path containing 'X_station_test' has its 'Id' column
    ("<station>_<day>_<hour>") split into number_sta/day_id/hour.

    Parameters
    ----------
    X_station_path : str
        Path of the station observations CSV.
    stations_coordinates_path : str
        Path of the CSV holding one row of coordinates per 'number_sta'.
    L_labels : list of str
        Columns (after renaming) to keep, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        Renamed, column-ordered frame sorted by station and time, with a
        fresh 0..n-1 index.
    """
    is_train_file = 'X_station_train' in X_station_path
    is_test_file = 'X_station_test' in X_station_path

    # Read both inputs
    observations = pd.read_csv(X_station_path)
    coordinates = pd.read_csv(stations_coordinates_path)

    if is_train_file:
        # Dates are sliced by character position: year, month, day, hour.
        date_parts = (
            ('year', slice(None, 4)),
            ('month', slice(5, 7)),
            ('day', slice(8, 10)),
            ('hour', slice(11, 13)),
        )
        for column, part in date_parts:
            observations[column] = (
                observations['date'].map(lambda text, part=part: text[part]).astype('int32')
            )
        observations = observations.drop(columns="date")

    if is_test_file:
        # Ids are "<station>_<day>_<hour>".
        id_fields = ('number_sta', 'day_id', 'hour')
        for position, column in enumerate(id_fields):
            observations[column] = (
                observations['Id']
                .map(lambda text, position=position: text.split('_')[position])
                .astype('int32')
            )

    observations = observations.drop(columns="Id")

    # Attach the coordinates of each station
    observations = pd.merge(observations, coordinates, how='left', on='number_sta')

    # Human-readable column names
    readable_names = {
        'number_sta': "station_id",
        'ff': "wind_speed",
        't': "temperature",
        'td': "dew_point",
        'hu': "humidity",
        'dd': "wind_direction",
        'precip': "precipitations",
        'lat': "latitude",
        'lon': "longitude",
        'height_sta': "altitude",
    }
    observations = observations.rename(columns=readable_names)

    # Keep only the requested columns, in the requested order
    observations = observations[L_labels]

    # Chronological order within each station
    if is_train_file:
        observations = observations.sort_values(by=['station_id','year', 'month', 'day', 'hour'])
    if is_test_file:
        observations = observations.sort_values(by=['station_id', 'day_id', 'hour'])

    return observations.reset_index(drop=True)
def normalizing_data (X, L_labels, L_labels_cos_sin, min_train, max_train):
    """Min-max scale the selected columns and encode cyclic features as cos/sin.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns in ``L_labels`` (which must
        include 'hour' and 'wind_direction').
    L_labels : list of str
        Columns to extract and scale.
    L_labels_cos_sin : list of str
        Columns of the returned frame, in order; 'hour' and 'wind_direction'
        are no longer available at that point, their *_cos / *_sin
        counterparts are.
    min_train, max_train : pandas.Series
        Per-column minimum and maximum used for the scaling.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``L_labels_cos_sin``; ``X`` is not modified.
    """
    # Extract the columns and min-max scale them
    scaled = (X[L_labels] - min_train) / (max_train - min_train)

    # The scaled values span [0, 1], and cos(0) == cos(2*pi) would make the
    # smallest and the largest value coincide on the circle; the 23/24 and
    # 359/360 factors keep the largest scaled value just short of a full turn.
    hour_angle = 2*np.pi * scaled['hour'] * 23/24
    wind_angle = 2*np.pi * scaled['wind_direction'] * 359/360

    encoded = scaled.drop(columns=['hour', 'wind_direction']).assign(
        hour_cos=np.cos(hour_angle),
        hour_sin=np.sin(hour_angle),
        wind_direction_cos=np.cos(wind_angle),
        wind_direction_sin=np.sin(wind_angle),
    )

    # Final column order
    return encoded[L_labels_cos_sin]

def m_mape(y_true,y_predict):
    """Return the mean absolute percentage error computed on values shifted by +1.

    Both inputs are converted to arrays and incremented by one before the
    relative error is taken, so a true value of 0 does not divide by zero.
    The result is expressed in percent.
    """
    sample_count = len(y_true)
    actual = np.array(y_true) + 1
    forecast = np.array(y_predict) + 1

    relative_errors = np.abs((forecast - actual) / actual)
    return (100 / sample_count) * np.sum(relative_errors)

In [None]:
def my_test_train_split(dataset, train_year=2016, test_year=2017):
    """Build next-observation precipitation targets and split the rows by year.

    The target of a row is the 'precipitations' value of the following row of
    the same station (``groupby('station_id').shift(-1)``). Rows without a
    target (the last row of each station, or a missing next value) are
    removed before splitting.

    The function only depends on ``dataset``; it no longer reads the
    module-level ``df_full_dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the feature columns listed below, including 'station_id',
        'year' and 'precipitations'.
    train_year, test_year : int, optional
        Values of the 'year' column selecting the training and the test rows.
        Defaults reproduce the original 2016 / 2017 split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; each y shares the index of
        its X.
    """
    feature_columns = [
        "station_id", "latitude", "longitude", "altitude", "timestamp",
        "year", "month", "day", "hour",
        "wind_direction", "wind_speed", "temperature", "humidity", "dew_point",
        "precipitations",
    ]
    features = dataset[feature_columns]
    targets = features.groupby(['station_id'])['precipitations'].shift(-1)

    # Boolean masks select rows by position, so this stays correct even when
    # the index contains repeated labels.
    has_target = targets.notna()
    features = features[has_target]
    targets = targets[has_target]

    print("Xtrain nan :", features.isnull().sum().sum())
    print("y train nan", targets.isnull().sum().sum())

    in_train = features['year'] == train_year
    in_test = features['year'] == test_year

    X_train, y_train = features[in_train], targets[in_train]
    X_test, y_test = features[in_test], targets[in_test]
    return X_train, X_test, y_train, y_test

def data_month(X,y):
    """Split features and targets into twelve per-month subsets.

    Parameters
    ----------
    X : pandas.DataFrame
        Features with a 'month' column holding values 1..12.
    y : pandas.Series
        Targets, addressable by the index labels of ``X``.

    Returns
    -------
    tuple of length 26
        ``(X, y, X_jan, y_jan, X_feb, y_feb, ..., X_dec, y_dec)``.
        Every returned X (including August, which previously kept it) has
        the 'month' column removed; ``y`` is returned unchanged.
    """
    monthly_parts = []
    for month in range(1, 13):
        # Select by label so X and y subsets always refer to the same rows.
        month_labels = X.index[(X["month"] == month).to_numpy()]
        monthly_parts.append(X.loc[month_labels].drop(columns="month"))
        monthly_parts.append(y.loc[month_labels])

    # `columns=` instead of a positional axis: the positional form is
    # rejected by pandas 2.x.
    return (X.drop(columns="month"), y, *monthly_parts)


def my_normalize_permonth(dataset,X_train,X_test,y_train,y_test):
    """Scale one month's train/test features and targets with that month's training range.

    Features go through ``normalizing_data`` using the min/max of ``X_train``.
    Targets are recomputed from ``dataset`` (next 'precipitations' value of the
    same station), scaled with the training min/max of 'precipitations' and
    looked up by the index labels of the scaled feature frames. The incoming
    ``y_train`` / ``y_test`` values are not read; they are replaced.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` -- note the order.
    """
    next_precipitations = dataset.groupby(['station_id'])['precipitations'].shift(-1)

    L_labels = ['latitude', 'longitude', 'altitude', 'hour', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
    L_labels_cos_sin = ['latitude', 'longitude', 'altitude', 'hour_cos', 'hour_sin', 'wind_direction_cos', 'wind_direction_sin', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']

    # Scaling range comes from the training rows only
    train_columns = X_train[L_labels]
    lower_bound = train_columns.min()
    upper_bound = train_columns.max()

    X_train = normalizing_data(X_train, L_labels, L_labels_cos_sin, lower_bound, upper_bound)
    X_test = normalizing_data(X_test, L_labels, L_labels_cos_sin, lower_bound, upper_bound)

    # Targets share the scale of the 'precipitations' feature
    scaled_targets = (next_precipitations - lower_bound['precipitations']) / (upper_bound['precipitations'] - lower_bound['precipitations'])
    y_train = scaled_targets.loc[X_train.index]
    y_test = scaled_targets.loc[X_test.index]

    print("train shape",X_train.shape,y_train.shape)
    print("test shape",X_test.shape,y_test.shape)

    return X_train,y_train,X_test,y_test

In [None]:
# Year-based split of the cleaned dataset (prints the remaining NaN counts).
X_train, X_test, y_train, y_test=my_test_train_split(df_clean)

# Per-month subsets of the training and test rows. After these two calls X_train / X_test
# no longer carry the 'month' column, because data_month returns them with it dropped.
X_train,y_train,X_jan_train,y_jan_train,X_feb_train,y_feb_train,X_march_train,y_march_train,X_apr_train,y_apr_train,X_may_train,y_may_train,X_june_train,y_june_train,X_july_train,y_july_train,X_aug_train,y_aug_train,X_sept_train,y_sept_train,X_oct_train,y_oct_train,X_nov_train,y_nov_train,X_dec_train,y_dec_train = data_month(X_train,y_train)
X_test,y_test,X_jan_test,y_jan_test,X_feb_test,y_feb_test,X_march_test,y_march_test,X_apr_test,y_apr_test,X_may_test,y_may_test,X_june_test,y_june_test,X_july_test,y_july_test,X_aug_test,y_aug_test,X_sept_test,y_sept_test,X_oct_test,y_oct_test,X_nov_test,y_nov_test,X_dec_test,y_dec_test = data_month(X_test,y_test)


Xtrain nan : 0
y train nan 0


In [None]:

# Scale every month separately: each call uses the min/max of that month's training rows.
# The raw per-month frames are overwritten by their scaled versions, so this cell is not
# idempotent -- re-run the data_month cell before running it again.
# my_normalize_permonth returns (X_train, y_train, X_test, y_test), hence the unpacking order.
X_jan_train,y_jan_train,X_jan_test,y_jan_test=my_normalize_permonth(df_clean,X_jan_train,X_jan_test,y_jan_train,y_jan_test)
X_feb_train,y_feb_train,X_feb_test,y_feb_test=my_normalize_permonth(df_clean,X_feb_train,X_feb_test,y_feb_train,y_feb_test)
X_march_train,y_march_train,X_march_test,y_march_test=my_normalize_permonth(df_clean,X_march_train,X_march_test,y_march_train,y_march_test)
X_apr_train,y_apr_train,X_apr_test,y_apr_test=my_normalize_permonth(df_clean,X_apr_train,X_apr_test,y_apr_train,y_apr_test)
X_may_train,y_may_train,X_may_test,y_may_test=my_normalize_permonth(df_clean,X_may_train,X_may_test,y_may_train,y_may_test)
X_june_train,y_june_train,X_june_test,y_june_test=my_normalize_permonth(df_clean,X_june_train,X_june_test,y_june_train,y_june_test)
X_july_train,y_july_train,X_july_test,y_july_test=my_normalize_permonth(df_clean,X_july_train,X_july_test,y_july_train,y_july_test)
X_aug_train,y_aug_train,X_aug_test,y_aug_test=my_normalize_permonth(df_clean,X_aug_train,X_aug_test,y_aug_train,y_aug_test)
X_sept_train,y_sept_train,X_sept_test,y_sept_test=my_normalize_permonth(df_clean,X_sept_train,X_sept_test,y_sept_train,y_sept_test)
X_oct_train,y_oct_train,X_oct_test,y_oct_test=my_normalize_permonth(df_clean,X_oct_train,X_oct_test,y_oct_train,y_oct_test)
X_nov_train,y_nov_train,X_nov_test,y_nov_test=my_normalize_permonth(df_clean,X_nov_train,X_nov_test,y_nov_train,y_nov_test)
X_dec_train,y_dec_train,X_dec_test,y_dec_test=my_normalize_permonth(df_clean,X_dec_train,X_dec_test,y_dec_train,y_dec_test)

train shape (91865, 12) (91865,)
test shape (97951, 12) (97951,)
train shape (85674, 12) (85674,)
test shape (89458, 12) (89458,)
train shape (93240, 12) (93240,)
test shape (98865, 12) (98865,)
train shape (89675, 12) (89675,)
test shape (96106, 12) (96106,)
train shape (89102, 12) (89102,)
test shape (98604, 12) (98604,)
train shape (89798, 12) (89798,)
test shape (93571, 12) (93571,)
train shape (93656, 12) (93656,)
test shape (97777, 12) (97777,)
train shape (95081, 12) (95081,)
test shape (99275, 12) (99275,)
train shape (92020, 12) (92020,)
test shape (94920, 12) (94920,)
train shape (96296, 12) (96296,)
test shape (98829, 12) (98829,)
train shape (94435, 12) (94435,)
test shape (95186, 12) (95186,)
train shape (97377, 12) (97377,)
test shape (95204, 12) (95204,)


In [None]:
# Hyperparameters shared by all twelve monthly regressors. They used to be
# repeated in every constructor call; keeping them in one place guarantees the
# models cannot drift apart when a value is tuned.
XGB_MONTHLY_PARAMS = dict(
    n_estimators=500,
    max_depth=18,
    gamma=3.430739184133814,
    min_child_weight=8,
    reg_alpha=180,
    reg_lambda=0.7436396623675846,
    random_state=123,
)

# Training data of each month, in calendar order.
monthly_training_sets = {
    "jan": (X_jan_train, y_jan_train),
    "feb": (X_feb_train, y_feb_train),
    "march": (X_march_train, y_march_train),
    "apr": (X_apr_train, y_apr_train),
    "may": (X_may_train, y_may_train),
    "june": (X_june_train, y_june_train),
    "july": (X_july_train, y_july_train),
    "aug": (X_aug_train, y_aug_train),
    "sept": (X_sept_train, y_sept_train),
    "oct": (X_oct_train, y_oct_train),
    "nov": (X_nov_train, y_nov_train),
    "dec": (X_dec_train, y_dec_train),
}

monthly_models = {}
monthly_fit_seconds = {}  # wall-clock fit duration per month, previously measured but discarded
for month_name, (X_month_train, y_month_train) in monthly_training_sets.items():
    month_model = XGBRegressor(**XGB_MONTHLY_PARAMS)
    t1_start = perf_counter()
    month_model.fit(X_month_train, y_month_train)
    t1_stop = perf_counter()
    monthly_fit_seconds[month_name] = t1_stop - t1_start
    monthly_models[month_name] = month_model

# Individual names are kept because the following cells refer to them.
model_jan = monthly_models["jan"]
model_feb = monthly_models["feb"]
model_march = monthly_models["march"]
model_apr = monthly_models["apr"]
model_may = monthly_models["may"]
model_june = monthly_models["june"]
model_july = monthly_models["july"]
model_aug = monthly_models["aug"]
model_sept = monthly_models["sept"]
model_oct = monthly_models["oct"]
model_nov = monthly_models["nov"]
model_dec = monthly_models["dec"]

In [None]:
# Spot check: predict with the January model on the January test features and
# look at the returned values and their container type.
y_predict = model_jan.predict(X_jan_test)
print(y_predict)
print(type(y_predict))

[0.00994861 0.00994861 0.00994861 ... 0.00994861 0.00994861 0.00994861]
<class 'numpy.ndarray'>


In [None]:
# Score every monthly model on the test rows of its own month, January first.
# Both metrics are computed on the scaled targets returned by
# my_normalize_permonth, not on raw precipitation amounts.
monthly_evaluation_sets = (
    (model_jan, X_jan_test, y_jan_test),
    (model_feb, X_feb_test, y_feb_test),
    (model_march, X_march_test, y_march_test),
    (model_apr, X_apr_test, y_apr_test),
    (model_may, X_may_test, y_may_test),
    (model_june, X_june_test, y_june_test),
    (model_july, X_july_test, y_july_test),
    (model_aug, X_aug_test, y_aug_test),
    (model_sept, X_sept_test, y_sept_test),
    (model_oct, X_oct_test, y_oct_test),
    (model_nov, X_nov_test, y_nov_test),
    (model_dec, X_dec_test, y_dec_test),
)

for evaluated_model, X_month_test, y_month_test in monthly_evaluation_sets:
    y_predict = evaluated_model.predict(X_month_test)
    print("My MAPE =", m_mape(y_month_test, y_predict))
    print("MSE =", mean_squared_error(y_month_test, y_predict))



My MAPE = 1.2570294869521748
MSE = 0.00042274620458173767
My MAPE = 0.926970277830805
MSE = 0.0003013558363974034
My MAPE = 1.1789457213575596
MSE = 0.0007105927079665449
My MAPE = 0.5017051274614899
MSE = 9.198437333299878e-05
My MAPE = 0.558056893488985
MSE = 0.0001516277866899749
My MAPE = 0.5740071298433448
MSE = 0.00021945166483971522
My MAPE = 0.5579504989122119
MSE = 0.0009818321713234077
My MAPE = 0.5150552784087876
MSE = 0.00037961683095759857
My MAPE = 0.6116290131676222
MSE = 0.0003082504092785334
My MAPE = 0.5450245536826063
MSE = 0.00015861006964890427
My MAPE = 0.9744732762222601
MSE = 0.0004471085848175082
My MAPE = 1.263959865334934
MSE = 0.0019084034607334919


In [None]:
# Load and tidy the competition test set; 'month' is requested in the labels, so it is
# expected to be a column of the test CSV (preprocessing_X_station does not create it
# for test files).
X_station_test_path = '/gdrive/MyDrive/X_station_test.csv'
stations_coordinates_path = '/gdrive/MyDrive/stations_coordinates.csv'
L_labels_test  = ['station_id', 'day_id', 'latitude', 'longitude', 'altitude','month','hour', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
X_station_test = preprocessing_X_station(X_station_test_path, stations_coordinates_path, L_labels_test)
L_labels = ['latitude', 'longitude', 'altitude', 'hour', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
L_labels_cos_sin = ['latitude', 'longitude', 'altitude', 'hour_cos', 'hour_sin', 'wind_direction_cos', 'wind_direction_sin', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
# NOTE(review): X_train here is the frame returned first by data_month, i.e. the training
# rows of all months together, so the test features are scaled with whole-year min/max.
# The monthly models were fitted on features scaled by my_normalize_permonth with the
# min/max of their own month only. The two scalings differ -- confirm whether the test
# set should be scaled per month instead.
min_train = X_train[L_labels].min()
max_train = X_train[L_labels].max()
X_test_norm = normalizing_data (X_station_test, L_labels, L_labels_cos_sin, min_train, max_train)
# display(X_station_test)
display(X_test_norm)
display(X_dec_train)
display(X_test_norm.loc[0])
# Accumulator filled by the prediction loop in a later cell; re-run this cell to empty it.
Y_pred_test=[]


In [None]:
# Spot check: predict for the first scaled test row only. The row (a Series) is turned
# back into a one-row DataFrame so the model receives named feature columns.
y_predict = model_jan.predict(pd.DataFrame(X_test_norm.iloc[0]).T)[0]
print(y_predict)
print(type(y_predict))

In [None]:
# Predict every test row with the regressor of its calendar month.
#
# One batched predict call per month replaces one call per row. The result is
# rebuilt from scratch, so re-running this cell can no longer double the
# length of Y_pred_test. A row whose month matches no model gets NaN instead
# of silently inheriting the previous row's prediction.
#
# NOTE(review): the models were fitted on scaled targets, so these predictions
# are on that scaled range; nothing here converts them back to raw
# precipitation amounts -- confirm this is what the submission expects.
month_to_model = {
    1: model_jan,
    2: model_feb,
    3: model_march,
    4: model_apr,
    5: model_may,
    6: model_june,
    7: model_july,
    8: model_aug,
    9: model_sept,
    10: model_oct,
    11: model_nov,
    12: model_dec,
}

# X_station_test and X_test_norm describe the same rows in the same order, so
# positions found in one are valid for the other.
test_months = X_station_test["month"].to_numpy()
monthly_predictions = np.full(len(X_station_test), np.nan, dtype=np.float32)
for month_number, month_model in month_to_model.items():
    row_positions = np.flatnonzero(test_months == month_number)
    if row_positions.size:
        monthly_predictions[row_positions] = month_model.predict(X_test_norm.iloc[row_positions])

# Later cells expect a plain list with one prediction per test row.
Y_pred_test = list(monthly_predictions)




In [None]:
# Quick look at the hourly predictions before aggregating them.
# NOTE(review): the step converting predictions back from the scaled range is
# commented out in the original cell and is not applied here either.
print(Y_pred_test[:10])
print(X_station_test.shape)
print(len(Y_pred_test))
save = Y_pred_test  # second name for the same list object, not a copy
print(type(Y_pred_test))

# Put each prediction next to the station/day of its test row.
hourly_predictions = pd.DataFrame(Y_pred_test, columns=['Y_pred'])
pred_merged = pd.concat([X_station_test, hourly_predictions], axis=1)

# Submission key: "<station_id>_<day_id>".
id_parts = pred_merged[['station_id', 'day_id']].astype(str)
pred_merged = pd.DataFrame({
    'station_day_id': id_parts['station_id'] + '_' + id_parts['day_id'],
    'Y_pred': pred_merged['Y_pred'],
})

# One value per station and day: the sum of its hourly predictions.
pred_merged = pred_merged.groupby('station_day_id').sum().reset_index()

# The baseline file supplies the list of expected Ids; its own predictions are discarded.
baseline_obs = pd.read_csv('/gdrive/MyDrive/Baseline_observation_test.csv')
baseline_obs = baseline_obs.drop(columns='Prediction')
baseline_obs = baseline_obs.rename(columns={"Id": "station_day_id"})

display(baseline_obs)
display(pred_merged)
pred_merged = pd.merge(baseline_obs, pred_merged, how='inner', on=['station_day_id'])
display(pred_merged)
pred_merged = pred_merged.rename(columns={'station_day_id': "Id", 'Y_pred':"Prediction"})

display(pred_merged)

In [None]:

# Build the submission frame: one row per "<station_id>_<day_id>" holding the
# sum of that day's hourly predictions.
#
# NOTE(review): predictions are summed as produced by the models, i.e. on the
# scaled target range; the conversion back to raw amounts is not applied.
hourly_pred = pd.concat(
    [X_station_test[['station_id', 'day_id']],
     pd.DataFrame(Y_pred_test, columns=['Y_pred'])],
    axis=1,
)

# If Y_pred_test is longer than X_station_test, the surplus predictions have no
# station/day; they cannot be attributed and are left out.
hourly_pred = hourly_pred.dropna(subset=['station_id', 'day_id'])

# Format the ids through an integer cast. Trimming the last two characters of
# the text form only works when the ids are floats ("123.0"); on integer ids it
# removes real digits.
station_text = hourly_pred['station_id'].astype('int64').astype(str)
day_text = hourly_pred['day_id'].astype('int64').astype(str)
hourly_pred = hourly_pred.assign(station_day_id=station_text + '_' + day_text)
display(hourly_pred[:30])

pred_merged = hourly_pred.groupby('station_day_id', as_index=False)['Y_pred'].sum()

# The baseline file supplies the list of expected Ids; its own predictions are discarded.
baseline_obs = pd.read_csv('/gdrive/MyDrive/Baseline_observation_test.csv')
baseline_obs = baseline_obs.drop(columns='Prediction').rename(columns={"Id": "station_day_id"})

pred_merged = pd.merge(baseline_obs, pred_merged, how='inner', on=['station_day_id'])
pred_merged = pred_merged.rename(columns={'station_day_id': "Id", 'Y_pred': "Prediction"})

display(pred_merged)

In [None]:
# Summary statistics of the submission, then write it to Drive without the index column.
display(pred_merged.describe())
# display(Y_pred_test)
pred_merged.to_csv('/gdrive/MyDrive/MLDM-Prediction/xgboost_per_month_data_clean_normalisee.csv',index=False)

Unnamed: 0,Prediction
count,85140.0
mean,0.13005
std,0.049231
min,0.073869
25%,0.094015
50%,0.108682
75%,0.16367
max,0.619402
