# Import libraries + data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost 

from sklearn.metrics import mean_squared_error


from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_percentage_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.preprocessing import StandardScaler

from time import time
from google.colab import drive
from time import perf_counter
import pickle
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Import dataset

In [None]:
# Station training data from which rows containing NaN were removed
# (per the original notebook note).
df_clean = pd.read_csv(
    '/gdrive/MyDrive/X_station_train_clean.csv',
    index_col=0,
)

# Station training data after the NaN imputation step
# (per the original notebook note).
df_imputation = pd.read_csv(
    '/gdrive/MyDrive/X_station_Train_imputation.csv',
    index_col=0,
)

# Original station training data, NaN values included.
df_full_dataset = pd.read_csv('/gdrive/MyDrive/X_station_train.csv')

# Paths only: these files are not read in this cell.
path_station_coordinate = '/gdrive/MyDrive/stations_coordinates.csv'
path_df_imputation = "/gdrive/MyDrive/X_station_Train_imputation_Richard.csv"

path_df_2016_imputation = "/gdrive/MyDrive/MLDM Project/data/X_all_2016_imputed_by_day.zip"




  mask |= (ar1 == a)


In [None]:

# Final feature tables: the 2016 file becomes the training frame and the
# 2017 file the evaluation frame.
path_df_2016 = "/gdrive/MyDrive/MLDM Project/data/X_all_2016_final.zip"
path_df_2017 = "/gdrive/MyDrive/MLDM Project/data/X_all_2017_final.zip"
df_train, df_test = pd.read_csv(path_df_2016), pd.read_csv(path_df_2017)

In [None]:

# Columns that are removed from the feature matrices (identifiers, calendar
# fields and the next-hour precipitation column); "ground_truth" is the target.
non_feature_columns = ["day", "Id", "month", "station_id", "hour", "next hour precipitation (kg/m^2)"]

# --- Training split -------------------------------------------------------
y_train = df_train["ground_truth"]
x_train = df_train.drop(columns=["ground_truth"] + non_feature_columns)

# Fit the standardisation statistics (mean / std) on the training features
# ONLY. Fitting a second scaler on the evaluation data, as was done before,
# scales train and test with different statistics, so the model would see
# test features on a different scale than the one it was trained on.
feature_scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(feature_scaler.transform(x_train), columns=x_train.columns)
del df_train  # release the raw frame before the next large allocation

# --- Evaluation split -----------------------------------------------------
y_test = df_test["ground_truth"]
x_test = df_test.drop(columns=["ground_truth"] + non_feature_columns)

# Re-use the training statistics; `transform` never refits.
x_test = pd.DataFrame(feature_scaler.transform(x_test), columns=x_test.columns)
del df_test


In [None]:
# Preview the station coordinates file (columns shown in the output below:
# number_sta, lat, lon, height_sta). The frame is only displayed, not kept.
display(pd.read_csv(path_station_coordinate))

Unnamed: 0,number_sta,lat,lon,height_sta
0,86118001,46.477,0.985,120.0
1,86149001,46.917,0.025,60.0
2,56081003,48.050,-3.660,165.0
3,53215001,47.790,-0.710,63.0
4,22135001,48.550,-3.380,148.0
...,...,...,...,...
320,86137003,47.035,0.098,96.0
321,86165005,46.412,0.841,153.0
322,86273001,46.464,1.042,121.0
323,91200002,48.526,1.993,116.0


# Functions used in preprocessing

 Preprocessing, normalization, and the modified MAPE metric

In [None]:
def preprocessing_X_station(X_station_path, stations_coordinates_path, L_labels):
    """Load a station CSV, add calendar fields and coordinates, and tidy it.

    The kind of file is decided from its path:

    * a path containing ``'X_station_train'`` has its ``date`` column cut at
      fixed character positions into ``year``, ``month``, ``day`` and
      ``hour`` (the ``date`` column is then removed);
    * a path containing ``'X_station_test'`` has its ``Id`` column split on
      ``'_'`` into ``number_sta``, ``day_id`` and ``hour``.

    In both cases the ``Id`` column is dropped, the station coordinates are
    left-joined on ``number_sta``, the raw column names are replaced by
    descriptive ones, the columns are restricted/reordered to ``L_labels``
    and the rows are sorted by station and time.

    Args:
        X_station_path: Path of the station CSV file.
        stations_coordinates_path: Path of the station coordinates CSV file.
        L_labels: Names (after renaming) of the columns to keep, in order.

    Returns:
        The processed DataFrame with a fresh 0..n-1 index.
    """
    is_train_file = 'X_station_train' in X_station_path
    is_test_file = 'X_station_test' in X_station_path

    # Open data
    X_station = pd.read_csv(X_station_path)
    stations_coordinates = pd.read_csv(stations_coordinates_path)

    # Split the date string into its integer components
    if is_train_file:
        date_text = X_station['date']
        date_fields = (
            ('year', slice(0, 4)),
            ('month', slice(5, 7)),
            ('day', slice(8, 10)),
            ('hour', slice(11, 13)),
        )
        for field_name, char_span in date_fields:
            X_station[field_name] = date_text.str[char_span].astype('int32')
        X_station = X_station.drop(columns="date")

    # Test identifiers pack "<station>_<day>_<hour>" into a single string
    if is_test_file:
        id_parts = X_station['Id'].str.split('_')
        id_fields = (('number_sta', 0), ('day_id', 1), ('hour', 2))
        for field_name, position in id_fields:
            X_station[field_name] = id_parts.str[position].astype('int32')

    X_station = X_station.drop(columns="Id")

    # Add station coordinates
    X_station = pd.merge(X_station, stations_coordinates, how='left', on='number_sta')

    # Rename columns to descriptive names
    readable_names = {
        'number_sta': "station_id",
        'ff': "wind_speed",
        't': "temperature",
        'td': "dew_point",
        'hu': "humidity",
        'dd': "wind_direction",
        'precip': "precipitations",
        'lat': "latitude",
        'lon': "longitude",
        'height_sta': "altitude",
    }
    X_station = X_station.rename(columns=readable_names)

    # Keep only the requested columns, in the requested order
    X_station = X_station[L_labels]

    # Sort by station and date
    if is_train_file:
        X_station = X_station.sort_values(by=['station_id', 'year', 'month', 'day', 'hour'])
    if is_test_file:
        X_station = X_station.sort_values(by=['station_id', 'day_id', 'hour'])

    return X_station.reset_index(drop=True)



In [None]:
def normalizing_data(X, L_labels, L_labels_cos_sin, min_train, max_train):
    """Min-max scale the selected columns and encode cyclical ones as cos/sin.

    Args:
        X: DataFrame containing at least the columns in ``L_labels``
            (which must include ``month``, ``hour`` and ``wind_direction``).
        L_labels: Columns to extract and scale.
        L_labels_cos_sin: Columns of the returned frame, in order; the
            cyclical columns appear there as ``<name>_cos`` / ``<name>_sin``.
        min_train: Per-column minimum used for scaling.
        max_train: Per-column maximum used for scaling.

    Returns:
        A new DataFrame with the columns ``L_labels_cos_sin``; ``X`` itself
        is left untouched.
    """
    # Extract the requested columns and scale them with the supplied bounds
    scaled = (X[L_labels] - min_train) / (max_train - min_train)

    # The scaled values lie between 0 and 1, but cos(0) == cos(2*pi) would
    # make the two ends of a cycle coincide (e.g. January == December).
    # Shrinking the angle by (period - 1) / period keeps the largest scaled
    # value strictly below a full turn.
    cyclic_periods = (('month', 12), ('hour', 24), ('wind_direction', 360))
    for name, period in cyclic_periods:
        angle = 2*np.pi * scaled[name] * (period - 1)/period
        scaled[name + '_cos'] = np.cos(angle)
        scaled[name + '_sin'] = np.sin(angle)

    # The raw cyclical columns are replaced by their cos/sin pair
    scaled = scaled.drop(columns=[name for name, _ in cyclic_periods])

    # Reorder columns
    return scaled[L_labels_cos_sin]


In [None]:
def m_mape(y_true, y_predict):
    """Return a modified mean absolute percentage error, in percent.

    Both the true and the predicted values are shifted by +1 before the
    relative error is taken, so a true value of 0 does not divide by zero.

    Args:
        y_true: Sequence of observed values.
        y_predict: Sequence of predicted values, aligned with ``y_true``.

    Returns:
        ``100 / n * sum(|(pred + 1) - (true + 1)| / (true + 1))`` where
        ``n`` is ``len(y_true)``.
    """
    sample_count = len(y_true)
    shifted_actual = np.array(y_true) + 1
    shifted_forecast = np.array(y_predict) + 1

    relative_errors = np.abs((shifted_forecast - shifted_actual) / shifted_actual)
    return (100 / sample_count) * np.sum(relative_errors)

# Preprocessing data

In [None]:
# Load the target file from the current working directory (relative path,
# unlike the Drive paths used elsewhere in this notebook).
df_y = pd.read_csv("Y_train.csv")

In [None]:
def my_test_train_split(dataset):
    """Build normalised 2016-train / 2017-test splits for next-step precipitation.

    The target of a row is the ``precipitations`` value of the following row
    of the same ``station_id`` (``groupby(...).shift(-1)``), so rows are
    presumably expected to be in chronological order within each station —
    verify against the caller. Rows without a target (the last row of every
    station, or rows whose following precipitation is NaN) are discarded.

    Features are normalised with ``normalizing_data`` using the min/max of
    the 2016 rows only; the target is min-max scaled with the 2016 bounds of
    the ``precipitations`` feature.

    Unlike the previous version, this function no longer reads the global
    ``df_full_dataset`` (it was only used for a value that was never read)
    and no longer computes intermediate targets that were overwritten.

    Args:
        dataset: DataFrame holding at least the columns ``station_id``,
            ``latitude``, ``longitude``, ``altitude``, ``timestamp``,
            ``year``, ``month``, ``day``, ``hour``, ``wind_direction``,
            ``wind_speed``, ``temperature``, ``humidity``, ``dew_point`` and
            ``precipitations``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = ["station_id", "latitude", "longitude", "altitude", "timestamp",
                       "year", "month", "day", "hour", "wind_direction", "wind_speed",
                       "temperature", "humidity", "dew_point", "precipitations"]
    features = dataset[feature_columns]

    # Target: precipitation of the next row of the same station.
    next_precipitation = dataset.groupby(['station_id'])['precipitations'].shift(-1)

    # Drop rows that have no target.
    has_target = next_precipitation.notna()
    X_clean = features[has_target]
    y_clean = next_precipitation[has_target]

    print("Xtrain nan :", X_clean.isnull().sum().sum())
    print("y train nan", y_clean.isnull().sum().sum())

    # Temporal split: 2016 for training, 2017 for testing.
    X_train = X_clean[X_clean['year'] == 2016]
    X_test = X_clean[X_clean['year'] == 2017]

    # Normalize
    L_labels = ['latitude', 'longitude', 'altitude', 'month', 'hour', 'wind_direction',
                'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
    L_labels_cos_sin = ['latitude', 'longitude', 'altitude', 'month_cos', 'month_sin',
                        'hour_cos', 'hour_sin', 'wind_direction_cos', 'wind_direction_sin',
                        'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']

    # Scaling bounds come from the training rows only, so no information
    # from the test year leaks into the normalisation.
    min_train = X_train[L_labels].min()
    max_train = X_train[L_labels].max()

    X_train = normalizing_data(X_train, L_labels, L_labels_cos_sin, min_train, max_train)
    X_test = normalizing_data(X_test, L_labels, L_labels_cos_sin, min_train, max_train)

    # Scale the target with the same bounds as the precipitation feature.
    precip_min = min_train['precipitations']
    precip_max = max_train['precipitations']
    y_scaled = (y_clean - precip_min) / (precip_max - precip_min)
    y_train, y_test = y_scaled.loc[X_train.index], y_scaled.loc[X_test.index]

    print("train shape", X_train.shape, y_train.shape)
    print("test shape", X_test.shape, y_test.shape)

    return X_train, X_test, y_train, y_test


# Model Fit and validation

In [None]:

# Total number of NaN cells across all columns of df_imputation.
# NOTE(review): the printed label says "Xtrain" although the frame that is
# inspected here is df_imputation.
check_for_nan = df_imputation.isnull().sum().sum()
print("Xtrain nan :",check_for_nan)

Xtrain nan : 3520008


In [None]:


# XGBoost regressor; every constructor argument that is not listed here is
# left at the library default.
model = XGBRegressor(
    n_estimators=500,
    max_depth=18,
    gamma=3.430739184133814,
    min_child_weight=8,
    reg_alpha=180,
    reg_lambda=0.7436396623675846,
    random_state=123,
)

# Fit on the first 750000 training rows only and report the elapsed time
# in seconds.
t1_start = perf_counter()
model.fit(x_train[:750000], y_train[:750000])
t1_stop = perf_counter()
print("Time:", t1_stop - t1_start)


Time: 13699.630568891


In [None]:
# Visual sanity check of the scaled training features and their dimensions.
display(x_train)
print(x_train.shape)

In [None]:

file_name = "xgb_reg_2.pkl"

# Save the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (the bare `open(...)` used
# before was never closed explicitly).
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Time the prediction on the evaluation features, then report the modified
# MAPE (m_mape) and the mean squared error against y_test.
t1_start = perf_counter() 
y_predict = model.predict(x_test)
t1_stop = perf_counter()
print("Time:", t1_stop-t1_start)
print("My MAPE =", m_mape(y_test,y_predict))
print("MSE =",mean_squared_error(y_test,y_predict))

# New tests

In [None]:
# Load the feature table used to produce the submission predictions.
test_set = pd.read_csv("/gdrive/MyDrive/MLDM Project/data/X_all_test_final.zip")

In [None]:
# Keep only the columns the model was trained on, in the training order.
# This rebinds x_test, which previously held the 2017 evaluation features.
x_test = test_set[x_train.columns]
# NOTE(review): a brand-new StandardScaler is fitted on this test set, so the
# scaling statistics differ from the ones used on the training features —
# confirm whether the training-time scaler should be reused here instead.
x_test = pd.DataFrame(StandardScaler().fit_transform(x_test), columns=x_train.columns)
y_pred = model.predict(x_test)

In [None]:
# Attach the predictions to the test rows. A plain column assignment is
# positional and idempotent: re-running this cell overwrites the column
# instead of appending a second "Prediction" column as pd.concat did.
test_set["Prediction"] = y_pred
pred_merged = test_set[['Id', 'Prediction']]
print(len(pred_merged))

# One row per Id: predictions sharing the same Id are summed.
df = pred_merged.groupby("Id")[["Prediction"]].sum()
print(len(df))

baseline = pd.read_csv("/gdrive/MyDrive/Baseline_observation_test.csv")

# The left join keeps exactly the Ids (and row order) of the baseline file;
# Ids without a prediction get NaN.
print("Remove Ids not in Baseline.")
submission = baseline.drop("Prediction",axis=1).merge(df, how="left", on="Id")

print(f"\nSum of NaNs :\n\n{submission.isna().sum()}\n\n")
if submission["Prediction"].isna().any():
  print("fill nans with average.")
  # Assign the result back: `fillna(..., inplace=True)` on a column selection
  # is deprecated chained assignment and does nothing under copy-on-write.
  submission["Prediction"] = submission["Prediction"].fillna(submission["Prediction"].mean())

# Compare against the real baseline length, not hard-coded numbers
# (the old check tested 85140 but reported 183498 in its message).
expected_rows = len(baseline)
if len(submission) != expected_rows:
  print("Warning : len(submission) != len(Baseline) i.e. {} != {}".format(len(submission), expected_rows))

display(submission)
print(submission.describe())
submission.to_csv('/gdrive/MyDrive/MLDM-Prediction/BIGfinal_xgboost_data_normalisee_Bigdata.csv', index=False)

In [None]:
# Summary statistics of the submitted predictions (count, mean, spread, range).
print(submission.describe())

         Prediction
count  85140.000000
mean       3.315887
std        2.669195
min       -0.468424
25%        1.380118
50%        2.612640
75%        4.540940
max       77.393410
