# Salt Wedge Intrusion Length predictions with Machine Learning: <br>Goro-Gnocca-Tolle-Dritta River branch Test Case 

---
# Data Splitting

## Import libraries

In [None]:
from google.colab import drive
import os

# Data Manipulation
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

## Function definition

### *Function to check whether a directory path is valid*

In [None]:
def check_path(path_name):
  """Report whether ``path_name`` exists on the filesystem.

  A message describing the outcome is printed in both cases.

  Args:
    path_name: Path to check, as a string.

  Returns:
    ``True`` when the path exists, ``False`` otherwise.
  """
  if os.path.exists(path_name):
    print(path_name + ' is a valid path.')
    return True
  print('Error! Invalid path selected.')
  return False

### *Function to split the dataset into training data and test data*

In [None]:
def split_dataframe_in_train_test(predictor_names='', target_feature_name=-1, 
                                  dataframe=None, test_dataset_size=0.20, 
                                  is_shuffle=True, random_state=None):
  """Label every row of ``dataframe`` as TRAINING or TEST.

  The predictors and the target are split with ``train_test_split`` and then
  recombined into a single frame with an extra ``DatasetType`` column whose
  value is ``'TRAINING'`` or ``'TEST'``. The result is sorted by index.

  Args:
    predictor_names: Column labels used as predictors. The default ``''``
      selects every column except the last one.
    target_feature_name: Column label of the target. The default ``-1``
      selects the last column by position.
    dataframe: Source ``pandas.DataFrame``. When ``None`` an error message is
      printed and ``None`` is returned.
    test_dataset_size: Forwarded to ``train_test_split`` as ``test_size``.
    is_shuffle: Forwarded to ``train_test_split`` as ``shuffle``.
    random_state: Forwarded to ``train_test_split``; pass an integer to obtain
      a reproducible split. The default ``None`` keeps the previous
      (non-deterministic) behaviour.

  Returns:
    The recombined dataframe (predictors, target, ``DatasetType``), or
    ``None`` when no dataframe was supplied.
  """
  if dataframe is None:
    print('Error! No dataframe available')
    return None

  # Only the empty-string sentinel means "all columns but the last". The
  # isinstance guard keeps array-like predictor collections from triggering an
  # element-wise comparison with an ambiguous truth value.
  if isinstance(predictor_names, str) and predictor_names == '':
    X = dataframe.iloc[:, :-1]
  else:
    X = dataframe[predictor_names]
  if target_feature_name == -1:
    y = dataframe.iloc[:, target_feature_name]
  else:
    y = dataframe[target_feature_name]

  # Work on a positional 0..n-1 index so that re-joining predictors and target
  # is one-to-one even when the caller's index contains duplicated labels.
  original_index = dataframe.index
  X = X.reset_index(drop=True)
  y = y.reset_index(drop=True)

  X_train, X_test, y_train, y_test = train_test_split(
      X, y,
      test_size=test_dataset_size,
      shuffle=is_shuffle,
      random_state=random_state)

  train_dataframe = X_train.join(y_train)
  train_dataframe['DatasetType'] = 'TRAINING'
  test_dataframe = X_test.join(y_test)
  test_dataframe['DatasetType'] = 'TEST'

  # Sorting by position puts the rows back in their original order, so the
  # caller's index labels can be restored one-to-one.
  dataframe_splitted = pd.concat([train_dataframe, test_dataframe]).sort_index()
  dataframe_splitted.index = original_index
  # Final ordering by index label, as before.
  dataframe_splitted.sort_index(inplace=True, kind='stable')
  return dataframe_splitted

## Load dataset

---

Check that the dataset is available at the expected path. If the path is valid and the folder is not empty, load the data.

### *Check directory and file*

In [None]:
# Mount Google Drive (via google.colab) at /content/gdrive.
drive.mount('/content/gdrive')

In [None]:
# IPython magic: change the working directory to the Drive root folder.
%cd /content/gdrive/MyDrive/
# IPython shell capture: store the output of `pwd` (indexed below, so it is list-like).
path_name = !pwd
# Keep only the first captured line as the path string.
path_name = path_name[0]

In [None]:
# Look for the raw dataset workbook inside `path_name`.
# `found` is True only when the workbook is present; `list_of_files` holds the
# directory listing when the directory is valid and not empty, '' otherwise.
list_of_files = ''
found = False
dataset_file_name = 'Lx_Dataset_EstuarIO_Raw.xlsx'

if check_path(path_name):
  # List the directory once so the emptiness check and the stored listing
  # are guaranteed to describe the same state of the folder.
  directory_content = os.listdir(path_name)
  if not directory_content:
    print('Warning! Empty directory.')
  else:
    list_of_files = directory_content
    print('File available: {}'.format(list_of_files))
    found = dataset_file_name in list_of_files
    if found:
      print("'{}' exists".format(dataset_file_name))
    else:
      print("'{}' NOT exists".format(dataset_file_name))

### *Read the dataset*

In [None]:
# Load one DataFrame per branch sheet. NOTE: `list_of_files` is rebound here
# to the mapping returned by `pd.read_excel` (sheet name -> DataFrame).
if found:
  branch_sheet_names = ['GORO','GNOCCA','TOLLE','DRITTA']
  raw_workbook_path = path_name + '/Lx_Dataset_EstuarIO_Raw.xlsx'
  list_of_files = pd.read_excel(raw_workbook_path, sheet_name=branch_sheet_names)

## Po Goro Branch

In [None]:
# Split the 'GORO' sheet into training/test rows (20% test, shuffled).
goro_branch_dataset = split_dataframe_in_train_test(
    predictor_names=['Date', 'Doy', 'BranchName', 'Qriver', 'Qll', 'Qtidef', 'Sll'],
    target_feature_name='LxObs',
    dataframe=list_of_files['GORO'],
    test_dataset_size=0.20,
    is_shuffle=True,
)

In [None]:
# Bare expression: shows the split Goro dataset as the cell output.
goro_branch_dataset

## Po Gnocca Branch

In [None]:
# Split the 'GNOCCA' sheet into training/test rows (20% test, shuffled).
gnocca_branch_dataset = split_dataframe_in_train_test(
    predictor_names=['Date', 'Doy', 'BranchName', 'Qriver', 'Qll', 'Qtidef', 'Sll'],
    target_feature_name='LxObs',
    dataframe=list_of_files['GNOCCA'],
    test_dataset_size=0.20,
    is_shuffle=True,
)

In [None]:
# Bare expression: shows the split Gnocca dataset as the cell output.
gnocca_branch_dataset

## Po Tolle Branch

In [None]:
# Split the 'TOLLE' sheet into training/test rows (20% test, shuffled).
tolle_branch_dataset = split_dataframe_in_train_test(
    predictor_names=['Date', 'Doy', 'BranchName', 'Qriver', 'Qll', 'Qtidef', 'Sll'],
    target_feature_name='LxObs',
    dataframe=list_of_files['TOLLE'],
    test_dataset_size=0.20,
    is_shuffle=True,
)

In [None]:
# Bare expression: shows the split Tolle dataset as the cell output.
tolle_branch_dataset

## Po Dritta Branch

In [None]:
# Split the 'DRITTA' sheet into training/test rows (20% test, shuffled).
dritta_branch_dataset = split_dataframe_in_train_test(
    predictor_names=['Date', 'Doy', 'BranchName', 'Qriver', 'Qll', 'Qtidef', 'Sll'],
    target_feature_name='LxObs',
    dataframe=list_of_files['DRITTA'],
    test_dataset_size=0.20,
    is_shuffle=True,
)

In [None]:
# Bare expression: shows the split Dritta dataset as the cell output.
dritta_branch_dataset

## All Branches

In [None]:
# Stack the four per-branch frames into one and renumber the rows from 0.
per_branch_datasets = [
    goro_branch_dataset,
    gnocca_branch_dataset,
    tolle_branch_dataset,
    dritta_branch_dataset,
]
all_branch_dataset = pd.concat(per_branch_datasets, ignore_index=True)

In [None]:
# Bare expression: shows the combined dataset of all branches as the cell output.
all_branch_dataset

## Save the new split dataset

In [None]:
# IPython magic: change the working directory to the Drive root folder.
%cd /content/gdrive/MyDrive/
# IPython shell capture: store the output of `pwd` (indexed below, so it is list-like).
path_name = !pwd
# Keep only the first captured line as the path string.
path_name = path_name[0]

In [None]:
# Write every split dataset to its own sheet of a single Excel workbook.
if check_path(path_name):
  print('-----------------------------------------')
  # Build the output path from the directory that was just validated, so the
  # workbook lands there regardless of the current working directory.
  output_workbook_path = os.path.join(
      path_name, 'Lx_Dataset_EstuarIO_Processed_Training_Test_Data.xlsx')
  # Sheet name -> dataset; insertion order defines the sheet order.
  datasets_by_sheet = {
      'GORO': goro_branch_dataset,
      'GNOCCA': gnocca_branch_dataset,
      'TOLLE': tolle_branch_dataset,
      'DRITTA': dritta_branch_dataset,
      'ALL': all_branch_dataset,
  }
  with pd.ExcelWriter(output_workbook_path) as writer:
    for sheet_name, branch_dataset in datasets_by_sheet.items():
      branch_dataset.to_excel(writer, sheet_name=sheet_name, index_label='ID')
  print('Dataset stored correctly')
else:
  print('Warning! Dataset NOT stored.')