# Preprocess Raw Data and Build Dataset

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Directory layout: raw inputs and intermediate outputs both live under ../data.
data_folder = os.path.join('..', 'data')

raw_data_folder, intermediate_data_folder = (
    os.path.join(data_folder, subfolder)
    for subfolder in ('raw_data', 'intermediate_data')
)

In [None]:
# One (initially empty) feature-name -> DataFrame mapping per location.
datasets_dict = {
    location_key: {}
    for location_key in ('TORTOSA', 'GUIAMETS', 'MEQUINENZA', 'XERTA')
}

In [None]:
# Load every raw file into datasets_dict[location][feature_name].
# A file name is split on '_': the first token is the location, the last two
# tokens are discarded, and whatever lies in between is the feature name.
# The listing is sorted because os.listdir returns entries in arbitrary order;
# sorting makes the dict insertion order (and the winner of any key collision)
# reproducible across runs and machines.
for file in sorted(os.listdir(raw_data_folder)):
    if not file.endswith(('.csv', '.xlsx')):
        # Not a data file: nothing to load, so do not even parse its name.
        continue

    name_parts = file.split('_')
    location = name_parts[0]
    feature_name = '_'.join(name_parts[1:-2])
    file_path = os.path.join(raw_data_folder, file)

    # NOTE(review): date_format is documented as applying together with
    # parse_dates, which is not passed here; 'DateTime' columns are converted
    # explicitly later in the notebook - confirm the argument is still wanted.
    if file.endswith('.csv'):
        datasets_dict[location][feature_name] = pd.read_csv(
            filepath_or_buffer=file_path,
            sep=';',
            decimal=',',
            date_format='%Y-%m-%d %H:%M:%S',
            header=0,
            encoding='utf-8',
        )
    else:
        datasets_dict[location][feature_name] = pd.read_excel(
            file_path,
            date_format='%Y-%m-%d %H:%M:%S',
            header=0,
        )

In [None]:
# Short per-location aliases; each name refers to the dict stored in datasets_dict.
tortosa_dfs, guiamets_dfs, mequinenza_dfs, xerta_dfs = (
    datasets_dict[location_key]
    for location_key in ('TORTOSA', 'GUIAMETS', 'MEQUINENZA', 'XERTA')
)

# Tortosa Preprocessing

In [None]:
# Show the feature names that were loaded for Tortosa.
tortosa_dfs.keys()

In [None]:
# Water temperature comes in two datasets, but the Excel one has no missing values.
# Fraction of missing values per column of the 'watertemperature' dataset:
tortosa_dfs['watertemperature'].isna().mean()

In [None]:
# Fraction of missing values per column of the 'water_temperature' dataset.
tortosa_dfs['water_temperature'].isna().mean()

In [None]:
# Remove the 'water_temperature' entry from the Tortosa datasets;
# the 'watertemperature' entry is the one that remains.
tortosa_dfs.pop('water_temperature')

In [None]:
# Inspect the cumulated rainfall data, since it is the only CSV file.
tortosa_dfs['cumulated_rainfall_24h']

In [None]:
# 'fecha' is the date column to use: it equals 'Fecha acumulado' in the same
# dataframe, but it has no missing values.
# Display the rows where the two date columns do not compare equal.
tortosa_rainfall = tortosa_dfs['cumulated_rainfall_24h']
mask = tortosa_rainfall['Fecha acumulado'] == tortosa_rainfall['fecha']
tortosa_rainfall[~mask]

In [None]:
# Fraction of missing values per column of the rainfall dataset.
tortosa_dfs['cumulated_rainfall_24h'].isna().mean()

In [None]:
# Keep only the timestamp and the accumulated value, renamed to the
# 'DateTime' / 'Average' column names.
tortosa_dfs['cumulated_rainfall_24h'] = (
    tortosa_dfs['cumulated_rainfall_24h']
    .loc[:, ['fecha', 'Acumulado']]
    .rename(columns={'fecha': 'DateTime', 'Acumulado': 'Average'})
)

In [None]:
# Fraction of missing values per column of the conductivity dataset.
tortosa_dfs['conductivity'].isna().mean()

In [None]:
# Fraction of missing values per column of the river flow dataset.
tortosa_dfs['flowriver'].isna().mean()

In [None]:
# Fraction of missing values per column of the turbidity dataset.
tortosa_dfs['turbidity'].isna().mean()

In [None]:
# Translate the Spanish column headers to English, in place, for every Tortosa
# dataset except the rainfall one (it is skipped by name).
column_translation = {
    'Fecha': 'DateTime',
    'Promedio': 'Average',
    'Máximo': 'Maximum',
    'Mínimo': 'Minimum',
}
for feature, df in tortosa_dfs.items():
    if feature == 'cumulated_rainfall_24h':
        continue
    df.rename(columns=column_translation, inplace=True)

In [None]:
# Parse 'DateTime' as timestamps and coerce every other column to numeric;
# errors='coerce' turns values that cannot be parsed into NaN.
for df in tortosa_dfs.values():
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    value_columns = df.columns.difference(['DateTime'])
    df[value_columns] = df[value_columns].apply(pd.to_numeric, errors='coerce')


In [None]:
# Print the total number of missing cells in each Tortosa dataset.
for feature, df in tortosa_dfs.items():
    missing_total = df.isna().sum().sum()
    print(f'{feature}: {missing_total}')

In [None]:
# For the moment, simply drop every row that contains a missing value.
# update() writes the cleaned frames back into the same dict object.
tortosa_dfs.update(
    {feature: df.dropna() for feature, df in tortosa_dfs.items()}
)

## Outlier Detection and Missing Values

### Inspect Data

#### Histograms

In [None]:
# Histogram (with a kernel density estimate) of the 'Average' column of each
# Tortosa dataset, one figure per feature.
for feature, df in tortosa_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df['Average'], kde=True)
    plt.title(feature)
    # No plt.legend() here: nothing in the histogram is given a label, so the
    # call had no entries to show and only triggered matplotlib's
    # "no artists with labels" warning.
    plt.show()

#### Boxplots

In [None]:
# Boxplot of the 'Average' column of each Tortosa dataset, one figure per feature.
for feature, df in tortosa_dfs.items():
    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(data=df, y='Average')
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Line plot of 'Average' over 'DateTime' for each Tortosa dataset.
# The 'Maximum' and 'Minimum' columns (when present) are not plotted.
for feature, df in tortosa_dfs.items():
    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(data=df, x='DateTime', y='Average', label='Average')
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data

# Guiamets Preprocessing

In [None]:
# Show the feature names that were loaded for Guiamets.
guiamets_dfs.keys()

In [None]:
# Share of missing values per column of the rainfall dataset (as a 0-1 fraction).
guiamets_dfs['cumulated_rainfall_24h'].isna().mean()

In [None]:
# Share of missing values per column of the environmental temperature dataset.
guiamets_dfs['environmental_temperature'].isna().mean()

In [None]:
# List the column names of the Guiamets rainfall dataset.
list(guiamets_dfs['cumulated_rainfall_24h'].columns)

In [None]:
# Drop, in place, the columns that are not used further on.
# NOTE(review): the names contain the Unicode replacement character, presumably
# because the accented CSV headers were mis-decoded upstream - they must match
# the loaded column names exactly, so keep them as they are; confirm the source encoding.
unused_columns = {
    'cumulated_rainfall_24h': ['Fecha m�ximo', 'M�ximo', 'Fecha acumulado'],
    'environmental_temperature': ['Fecha m�ximo', 'Fecha m�nimo'],
}
for feature, columns in unused_columns.items():
    guiamets_dfs[feature].drop(columns=columns, inplace=True)

In [None]:
# Rename the remaining Guiamets columns, in place, to the English
# 'DateTime' / 'Average' / 'Minimum' / 'Maximum' names.
rainfall_columns = {'fecha': 'DateTime', 'Acumulado': 'Average'}
temperature_columns = {
    'fecha': 'DateTime',
    'Media': 'Average',
    'M�nimo': 'Minimum',
    'M�ximo': 'Maximum',
}

guiamets_dfs['cumulated_rainfall_24h'].rename(columns=rainfall_columns, inplace=True)
guiamets_dfs['environmental_temperature'].rename(columns=temperature_columns, inplace=True)

In [None]:
# Parse 'DateTime' as timestamps and coerce every other column to numeric;
# errors='coerce' turns values that cannot be parsed into NaN.
for df in guiamets_dfs.values():
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    value_columns = df.columns.difference(['DateTime'])
    df[value_columns] = df[value_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Print the total number of missing cells in each Guiamets dataset.
for feature, df in guiamets_dfs.items():
    missing_total = df.isna().sum().sum()
    print(f'{feature}: {missing_total}')

In [None]:
# For the moment, simply drop every row that contains a missing value.
# update() writes the cleaned frames back into the same dict object.
guiamets_dfs.update(
    {feature: df.dropna() for feature, df in guiamets_dfs.items()}
)

## Outlier Detection and Missing Values

### Inspect Data

#### Histograms

In [None]:
# Histogram (with a kernel density estimate) of the 'Average' column of each
# Guiamets dataset, one figure per feature.
for feature, df in guiamets_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df['Average'], kde=True)
    plt.title(feature)
    # No plt.legend() here: nothing in the histogram is given a label, so the
    # call had no entries to show and only triggered matplotlib's
    # "no artists with labels" warning.
    plt.show()

#### Boxplots

In [None]:
# Boxplot of the 'Average' column of each Guiamets dataset, one figure per feature.
for feature, df in guiamets_dfs.items():
    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(data=df, y='Average')
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Line plot of 'Average' over 'DateTime' for each Guiamets dataset.
# The 'Maximum' and 'Minimum' columns (when present) are not plotted.
for feature, df in guiamets_dfs.items():
    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(data=df, x='DateTime', y='Average', label='Average')
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data

# Mequinenza Preprocessing

In [None]:
# Show the feature names that were loaded for Mequinenza.
mequinenza_dfs.keys()

In [None]:
# Fraction of missing values per column of the Mequinenza rainfall dataset.
mequinenza_dfs['cumulated_rainfall_24h'].isna().mean()

In [None]:
# Drop, in place, the rainfall columns that are not used further on.
# NOTE(review): the names contain the Unicode replacement character, presumably
# because the accented CSV headers were mis-decoded upstream - they must match
# the loaded column names exactly, so keep them as they are; confirm the source encoding.
columns_to_drop = ['Fecha m�ximo', 'M�ximo', 'Fecha acumulado']
mequinenza_dfs['cumulated_rainfall_24h'].drop(columns=columns_to_drop, inplace=True)

In [None]:
# Rename the rainfall columns, in place, to the 'DateTime' / 'Average' names.
rainfall_columns = {'fecha': 'DateTime', 'Acumulado': 'Average'}
mequinenza_dfs['cumulated_rainfall_24h'].rename(columns=rainfall_columns, inplace=True)

In [None]:
# Parse 'DateTime' as timestamps and coerce every other rainfall column to
# numeric; errors='coerce' turns values that cannot be parsed into NaN.
# NOTE(review): only the rainfall dataset is converted here; presumably it is
# the only Mequinenza feature - confirm.
rainfall_df = mequinenza_dfs['cumulated_rainfall_24h']
rainfall_df['DateTime'] = pd.to_datetime(rainfall_df['DateTime'])
value_columns = rainfall_df.columns.difference(['DateTime'])
rainfall_df[value_columns] = rainfall_df[value_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Print the total number of missing cells in each Mequinenza dataset.
for feature, df in mequinenza_dfs.items():
    missing_total = df.isna().sum().sum()
    print(f'{feature}: {missing_total}')

In [None]:
# For the moment, simply drop every row that contains a missing value.
# update() writes the cleaned frames back into the same dict object.
mequinenza_dfs.update(
    {feature: df.dropna() for feature, df in mequinenza_dfs.items()}
)

## Outlier Detection and Missing Values

### Inspect Data

#### Histograms

In [None]:
# Histogram (with a kernel density estimate) of the 'Average' column of each
# Mequinenza dataset, one figure per feature.
for feature, df in mequinenza_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df['Average'], kde=True)
    plt.title(feature)
    # No plt.legend() here: nothing in the histogram is given a label, so the
    # call had no entries to show and only triggered matplotlib's
    # "no artists with labels" warning.
    plt.show()

#### Boxplots

In [None]:
# Boxplot of the 'Average' column of each Mequinenza dataset, one figure per feature.
for feature, df in mequinenza_dfs.items():
    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(data=df, y='Average')
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Line plot of 'Average' over 'DateTime' for each Mequinenza dataset.
# The 'Maximum' and 'Minimum' columns (when present) are not plotted.
for feature, df in mequinenza_dfs.items():
    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(data=df, x='DateTime', y='Average', label='Average')
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data

# Xerta Preprocessing

In [None]:
# Show the feature names that were loaded for Xerta.
xerta_dfs.keys()

In [None]:
# For each Xerta dataset, print its name, the share of missing values per
# column (a 0-1 fraction) and its column names, followed by a separator line.
# end='\n\n' leaves one blank line after every printed item.
for feature, df in xerta_dfs.items():
    print('Feature:', feature, end='\n\n')
    print('% missing values:', end='\n\n')
    print(df.isna().mean(), end='\n\n')
    print('Column names:', list(df.columns), end='\n\n')
    print('-' * 100, end='\n\n')

In [None]:
# Normalise every Xerta dataset in place: English column names, 'DateTime'
# parsed as timestamps, and all remaining columns coerced to numeric
# (errors='coerce' turns values that cannot be parsed into NaN).
column_translation = {
    'Fecha': 'DateTime',
    'Promedio': 'Average',
    'Máximo': 'Maximum',
    'Mínimo': 'Minimum',
}
for df in xerta_dfs.values():
    df.rename(columns=column_translation, inplace=True)

    df['DateTime'] = pd.to_datetime(df['DateTime'])
    value_columns = df.columns.difference(['DateTime'])
    df[value_columns] = df[value_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Print the total number of missing cells in each Xerta dataset.
for feature, df in xerta_dfs.items():
    missing_total = df.isna().sum().sum()
    print(f'{feature}: {missing_total}')

In [None]:
# For the moment, simply drop every row that contains a missing value.
# update() writes the cleaned frames back into the same dict object.
xerta_dfs.update(
    {feature: df.dropna() for feature, df in xerta_dfs.items()}
)

## Outlier Detection and Missing Values

### Inspect Data

#### Histograms

In [None]:
# Histogram (with a kernel density estimate) of the 'Average' column of each
# Xerta dataset, one figure per feature.
for feature, df in xerta_dfs.items():
    plt.figure(figsize=(15, 7.5))
    sns.histplot(data=df['Average'], kde=True)
    plt.title(feature)
    # No plt.legend() here: nothing in the histogram is given a label, so the
    # call had no entries to show and only triggered matplotlib's
    # "no artists with labels" warning.
    plt.show()

#### Boxplots

In [None]:
# Boxplot of the 'Average' column of each Xerta dataset, one figure per feature.
for feature, df in xerta_dfs.items():
    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(data=df, y='Average')
    ax.set_title(feature)
    plt.show()

#### Timeseries

In [None]:
# Line plot of 'Average' over 'DateTime' for each Xerta dataset.
# The 'Maximum' and 'Minimum' columns (when present) are not plotted.
for feature, df in xerta_dfs.items():
    plt.figure(figsize=(20, 10))
    ax = sns.lineplot(data=df, x='DateTime', y='Average', label='Average')
    ax.set_title(feature)
    ax.legend()
    plt.show()

### Clean Data

# Save Data

In [None]:
# Store the per-location dicts back in datasets_dict under their location keys.
datasets_dict.update(
    TORTOSA=tortosa_dfs,
    GUIAMETS=guiamets_dfs,
    MEQUINENZA=mequinenza_dfs,
    XERTA=xerta_dfs,
)

In [None]:
# Write every dataset to <intermediate_data_folder>/<location>/<feature>.xlsx.
for location, location_dfs in datasets_dict.items():
    path = os.path.join(intermediate_data_folder, location)
    # exist_ok=True creates the folder only when it is missing, replacing the
    # separate (race-prone) os.path.exists check.
    os.makedirs(path, exist_ok=True)

    for feature, df in location_dfs.items():
        df.to_excel(os.path.join(path, f'{feature}.xlsx'), index=False)