<a href="https://colab.research.google.com/github/Karim-Anwar/masterProject/blob/main/preporcess_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the paths used later in this notebook
# live under that mount point. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import scipy
import datetime

In [3]:
from statsmodels.tsa.stattools import adfuller

In [4]:
# Autocorrelation
from statsmodels.graphics.tsaplots import plot_acf
# Partial Autocorrelation
from statsmodels.graphics.tsaplots import plot_pacf

In [5]:
import seaborn as sns

In [6]:
from statsmodels.tsa.stattools import ccf

In [7]:
import os
import re

In [8]:
def read_csv_files_from_directories(root_directory):
    """Return every directory under ``root_directory`` that holds a CSV file.

    The tree is traversed with ``os.walk`` (the root itself included). A
    directory is reported once if at least one of its file names ends with
    ``'.csv'``. Despite the function name, no file is opened or read.

    Args:
        root_directory: Path at which the traversal starts.

    Returns:
        list[str]: Matching directory paths, in ``os.walk`` visiting order.
    """
    return [
        current_dir
        for current_dir, _subdirs, entries in os.walk(root_directory)
        if any(entry.endswith('.csv') for entry in entries)
    ]

In [9]:
# def extract_numbers_from_csv_names(root_directory):
#     numbers = []
#     pattern = r'TL(\d+)\.csv'

#     for dirpath, dirnames, filenames in os.walk(root_directory):
#       # if dirpath == root_directory:
#       # # Skip the root directory
#       #   continue
#       for file_name in filenames:
#           if file_name.endswith('.csv'):
#               match = re.search(pattern, file_name)
#               if match:
#                   extracted_number = match.group(1)
#                   numbers.append(int(extracted_number))
    
#     return numbers

In [10]:
def extract_numbers_from_csv_names(file_name):
    """Pull the integer out of a ``TL<digits>.csv`` file name.

    Args:
        file_name: Name (or path) searched for the ``TL<digits>.csv`` pattern.

    Returns:
        list[int]: A one-element list holding the digits converted to ``int``
        when the pattern occurs in ``file_name``, otherwise an empty list.
        Only the first occurrence is considered.
    """
    found = re.search(r'TL(\d+)\.csv', file_name)
    return [int(found.group(1))] if found else []

In [11]:
def extract_direction_from_subdirectories(root_directory):
    """Collect the trailing compass letter of sub-directory names.

    Every directory name yielded by ``os.walk(root_directory)`` (at any depth,
    the root's own name excluded) is tested against a pattern that captures a
    final upper-case ``E``, ``N``, ``S`` or ``W``.

    Args:
        root_directory: Path at which the traversal starts.

    Returns:
        list[str]: One captured letter per matching sub-directory, in
        ``os.walk`` visiting order. Names that do not match contribute nothing.
    """
    direction_suffix = re.compile(r'.*([ENSW])$')
    hits = (
        direction_suffix.search(subdir)
        for _path, subdirs, _files in os.walk(root_directory)
        for subdir in subdirs
    )
    return [hit.group(1) for hit in hits if hit]

In [12]:
def get_fill_method(number):
    """Choose the gap-filling strategy name for a numeric identifier.

    Args:
        number: Numeric identifier to classify.

    Returns:
        str | None: ``'ffill'`` (forward fill) when ``number`` is below 10.
        Otherwise ``'backfill'`` when ``number % 10`` is 1, ``'interpolate'``
        (linear interpolation) when it is 2, and ``None`` for anything else.
    """
    if number < 10:
        return 'ffill'
    # Only reached for number >= 10: dispatch on the remainder modulo 10.
    by_remainder = {1: 'backfill', 2: 'interpolate'}
    return by_remainder.get(number % 10)

In [13]:
def read_and_resample_csv_files(directory, start_index=None, end_index=None):
    """Load the CSV files of one directory onto a shared 1-second time grid.

    Every ``*.csv`` file directly inside ``directory`` is read as a
    three-column table (relabelled ``seq``, ``time``, ``value``), indexed by
    its parsed ``time`` column, de-duplicated (first occurrence kept), sorted,
    and resampled to one row per second. Gaps are filled with the strategy
    that ``get_fill_method`` selects from the number embedded in the file
    name. The per-file results are concatenated column-wise and labelled
    ``pir``, ``shade``, ``alight``.

    Args:
        directory: Directory whose CSV files are processed (not recursive).
        start_index: Optional lower bound of the time slice.
        end_index: Optional upper bound of the time slice. The slice is only
            applied when both bounds are given.

    Returns:
        pandas.DataFrame | None: The merged frame, or ``None`` when the
        directory holds no CSV file, or holds a CSV file for which no fill
        method can be determined (no ``TL<n>`` number in its name, or a
        number ``get_fill_method`` does not map to a strategy).

    Raises:
        ValueError: Raised by pandas when a CSV does not have exactly three
            columns, or when the directory does not yield exactly three
            value columns to label.
    """
    dataframes = []

    # os.listdir returns names in arbitrary order, but the final column labels
    # are assigned by position, so iterate in a stable (sorted) order.
    # NOTE(review): assumes sorted file-name order lines up with
    # pir / shade / alight -- confirm against the sensor numbering.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.csv'):
            continue

        numbers = extract_numbers_from_csv_names(file_name)
        fill_method = get_fill_method(numbers[0]) if numbers else None
        if fill_method is None:
            # A CSV we do not know how to fill: give up on the whole directory.
            return None

        df = pd.read_csv(os.path.join(directory, file_name))
        df.columns = ['seq', 'time', 'value']
        df = df.drop(columns='seq')

        df['time'] = pd.to_datetime(df['time'])
        df = df.set_index('time')
        # asfreq needs a unique, ordered index: keep the first row of each
        # duplicated timestamp, then sort ascending.
        df = df.loc[~df.index.duplicated(keep='first')].sort_index()

        # '1s' is the lowercase second alias; the uppercase 'S' spelling is
        # deprecated in recent pandas releases.
        if fill_method == 'interpolate':
            resampled_df = df.asfreq('1s').interpolate(method='linear')
        else:
            resampled_df = df.asfreq('1s', method=fill_method)

        if start_index is not None and end_index is not None:
            resampled_df = resampled_df.loc[start_index:end_index]

        # Append only inside the CSV branch, so a non-CSV entry can never
        # re-append the previous file's frame.
        dataframes.append(resampled_df)

    if not dataframes:
        return None

    merged_df = pd.concat(dataframes, axis=1)
    merged_df.columns = ['pir', 'shade', 'alight']
    return merged_df

In [14]:
root_directory = '/content/drive/MyDrive/exploratory-data-analysis/data/'
output_directory = '/content/drive/MyDrive/exploratory-data-analysis/cleaned_data'

# Step 1: Find every directory below the root that contains CSV files.
# NOTE(review): the [1:] slice drops the first directory found, presumably
# the root itself. If the root holds no CSV file this drops a real
# sub-directory instead -- confirm the intent.
directories = read_csv_files_from_directories(root_directory)[1:]

# Step 2: Read and merge the CSV files of each directory. Directories that
# yield None are skipped, so the source directory of every kept frame is
# recorded alongside it instead of being looked up by position later.
merged_dataframes = []
merged_directories = []
for directory in directories:
    merged_df = read_and_resample_csv_files(directory, start_index='2023-03-22 00:20:00', end_index='2023-05-19 12:10:00')
    if merged_df is not None:
        merged_dataframes.append(merged_df)
        merged_directories.append(directory)

# Step 3: Write each merged frame to <output_directory>/<source directory name>.csv.
os.makedirs(output_directory, exist_ok=True)
for original_directory, df in zip(merged_directories, merged_dataframes):
    directory_name = os.path.basename(original_directory)
    file_name = f'{directory_name}.csv'
    file_path = os.path.join(output_directory, file_name)
    df.to_csv(file_path, index=True)

In [None]:
# Display the list of merged DataFrames built above (one entry per directory
# for which read_and_resample_csv_files returned a frame).
merged_dataframes