In [3]:
import os
import numpy as np
import pandas as pd
import polars as pl

In [4]:
# Requires polars; if it is missing, install it into the kernel environment with: %pip install polars

In [10]:
# Earlier machine-specific location, kept for reference only: mmap_dir = r"C:\Users\musab\mmap_files_combined_t-2"


In [9]:
# Data locations, resolved against the notebook's current working directory:
#   <cwd>/../data/processed_data/mmap_files_combined
parent_dir = os.path.abspath('../data/')
processed_data_dir = os.path.join(parent_dir, 'processed_data')
# Folder that is scanned for '.mmap' files by the loading cell.
mmap_dir = os.path.join(processed_data_dir, 'mmap_files_combined')


In [11]:



# Every '.mmap' file is read as a flat float32 buffer and viewed as rows of
# this many values.
N_COLUMNS = 22

# os.listdir() returns names in arbitrary order; sorting keeps the order of
# `dfs` (and of everything concatenated from it later) stable across runs.
mmap_files = sorted(file for file in os.listdir(mmap_dir) if file.endswith('.mmap'))

dfs = []     # (file name, pandas DataFrame) pairs
pl_dfs = []  # (file name, Polars DataFrame) pairs, same order as `dfs`
for mmap_file in mmap_files:
    file_path = os.path.join(mmap_dir, mmap_file)
    # Read-only memory map of the raw float32 values.
    data = np.memmap(file_path, dtype=np.float32, mode='r')
    # Fail with the offending file name instead of a bare reshape error.
    if data.size % N_COLUMNS:
        raise ValueError(
            f"{mmap_file}: {data.size} float32 values cannot be split into rows of {N_COLUMNS}"
        )
    reshaped_data = data.reshape((-1, N_COLUMNS))
    df = pd.DataFrame(reshaped_data)
    dfs.append((mmap_file, df))
    pl_dfs.append((mmap_file, pl.from_pandas(df)))

# Print each Polars frame with a 1-based counter and its source file name.
for idx, (mmap_file, pl_df) in enumerate(pl_dfs, start=1):
    print(f"Polars DataFrame {idx} - {mmap_file}:")
    print(pl_df)
    print()


Polars DataFrame 1 - Yaw-01_combine.mmap:
shape: (15, 22)
┌──────────┬───────────┬────────────┬───────────┬───┬──────────┬───────────┬───────────┬───────────┐
│ 0        ┆ 1         ┆ 2          ┆ 3         ┆ … ┆ 18       ┆ 19        ┆ 20        ┆ 21        │
│ ---      ┆ ---       ┆ ---        ┆ ---       ┆   ┆ ---      ┆ ---       ┆ ---       ┆ ---       │
│ f32      ┆ f32       ┆ f32        ┆ f32       ┆   ┆ f32      ┆ f32       ┆ f32       ┆ f32       │
╞══════════╪═══════════╪════════════╪═══════════╪═══╪══════════╪═══════════╪═══════════╪═══════════╡
│ 3.691024 ┆ 0.892393  ┆ -1.4698e-1 ┆ 1.0822e-1 ┆ … ┆ 0.787805 ┆ -0.485824 ┆ 0.074187  ┆ 0.185656  │
│          ┆           ┆ 2          ┆ 3         ┆   ┆          ┆           ┆           ┆           │
│ 3.712692 ┆ -0.462723 ┆ -1.4784e-1 ┆ 9.6410e-1 ┆ … ┆ 0.87347  ┆ -0.446525 ┆ -0.051322 ┆ 0.27992   │
│          ┆           ┆ 2          ┆ 4         ┆   ┆          ┆           ┆           ┆           │
│ 4.166931 ┆ 0.154067  ┆ -2.4333e

In [12]:
from sklearn.model_selection import train_test_split
import gc


def rename_columns(df):
    """Label the columns of ``df`` with the 22 fixed signal names, in place.

    The names are assigned positionally, so ``df`` must have exactly 22
    columns; pandas raises ``ValueError`` on a length mismatch.

    Args:
        df: pandas DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        None. ``df`` itself is modified.
    """
    signal_names = [
        'Wind_Speed',
        'Yaw_Error',
        'Pitch_Angle_B1',
        'Pitch_Angle_B2',
        'Pitch_Angle_B3',
        'Rotorposition',
        'Generator_Speed',
        'Rotor_Speed',
        'Generator_Torque',
        'Power',
        'YAW_Bearing_THRUST',
        'Acceleration_CROSS',
        'Acceleration_THRUST',
        'Tower_Deflection_Y',
        'Tower_Deflection_Z',
        'Tower_Deflection_RES',
        'Tip_Deflection_flap_V1',
        'Tip_Deflection_flap_V2',
        'Tip_Deflection_flap_V3',
        'Tip_Deflection_edge_V1',
        'Tip_Deflection_edge_V2',
        'Tip_Deflection_edge_V3',
    ]
    df.columns = signal_names

# Split every per-file frame 80/20 with a fixed seed and gather the pieces.
# The pieces are concatenated once after the loop: calling pd.concat inside
# the loop re-copies all rows accumulated so far on every pass and starts by
# concatenating with an empty frame.
train_parts = []
test_parts = []

for mmap_file, df in dfs:
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Remove a 'File' column when one is present (re-binding instead of
    # mutating the split result in place).
    if 'File' in train_df.columns:
        train_df = train_df.drop(columns=['File'])
    if 'File' in test_df.columns:
        test_df = test_df.drop(columns=['File'])

    rename_columns(train_df)
    rename_columns(test_df)

    train_parts.append(train_df)
    test_parts.append(test_df)

# ignore_index=True produces a fresh 0..n-1 index, matching the previous
# reset_index(drop=True). With no input files the results are empty frames.
big_raw_train_df = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
big_raw_test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

# The per-file pieces are no longer needed once the combined frames exist.
del train_parts, test_parts
gc.collect()


In [8]:
# Show the combined training frame as the cell result (pandas abbreviates long/wide frames in the output).
big_raw_train_df

Unnamed: 0,Wind_Speed,Yaw_Error,Pitch_Angle_B1,Pitch_Angle_B2,Pitch_Angle_B3,Rotorposition,Generator_Speed,Rotor_Speed,Generator_Torque,Power,...,Acceleration_THRUST,Tower_Deflection_Y,Tower_Deflection_Z,Tower_Deflection_RES,Tip_Deflection_flap_V1,Tip_Deflection_flap_V2,Tip_Deflection_flap_V3,Tip_Deflection_edge_V1,Tip_Deflection_edge_V2,Tip_Deflection_edge_V3
0,18.682123,-1.596192,1.680402e+01,1.680402e+01,1.680402e+01,262.380890,1162.288574,14.630025,16.742210,2037.717896,...,0.107292,-0.043204,0.250642,0.254351,-0.058906,0.715910,0.460624,-0.386754,0.202869,0.270197
1,6.107320,3.939924,4.447896e-14,-3.098482e-13,9.448531e-13,168.872147,724.662048,9.115246,3.774999,286.445984,...,0.031806,-0.007188,0.136193,0.136379,0.013959,-0.379246,0.130187,-0.024449,-0.563705,0.166524
2,3.965328,2.824905,6.225340e-14,-3.919337e-13,1.116625e-12,296.695587,616.304993,7.753157,1.343278,86.686752,...,0.055415,-0.008464,0.057083,0.057709,0.702802,0.502894,0.605750,-0.458867,0.254396,-0.061473
3,21.863661,9.685012,2.469822e+01,2.469822e+01,2.469822e+01,274.761414,1105.028809,13.912936,17.221859,1992.948853,...,0.709812,-0.077920,-0.003051,0.077963,1.639496,3.682669,1.812940,-0.359351,0.409954,0.382061
4,13.268320,-0.321607,1.045637e+01,1.045637e+01,1.045637e+01,181.425125,1157.229980,14.565252,16.942884,2053.245850,...,-0.162261,-0.034745,0.381078,0.382649,-1.276094,-1.459493,-0.905985,-0.097274,-0.448850,0.324759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310283,20.511766,-17.582071,2.258259e+01,2.258259e+01,2.258259e+01,120.107780,1151.455078,14.534418,16.969955,2046.223999,...,0.304543,0.068725,0.123465,0.141317,0.206724,2.824255,0.838595,0.220407,-0.304410,0.040523
310284,13.952372,-17.124477,1.106933e+01,1.106933e+01,1.106933e+01,84.617981,1150.868774,14.462347,17.022270,2051.583496,...,0.131844,-0.139938,0.301437,0.332337,-0.507749,-0.952302,-0.839388,0.367092,-0.229924,-0.302130
310285,4.284587,-20.257898,-2.907866e-12,2.304370e-11,-6.412915e-12,43.423481,609.707825,7.669727,1.078151,68.829269,...,0.013570,-0.026145,0.053486,0.059535,0.564534,0.649684,0.577832,0.203349,0.046722,-0.501167
310286,20.995831,-15.862638,2.055640e+01,2.055640e+01,2.055640e+01,309.136139,1143.425293,14.367408,17.196159,2059.068115,...,-0.216517,0.027712,0.324377,0.325555,0.186293,0.526308,1.908796,-0.218616,0.489843,0.003028


In [13]:
import os
import pandas as pd

# Export location: <../data>/processed_data/RawData_ModelingData[_<n>]
parent_dir = os.path.abspath(r'../data/')
processed_data_subdir = 'processed_data'
subdir_name = 'RawData_ModelingData'

# Try the un-suffixed name first, then _1, _2, ... and stop at the first
# path that does not exist yet, so an earlier export is never reused.
suffix = 0
target_dir = os.path.join(parent_dir, processed_data_subdir, subdir_name)
while os.path.exists(target_dir):
    suffix += 1
    target_dir = os.path.join(parent_dir, processed_data_subdir, f'{subdir_name}_{suffix}')

# Create the chosen directory (including missing parents).
os.makedirs(target_dir)

# One Parquet file per split, both inside the new directory.
train_parquet_path, test_parquet_path = (
    os.path.join(target_dir, file_name)
    for file_name in ('big_raw_train_df.parquet', 'big_raw_test_df.parquet')
)

# Write the frames without their row index.
big_raw_train_df.to_parquet(train_parquet_path, index=False)
big_raw_test_df.to_parquet(test_parquet_path, index=False)

print(f"Exported data to {target_dir} as Parquet files.")


Exported data to c:\Users\musab\2023_musa-kaleem_martin\data\processed_data\RawData_ModelingData_1 as Parquet files.



# Export root on the D: drive. The trailing separator matters: on Windows a
# bare 'D:' is a drive-relative path that resolves to the current directory
# of drive D, whereas 'D:\\' always names the drive root.
# NOTE(review): this is a machine-specific location; confirm it, or point it
# at the project data folder instead.
parent_dir = os.path.abspath('D:\\')
processed_data_subdir = 'processed_data'
subdir_name = 'RawData_ModelingData'

# Try the un-suffixed name first, then _1, _2, ... and stop at the first
# path that does not exist yet, so an earlier export is never reused.
suffix = 0
target_dir = os.path.join(parent_dir, processed_data_subdir, subdir_name)
while os.path.exists(target_dir):
    suffix += 1
    target_dir = os.path.join(parent_dir, processed_data_subdir, f'{subdir_name}_{suffix}')

# Create the chosen directory (including missing parents).
os.makedirs(target_dir)

# One CSV file per split, both inside the new directory.
train_csv_path = os.path.join(target_dir, 'big_raw_train_df.csv')
test_csv_path = os.path.join(target_dir, 'big_raw_test_df.csv')

# Write the frames without their row index.
big_raw_train_df.to_csv(train_csv_path, index=False)
big_raw_test_df.to_csv(test_csv_path, index=False)

print(f"Exported data to {target_dir}")
