In [1]:
import pandas as pd
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import openpyxl
import matplotlib.pyplot as plt



# Rename all sheets in the Excel workbook according to the sample names listed in names_of_files.txt

In [5]:
# Read the new sheet names from the text file, one name per line.
with open('names_of_files.txt', 'r') as f:
    names = f.read().splitlines()

# Load the Excel workbook whose sheets are to be renamed.
workbook = openpyxl.load_workbook('all_data_abs.xlsx')

# Make sure there are enough names for all sheets
assert len(names) >= len(workbook.sheetnames), "Not enough names for all sheets"

# Pair sheets and names by position (first sheet gets the first name, and so
# on); zip() silently ignores any surplus names at the end of the list.
for sheet, name in zip(workbook.sheetnames, names):
    worksheet = workbook[sheet]
    worksheet.title = name

# Save the modified workbook. A forward slash is used as the separator: the
# previous 'data_abs\output_abs.xlsx' relied on '\o' being an invalid (and
# therefore literal) escape sequence, which newer Python versions warn about,
# and it only resolved to a sub-directory on Windows.
workbook.save('data_abs/output_abs.xlsx')


# Importing all sheets into a dataframe dict

In [4]:
# Open the spreadsheet. The context manager closes the underlying file handle
# once every sheet has been parsed. A forward slash replaces the backslash so
# the path contains no invalid '\o' escape and also resolves outside Windows.
# NOTE(review): the sheet-renaming cell above saves its result under the
# 'data_abs' directory, while this cell reads from 'data' - confirm that this
# is the intended input workbook.
with pd.ExcelFile('data/output.xlsx') as xls:
    # Parse every sheet into its own DataFrame, keyed by the sheet name.
    df_dict = {sheet_name: xls.parse(sheet_name) for sheet_name in xls.sheet_names}


FileNotFoundError: [Errno 2] No such file or directory: 'output.xlsx'

<hr>

# Deleting columns with NaN

In [None]:
for df in df_dict.values():
    # Columns whose share of missing values exceeds this percentage are removed.
    threshold = 1

    # Percentage of NaN entries per column: NaN count divided by row count.
    nan_percentages = df.isna().sum().div(len(df)).mul(100)

    # Collect the labels of the columns that are above the threshold.
    too_sparse = nan_percentages > threshold
    columns_to_drop = nan_percentages.loc[too_sparse].index.tolist()

    # Remove them from the frame held in df_dict; the drop happens in place,
    # so the dictionary entries themselves are modified.
    df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Write the column labels of the first DataFrame in df_dict to a text file,
# one label per line. Only the first frame is inspected; if df_dict is empty
# the file is still created but left empty.
with open("column_names.txt", "w") as fout:
    first_df = next(iter(df_dict.values()), None)
    if first_df is not None:
        for column in first_df.columns:
            # write() emits the whole label at once (writelines() would iterate
            # over it character by character); formatting through an f-string
            # also copes with non-string labels such as integers.
            fout.write(f"{column}\n")

<hr>

In [None]:
# Columns plotted for every DataFrame, one subplot each (top to bottom).
pedal_columns = ['<rAccelPedal>', 'DrvBus.rAccelPedal']

for df in df_dict.values():
    # squeeze=False always returns a 2-D array of axes, so flattening it works
    # regardless of how many columns are listed above.
    fig, axs = plt.subplots(len(pedal_columns), 1, figsize=(20, 6), squeeze=False)
    for ax, column in zip(axs.ravel(), pedal_columns):
        ax.plot(df[column])
        ax.set_title(column)
    plt.tight_layout()
    plt.show()
    # Release the figure explicitly so that figures do not accumulate in
    # memory while looping over all DataFrames.
    plt.close(fig)

<hr>

# Combine datasets

In [None]:
# Tag the rows of every DataFrame with its df_dict key in a 'run' column.
# assign() returns a new frame, so the frames stored in df_dict are left
# untouched (the previous loop added the 'run' column to them in place).
tagged_runs = [df.assign(run=idx) for idx, df in df_dict.items()]

# Concatenate all runs in a single call instead of growing the result inside
# a loop, which re-copied the accumulated rows on every iteration.
# pd.concat() raises on an empty list, so an empty df_dict falls back to an
# empty DataFrame, matching the previous result for that case.
if tagged_runs:
    df_combined = pd.concat(tagged_runs, axis=0, ignore_index=True)
else:
    df_combined = pd.DataFrame()


<hr>

In [None]:
# Columns retained from the combined frame, including the 'run' label.
columns_to_keep = [
    'time',
    'DrvBus.aSteerWheel',
    'DrvBus.rAccelPedal',
    'DrvBus.rBrakePedal',
    'VehBus.Chassis.Body.CG.aRoll',
    'VehBus.Chassis.Body.CG.nRoll',
    'run',
]

# Keep all rows but only the listed columns; a missing label raises KeyError.
df_combined = df_combined.loc[:, columns_to_keep]

# Display the reduced frame as the cell output.
df_combined

<hr>

In [None]:
# Display pandas' summary statistics for the columns of the combined frame.
df_combined.describe()

<hr>

# split features and target

In [None]:
# Input columns used as features.
feature_columns = ['time', 'DrvBus.aSteerWheel', 'DrvBus.rAccelPedal', 'DrvBus.rBrakePedal']
# Column the model is trained to predict.
target_column = 'VehBus.Chassis.Body.CG.aRoll'

X = df_combined[feature_columns]

# Select the target with a single label so that y is a 1-D Series. Selecting
# it with a one-element list produced a one-column DataFrame (a column
# vector), whereas scikit-learn style estimators expect a 1-D target.
y = df_combined[target_column]

# Display the target as the cell output.
y

# train test split

In [None]:
# Split the rows into 75% training and 25% test data. random_state is fixed
# (to the same seed the TPOT optimizer below uses) so that re-running the
# notebook yields the same split and therefore comparable scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42
)

<hr>

# TPOT

In [None]:
# Create a TPOT regressor with default settings.
# NOTE(review): the next cell assigns a configured TPOTRegressor to the same
# name, so this instance is discarded when the notebook is run top to bottom.
pipeline_optimizer = TPOTRegressor()

In [None]:
# Settings for the TPOT regressor, gathered in one mapping so they are easy to
# review and tweak; they are passed on unchanged as keyword arguments.
tpot_settings = {
    "generations": 5,
    "population_size": 20,
    "cv": 5,
    "random_state": 42,
    "verbosity": 2,
}
pipeline_optimizer = TPOTRegressor(**tpot_settings)

In [None]:
# Fit the TPOT optimizer on the training split.
pipeline_optimizer.fit(X_train, y_train)

In [None]:
# Score the fitted optimizer on the held-out test split and print the result.
# NOTE(review): the metric is whatever scoring function the optimizer was
# configured with - confirm which one the TPOT defaults select.
test_score = pipeline_optimizer.score(X_test, y_test)
print(test_score)

In [None]:
# Optional: uncomment the next line to export the pipeline found by TPOT to a Python script.
# pipeline_optimizer.export('tpot_exported_pipeline.py')