In [1]:
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

In [2]:
csv_path = "data_points"
if not os.path.exists(csv_path): os.makedirs(csv_path)
log_path =  os.path.join(csv_path, 'logs')
if not os.path.exists(log_path): os.makedirs(log_path)

# Variables for OutlierRemover Class
origin_csv_path = os.path.join(csv_path, 'data_points_origin.csv')
outlier_removed_csv_path = os.path.join(csv_path, 'outlier_removed_data_points.csv')
lower_outliers_path =   os.path.join(log_path, 'lower_outliers.csv')
upper_outliers_path = os.path.join(log_path, 'upper_outliers.csv')

# Variable for DataNormalizer Class
normalized_csv_path = os.path.join(csv_path, 'normalized_data_points.csv')

In [3]:
class OutlierRemover:
    def __init__(self, filepath):
        """Initialize the DataFrame by taking the path to the CSV file"""
        self.origin_data = pd.read_csv(filepath)
    
    def remove_outliers(self, column_index):
        """Calculates the IQR for the specified column index and removes rows with outliers"""
        # Select that column
        col_values = self.origin_data.iloc[:, column_index]
        
        # Calculate IQRs
        Q1 = col_values.quantile(0.25)
        Q3 = col_values.quantile(0.75)
        IQR = Q3 - Q1
        
        # Calculate outlier boundary values
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Filtering outliers
        self.lower_outliers = self.origin_data[(col_values < lower_bound)]
        self.upper_outliers = self.origin_data[(col_values > upper_bound)]
        self.data = self.origin_data[(col_values >= lower_bound) & (col_values <= upper_bound)]

    
    def save_data(self, output_filepath):
        self.data.to_csv(output_filepath, index=False)
        print(f"Outlier removed csv file was created in -> {output_filepath}")

    def save_outliers_data(self, lower_outlier_path, upper_outlier_path):
        self.lower_outliers.to_csv(lower_outlier_path, index=False)
        print(f"Data detected as outliers stored in (lower bound) -> {lower_outlier_path}")
        self.upper_outliers.to_csv(upper_outlier_path, index=False)
        print(f"Data detected as outliers stored in (upper bound) -> {upper_outlier_path}")

In [4]:
remover = OutlierRemover(origin_csv_path)
remover.remove_outliers(column_index=3)  # Detect outliers based on the 4th index (run_time)
remover.save_data(outlier_removed_csv_path)
remover.save_outliers_data(lower_outliers_path, upper_outliers_path)

Outlier removed csv file was created in -> data_points/outlier_removed_data_points.csv
Data detected as outliers stored in (lower bound) -> data_points/logs/lower_outliers.csv
Data detected as outliers stored in (upper bound) -> data_points/logs/upper_outliers.csv


In [5]:
print( f"Origin data length : {len(remover.origin_data)}" )
print( f"Outliers removed data length : {len(remover.data)}")
print( f"Lower outliers length : {len(remover.lower_outliers)}")
print( f"Upper outliers length : {len(remover.upper_outliers)}")

Origin data length : 26072
Outliers removed data length : 26072
Lower outliers length : 0
Upper outliers length : 0


In [6]:
class DataNormalizer:
    def __init__(self, filepath):
        self.filepath = filepath
        self.data = self.load_data()

    def load_data(self):
        return pd.read_csv(self.filepath)

    def normalize_columns(self, normalized_filepath, num_of_data=None):
        """ Methods to normalize data from the 4th to the last column The data is scaled to values between 0 and 1 using the Min-Max normalization method"""
        scaler = MinMaxScaler()
        self.data.iloc[:, 7:] = scaler.fit_transform(self.data.iloc[:, 7:])
        self.save_data(normalized_filepath, num_of_data)
    
    def save_data(self, normalized_filepath, num_of_data = None):

        self.data.columns = ['target_X', 'target_Y', 'target_Z', 'qX', 'qY', 'qZ', 'qW', 'execution_time','distance', 'angle',
                'delta_of_6_axis', 'delta_of_3_axis', 'joint1', 'joint2', 'joint3', 'joint4', 'joint5', 'joint6']
        print(self.data)
        if(num_of_data == None):
            self.data.to_csv(normalized_filepath, index=False)
        else:
            sampled_data = self.data.sample(n=num_of_data, random_state=42)
            sampled_data.to_csv(normalized_filepath, index=False)
            print( f"Number of data changes based on \"num_of_data\" : ( {len(self.data)} ) -> ( {num_of_data} )" )
            
        print(f"Normalized csv file was created in -> {normalized_filepath}")

In [7]:
normalizer = DataNormalizer(outlier_removed_csv_path) 
normalized_data = normalizer.normalize_columns(normalized_csv_path) #num_of_data can be used to determine the number of data

ValueError: Length mismatch: Expected axis has 18 elements, new values have 13 elements