In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Input locations: the data folder (relative to the working directory)
# and the almond CSV file inside it.
data_dir = 'data'
data_pth = os.path.join(data_dir, 'Almond.csv')

In [3]:
# Read the CSV at data_pth into a DataFrame.
data = pd.read_csv(data_pth)

# Discard the leading column; it is looked up by position, so its label
# does not need to be known here.
data = data.drop(columns=data.columns[0])
data

Unnamed: 0,Length (major axis),Width (minor axis),Thickness (depth),Area,Perimeter,Roundness,Solidity,Compactness,Aspect Ratio,Eccentricity,Extent,Convex hull(convex area),Type
0,,227.940628,127.759132,22619.0,643.813269,,0.973384,1.458265,,,0.681193,23237.5,MAMRA
1,,234.188126,128.199509,23038.0,680.984841,,0.957304,1.601844,,,0.656353,24065.5,MAMRA
2,,229.418610,125.796547,22386.5,646.943212,,0.967270,1.487772,,,0.683620,23144.0,MAMRA
3,,232.763153,125.918808,22578.5,661.227483,,0.965512,1.540979,,,0.685360,23385.0,MAMRA
4,,230.150742,107.253448,19068.0,624.842706,,0.951450,1.629395,,,0.714800,20041.0,MAMRA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2798,,192.709366,122.356506,18471.5,653.345233,,0.931000,1.838965,,,0.725739,19840.5,SANORA
2799,,186.254745,118.708961,17213.5,581.688379,,0.952706,1.564234,,,0.714016,18068.0,SANORA
2800,,186.196182,119.147224,17510.5,608.315795,,0.948821,1.681705,,,0.718999,18455.0,SANORA
2801,,188.660828,120.634438,17941.0,630.759446,,0.944810,1.764701,,,0.738191,18989.0,SANORA


In [7]:
def data_split_basic(data, train_p=0.8, random_state=None):
    """Shuffle the rows of ``data`` and split them into a train and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are shuffled and split. It is not modified.
    train_p : float, default 0.8
        Fraction of rows, in the closed interval [0, 1], assigned to the
        training set. The row count is ``int(train_p * len(data))``, i.e.
        truncated towards zero.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the
        shuffle (and therefore the split) reproducible; ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(train, test)``: the first ``int(train_p * len(data))`` shuffled
        rows and the remaining rows. Both are positional slices of one
        shuffled frame whose index was reset to 0..n-1.

    Raises
    ------
    ValueError
        If ``train_p`` is not within [0, 1].
    """
    # A fraction outside [0, 1] would otherwise be accepted silently: a
    # negative value becomes a negative slice bound (train keeps most rows),
    # and a value above 1 leaves the test set empty.
    if not 0 <= train_p <= 1:
        raise ValueError(f"train_p must be between 0 and 1, got {train_p!r}")

    # shuffle the data
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # split the data at a single, shared boundary
    n_train = int(train_p * len(shuffled))
    train = shuffled.iloc[:n_train, :]
    test = shuffled.iloc[n_train:, :]

    return train, test
    

In [24]:
def data_transform_method1(data, collapse_dims=False):
    """Drop two derived columns and tag each row with a 'Perspective' code.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Aspect Ratio', 'Eccentricity',
        'Length (major axis)', 'Width (minor axis)' and, when
        ``collapse_dims`` is true, 'Thickness (depth)'. It is not modified;
        a new frame is returned.
    collapse_dims : bool, default False
        When true, replace the three dimension columns with 'dim1' and
        'dim2' (see below). The default keeps the three original columns.

    Returns
    -------
    pandas.DataFrame
        The input without 'Aspect Ratio' and 'Eccentricity', plus an
        integer 'Perspective' column:

        * -1 where Length is null,
        *  1 where Width is null (and Length is not),
        *  0 otherwise. This is the fallback branch, so it covers rows where
           Thickness is null as well as rows where neither Length nor Width
           is null.

        With ``collapse_dims=True`` the frame additionally carries:

        * Length null  -> dim1 = Width,  dim2 = Thickness
        * Width null   -> dim1 = Length, dim2 = Thickness
        * otherwise    -> dim1 = Length, dim2 = Width

        and the three original dimension columns are removed.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    length_col = 'Length (major axis)'
    width_col = 'Width (minor axis)'
    thickness_col = 'Thickness (depth)'

    # drop the Aspect Ratio and Eccentricity columns (returns a new frame,
    # so the assignments below never touch the caller's data)
    data = data.drop(['Aspect Ratio', 'Eccentricity'], axis=1)

    length_missing = data[length_col].isnull()
    width_missing = data[width_col].isnull()

    # Length is tested first, so a row with both Length and Width null is -1.
    data['Perspective'] = np.where(length_missing, -1, np.where(width_missing, 1, 0))

    if collapse_dims:
        # Same precedence as 'Perspective': Length first, then Width.
        data['dim1'] = np.where(length_missing, data[width_col], data[length_col])
        data['dim2'] = np.where(length_missing | width_missing,
                                data[thickness_col], data[width_col])
        data = data.drop([length_col, width_col, thickness_col], axis=1)

    return data

In [4]:
# Scaling
# Calculate scaling parameters for each column in the training set, such as mean, std dev, min, max and save the scaling parameters to a json file without actually scaling the data.
def calculate_data_scale(train):
    """Return the mean, std, min and max rows of ``train.describe()``.

    The data itself is left unscaled; only the summary rows are extracted.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Four rows labelled 'mean', 'std', 'min' and 'max', with the columns
        that ``describe()`` reports for ``train``.
    """
    wanted_rows = ['mean', 'std', 'min', 'max']
    summary = train.describe()
    return summary.loc[wanted_rows, :]

# save scaling parameters to a file without scaling the data
def save_scaling_params(scaling_params, pth):
    """Write ``scaling_params`` to ``pth`` as JSON indented by four spaces.

    Only the parameters are serialised; no data is scaled here.

    Parameters
    ----------
    scaling_params : pandas.DataFrame
        Object whose ``to_json`` method is called.
    pth : str
        Destination passed to ``to_json``.
    """
    scaling_params.to_json(path_or_buf=pth, indent=4)


In [5]:
def save_datasets(train, test, dir):
    """Write the train/test frames as CSV files under ``data_dir/dir``.

    The target folder is created when it does not exist yet. A frame passed
    as ``None`` is skipped and its returned path is ``None``.

    Parameters
    ----------
    train, test : pandas.DataFrame or None
        Frames written to 'train.csv' and 'test.csv' without the index.
    dir : str
        Sub-folder name, joined onto the notebook-level ``data_dir``.

    Returns
    -------
    tuple
        ``(train_pth, test_pth)``: the path written for each frame, or
        ``None`` for a frame that was skipped.
    """
    out_dir = os.path.join(data_dir, dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)

    written = []
    for frame, file_name in ((train, 'train.csv'), (test, 'test.csv')):
        if frame is None:
            written.append(None)
            continue
        target = os.path.join(out_dir, file_name)
        frame.to_csv(target, index=False)
        written.append(target)

    train_pth, test_pth = written
    return train_pth, test_pth

In [6]:
# Baseline dataset: shuffle/split the loaded frame 90/10, compute the scaling
# summary from the training part and write both parts under data_dir/basic.
# NOTE: the recorded run of this cell raised NameError because it was
# executed (In [6]) before the cell defining data_split_basic (In [7]);
# run that definition cell first.
train, test = data_split_basic(data, train_p=0.9)
scaling_params = calculate_data_scale(train)
train_pth, test_pth = save_datasets(train, test, "basic")
# The scaling parameters are computed above but saving them is disabled here:
# save_scaling_params(scaling_params, os.path.join(data_dir, 'basic', 'scaling_params.json'))

NameError: name 'data_split_basic' is not defined

In [25]:
# Transformed dataset: apply data_transform_method1 to the loaded frame,
# shuffle/split it 90/10, then write the two parts and the training-set
# scaling summary under data_dir/transformed.
# Note: train, test, scaling_params, train_pth and test_pth are rebound here,
# replacing any values assigned by earlier cells.
transformed = data_transform_method1(data)
train, test = data_split_basic(transformed, train_p=0.9)
scaling_params = calculate_data_scale(train)
# save_datasets creates the 'transformed' folder, so it must run before the
# scaling parameters are written into that folder.
train_pth, test_pth = save_datasets(train, test, 'transformed')
save_scaling_params(scaling_params, os.path.join(data_dir, 'transformed', 'scaling_params.json'))