In [43]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Apply seaborn's "whitegrid" theme to plots created after this cell.
sns.set_theme(style="whitegrid")

In [18]:
# Load the raw train and test tables from CSV files under data/ (paths are
# relative to the notebook's working directory).
train_pd = pd.read_csv('data/train.csv')
test_pd = pd.read_csv('data/test.csv')

In [19]:
def encode_sex_column(data):
    """Replace the ``Sex`` column of ``data`` with one-hot indicator columns.

    A fresh ``OneHotEncoder`` is fitted on every call, so the generated
    columns (named via ``get_feature_names_out(["Sex"])``) depend on the
    values present in this particular frame.

    Args:
        data: DataFrame containing a ``Sex`` column.

    Returns:
        A new DataFrame with ``Sex`` dropped and the indicator columns
        appended on the right. The row index of ``data`` is preserved.
    """
    encoder = OneHotEncoder()
    sex_encoded = encoder.fit_transform(data[["Sex"]])
    encoded_df = pd.DataFrame(
        sex_encoded.toarray(),
        columns=encoder.get_feature_names_out(["Sex"]),
        # pd.concat(axis=1) aligns rows on index labels. Without reusing the
        # caller's index the encoded frame would get a fresh 0..n-1 index and
        # any input indexed differently would come back with misaligned rows
        # padded with NaN.
        index=data.index,
    )
    return pd.concat([data.drop("Sex", axis=1), encoded_df], axis=1)

In [20]:
# One-hot encode Sex in both frames. Each call fits its own encoder, so the two
# frames only end up with the same Sex_* columns if both contain the same set
# of Sex values.
train_pd = encode_sex_column(train_pd)
test_pd = encode_sex_column(test_pd)

In [21]:
# Preview the first rows of the training frame after encoding.
train_pd.head()

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11,1.0,0.0,0.0
1,1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11,1.0,0.0,0.0
2,2,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6,0.0,1.0,0.0
3,3,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10,0.0,0.0,1.0
4,4,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9,0.0,1.0,0.0


In [23]:
# Move `id` from a regular column into the row index. The later export uses
# DataFrame.to_numpy(), which only covers columns, so `id` stays out of the
# saved feature matrix.
train_pd = train_pd.set_index("id")

In [25]:
# Same treatment for the test frame: `id` becomes the row index.
test_pd = test_pd.set_index("id")

In [24]:
# Preview the training frame now that `id` is the index.
train_pd.head()

Unnamed: 0_level_0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,Sex_F,Sex_I,Sex_M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11,1.0,0.0,0.0
1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11,1.0,0.0,0.0
2,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6,0.0,1.0,0.0
3,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10,0.0,0.0,1.0
4,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9,0.0,1.0,0.0


In [29]:
def calculate_percentiles(data: pd.DataFrame, column: str, lower: float = 0.01, upper: float = 0.99):
    """Return a low and a high quantile of one column.

    Args:
        data: Frame holding the column to summarise.
        column: Name of the column in ``data``.
        lower: Quantile level of the low cut-off, as a fraction. Defaults to
            0.01 (1st percentile).
        upper: Quantile level of the high cut-off, as a fraction. Defaults to
            0.99 (99th percentile).

    Returns:
        Tuple ``(low, high)`` of floats: the ``lower`` and ``upper`` quantiles
        of ``data[column]``.
    """
    # A single quantile call computes both cut-offs; the resulting Series
    # follows the order of the requested levels.
    bounds = data[column].quantile(q=[lower, upper])
    return float(bounds.iloc[0]), float(bounds.iloc[1])

In [30]:
# Print the 1st and 99th percentile of every column except the Rings target.
for col in train_pd.columns:
    if col != "Rings":
        p01, p99 = calculate_percentiles(train_pd, col)
        print(col)
        print(p01, p99)

Length
0.19 0.72
Diameter
0.135 0.57
Height
0.045 0.215
Whole weight
0.032 1.93843
Whole weight.1
0.0125 0.8968600000000007
Whole weight.2
0.0065 0.422
Shell weight
0.01 0.57
Sex_F
0.0 1.0
Sex_I
0.0 1.0
Sex_M
0.0 1.0


In [33]:
# Summary statistics for every column, with the 1st and 99th percentiles shown
# next to the quartiles so the tails can be compared against min/max.
train_pd.describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])

Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,Sex_F,Sex_I,Sex_M
count,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0
mean,0.517098,0.401679,0.135464,0.789035,0.340778,0.169422,0.225898,9.696794,0.292391,0.365204,0.342405
std,0.118217,0.098026,0.038008,0.457671,0.204428,0.100909,0.130203,3.176221,0.454863,0.48149,0.474517
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0,0.0,0.0,0.0
1%,0.19,0.135,0.045,0.032,0.0125,0.0065,0.01,4.0,0.0,0.0,0.0
25%,0.445,0.345,0.11,0.419,0.1775,0.0865,0.12,8.0,0.0,0.0,0.0
50%,0.545,0.425,0.14,0.7995,0.33,0.166,0.225,9.0,0.0,0.0,0.0
75%,0.6,0.47,0.16,1.0675,0.463,0.2325,0.305,11.0,1.0,1.0,1.0
99%,0.72,0.57,0.215,1.93843,0.89686,0.422,0.57,20.0,1.0,1.0,1.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0,1.0,1.0,1.0


In [46]:
# Hold out 20% of the training rows as a validation set; random_state pins the
# shuffle. NOTE: the result overwrites train_pd, so re-running this cell without
# reloading the data splits the already-reduced frame again.
train_pd, val_pd = train_test_split(train_pd, test_size=0.2, random_state=42)

In [39]:
def save_to_numpy_file(data: pd.DataFrame, subset: str, target: str = "Rings", out_dir="data"):
    """Export a frame as ``.npy`` arrays: features as ``X_<subset>.npy`` and,
    when present, the target column as ``y_<subset>.npy``.

    Args:
        data: Frame to export. It is not modified; the target column is
            dropped from a copy before the features are written.
        subset: Tag used in the file names (e.g. ``"train"``).
        target: Name of the label column. If ``data`` has no such column,
            only the feature file is written. Defaults to ``"Rings"``.
        out_dir: Directory that receives the files; created if missing.
            Defaults to ``"data"``.

    Returns:
        None. The arrays are written to disk with ``np.save``.
    """
    out_path = Path(out_dir)
    # Create the destination up front so np.save does not fail on a missing
    # directory.
    out_path.mkdir(parents=True, exist_ok=True)

    if target in data.columns:
        np.save(out_path / f"y_{subset}.npy", data[target].to_numpy())
        data = data.drop(target, axis=1)

    # to_numpy() exports the remaining columns only; the index is not included.
    np.save(out_path / f"X_{subset}.npy", data.to_numpy())

In [47]:
# Write the training split to disk as X_train.npy / y_train.npy.
save_to_numpy_file(train_pd, "train")

In [48]:
# Write the validation split to disk as X_val.npy / y_val.npy.
save_to_numpy_file(val_pd, "val")

In [41]:
# Write the test features to X_test.npy. Presumably test_pd has no Rings
# column, in which case no y_test.npy is produced — TODO confirm.
save_to_numpy_file(test_pd, "test")