In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import listdir,chdir
from os.path import abspath,basename,dirname,join
import sklearn.preprocessing as prepro
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Load the raw abalone table from a CSV file given by a relative path
# (resolved against the current working directory).
abalone = pd.read_csv("abalone_dataset.csv")

In [3]:
# Preview the first rows of the freshly loaded table.
abalone.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
0,M,0.535,0.42,0.15,0.6995,0.2575,0.153,0.24,3
1,I,0.51,0.38,0.115,0.5155,0.215,0.1135,0.166,1
2,I,0.185,0.13,0.045,0.029,0.012,0.0075,0.0095,1
3,M,0.55,0.45,0.17,0.81,0.317,0.157,0.22,3
4,I,0.535,0.415,0.15,0.5765,0.3595,0.135,0.225,1


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
abalone.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
count,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0
mean,0.521392,0.405865,0.138263,0.818738,0.355398,0.178349,0.235616,1.991379
std,0.120756,0.0996,0.039206,0.48956,0.221473,0.109554,0.139215,0.824561
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.345,0.11,0.436375,0.1815,0.090875,0.1275,1.0
50%,0.54,0.42,0.14,0.787,0.3305,0.168,0.225,2.0
75%,0.61,0.48,0.165,1.141625,0.4975,0.250125,0.323625,3.0
max,0.815,0.65,0.515,2.8255,1.488,0.76,1.005,3.0


# Transformando a coluna `sex` em 3 colunas binárias (one-hot)

In [5]:
# Swap the categorical `sex` labels for integer codes. assign() returns a new
# frame in which the existing `sex` column is replaced at its current position.
abalone = abalone.assign(
    sex=LabelEncoder().fit_transform(abalone['sex'].tolist())
)

In [6]:
# Expand the integer sex codes into one indicator column per code
# (sex_0, sex_1, ...) and append those columns to the right of the table.
transformed_sex_feature = (
    OneHotEncoder()
    .fit_transform(abalone[['sex']].values)  # double brackets -> 2-D (n_rows, 1) input
    .toarray()
)
df_sex_encoded = pd.DataFrame(
    transformed_sex_feature,
    columns=["sex_%d" % code for code in range(transformed_sex_feature.shape[1])],
)
abalone = pd.concat([abalone, df_sex_encoded], axis=1)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
# Keep, in this order: the seven measurement columns, the three one-hot sex
# indicators and the `type` column. The integer-coded `sex` column is left out.
abalone = abalone.loc[:, [
    'length', 'diameter', 'height',
    'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight',
    'sex_0', 'sex_1', 'sex_2',
    'type',
]]

In [8]:
# Preview the table after the one-hot encoding and column selection.
abalone.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_0,sex_1,sex_2,type
0,0.535,0.42,0.15,0.6995,0.2575,0.153,0.24,0.0,0.0,1.0,3
1,0.51,0.38,0.115,0.5155,0.215,0.1135,0.166,0.0,1.0,0.0,1
2,0.185,0.13,0.045,0.029,0.012,0.0075,0.0095,0.0,1.0,0.0,1
3,0.55,0.45,0.17,0.81,0.317,0.157,0.22,0.0,0.0,1.0,3
4,0.535,0.415,0.15,0.5765,0.3595,0.135,0.225,0.0,1.0,0.0,1


In [9]:
# Save the one-hot-encoded table; index=False keeps the row index out of the file.
abalone.to_csv("abalone_sex_bin.csv", index=False)

# Scaling
- Transformando os valores das 7 colunas de medidas para o intervalo [0, 1] (as colunas `sex_*` e `type` não são alteradas)

In [10]:
# Set aside the last four columns (sex_0, sex_1, sex_2, type) as int64 so they
# can be re-attached unchanged to each rescaled copy of the measurements.
numsei = abalone.iloc[:, -4:].astype('int64')

In [11]:
# Scaler configured to map the data it is fitted on onto the range 0..1.
minmax_scaler = prepro.MinMaxScaler(feature_range=(0,1))

In [12]:
# Fit the scaler on the first seven columns (the measurements that come before
# the sex_* indicators) and rescale them in one step.
abaloneMinMax = minmax_scaler.fit_transform(abalone.iloc[:, :7])

In [13]:
# Wrap the scaled values back into a DataFrame, reusing the names of every
# column of `abalone` except the last four.
abalone_scaled = pd.DataFrame(
    data=abaloneMinMax,
    columns=abalone.columns[:-4],
)

In [14]:
# Re-attach the held-out integer columns by joining both frames on the row
# labels of `abalone`; the join key ends up in an extra `key_0` column.
abalone_scaled = abalone_scaled.merge(numsei, how='outer', on=abalone.index)

In [15]:
# Discard the helper `key_0` column that the merge on the index added.
abalone_scaled = abalone_scaled.drop('key_0', axis=1)

In [16]:
# Preview the min-max scaled table.
abalone_scaled.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_0,sex_1,sex_2,type
0,0.621622,0.613445,0.291262,0.247034,0.172495,0.20079,0.237668,0,0,1,3
1,0.587838,0.546218,0.223301,0.181866,0.143914,0.148782,0.163926,0,1,0,1
2,0.148649,0.12605,0.087379,0.009563,0.007397,0.009217,0.007972,0,1,0,1
3,0.641892,0.663866,0.330097,0.28617,0.212508,0.206057,0.217738,0,0,1,3
4,0.621622,0.605042,0.291262,0.203471,0.241089,0.17709,0.22272,0,1,0,1


In [17]:
# Save the min-max scaled table; index=False keeps the row index out of the file.
abalone_scaled.to_csv("abalone_min_max.csv", index=False)

# Normalization
- L1 -> A soma dos valores absolutos de cada linha é igual a 1.
- L2 -> A soma dos quadrados dos valores de cada linha é igual a 1.

In [18]:
# L1 normalisation: rescale every row of the seven measurement columns so that
# the absolute values in that row sum to 1.
abalone_l1 = pd.DataFrame(
    prepro.normalize(abalone.iloc[:, :7], norm='l1'),
    columns=abalone.columns[:7],
    index=abalone.index,  # same row labels as `numsei`, so the concat below lines up
)
# Re-attach the untouched sex_* / type columns side by side. A column-wise
# concat aligned on the index avoids the synthetic `key_0` column that a merge
# on an external key array adds (and that then has to be dropped).
abalone_l1 = pd.concat([abalone_l1, numsei], axis=1)

In [19]:
# Preview the L1-normalised table.
abalone_l1.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_0,sex_1,sex_2,type
0,0.217923,0.171079,0.0611,0.284929,0.104888,0.062322,0.09776,0,0,1,3
1,0.253102,0.188586,0.057072,0.255831,0.1067,0.056328,0.082382,0,1,0,1
2,0.442584,0.311005,0.107656,0.069378,0.028708,0.017943,0.022727,0,1,0,1
3,0.205684,0.168287,0.063575,0.302917,0.118549,0.058714,0.082274,0,0,1,3
4,0.223289,0.173205,0.062604,0.240609,0.150042,0.056344,0.093907,0,1,0,1


In [20]:
# Save the L1-normalised table; index=False keeps the row index out of the file.
abalone_l1.to_csv("abalone_L1.csv", index=False)

In [21]:
# L2 normalisation: rescale every row of the seven measurement columns so that
# the squares of the values in that row sum to 1.
abalone_l2 = pd.DataFrame(
    prepro.normalize(abalone.iloc[:, :7], norm='l2'),
    columns=abalone.columns[:7],
    index=abalone.index,  # same row labels as `numsei`, so the concat below lines up
)
# Re-attach the untouched sex_* / type columns side by side. A column-wise
# concat aligned on the index avoids the synthetic `key_0` column that a merge
# on an external key array adds (and that then has to be dropped).
abalone_l2 = pd.concat([abalone_l2, numsei], axis=1)

In [22]:
# Preview the L2-normalised table.
abalone_l2.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_0,sex_1,sex_2,type
0,0.505135,0.396555,0.141627,0.660452,0.243126,0.144459,0.226603,0,0,1,3
1,0.58115,0.433014,0.131044,0.587418,0.244995,0.129334,0.189159,0,1,0,1
2,0.794046,0.557978,0.193146,0.124472,0.051506,0.032191,0.040775,0,1,0,1
3,0.471009,0.385371,0.145585,0.693668,0.271472,0.134452,0.188404,0,0,1,3
4,0.531978,0.412656,0.149153,0.573244,0.35747,0.134238,0.223729,0,1,0,1


In [23]:
# Save the L2-normalised table. The frame written here must be `abalone_l2`:
# writing `abalone_l1` would make abalone_L2.csv a duplicate of abalone_L1.csv.
abalone_l2.to_csv("abalone_L2.csv", index=False)