### Questão 2 - Considere o breast cancer dataset.

In [1]:
# a. Identifique as features contínuas deste dataset.
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
# Adiciona target
df['target'] = data.target

print(f"Features Contínuas\n  {df.select_dtypes(include=['float64']).columns}")
print(f"Features Não Contínuas\n  {df.select_dtypes(exclude=['float64']).columns}\n")

Features Contínuas
  Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')
Features Não Contínuas
  Index(['target'], dtype='object')



In [2]:
# b. Selecione duas features contínuas do dataset e realize a discretização dessas variáveis por meio de quantização por bins fixos. Apresente os resultados.

# Visualizar intervalos
def getIntervals(intervals, f_name):
    print(" " + f_name)
    for i, inter in enumerate(intervals):
        print(f"  Bin {i}: {inter}")

# Selecionando duas features contínuas
features_to_discretize = ['mean radius', 'mean texture']

# Discretização com bins fixos
df_fixed_bins = df.copy()

# Discretização com 4 bins fixos
for feature in features_to_discretize:
    discretized = pd.cut(df[feature], bins=4)

    df_fixed_bins[f'{feature}_bin_fixed'] = discretized.cat.codes

    getIntervals(discretized.cat.categories, feature)
    
df_fixed_bins[[*features_to_discretize, 'mean radius_bin_fixed', 'mean texture_bin_fixed']].head(10).style.format("{:.2f}")

 mean radius
  Bin 0: (6.96, 12.263]
  Bin 1: (12.263, 17.545]
  Bin 2: (17.545, 22.828]
  Bin 3: (22.828, 28.11]
 mean texture
  Bin 0: (9.68, 17.102]
  Bin 1: (17.102, 24.495]
  Bin 2: (24.495, 31.888]
  Bin 3: (31.888, 39.28]


Unnamed: 0,mean radius,mean texture,mean radius_bin_fixed,mean texture_bin_fixed
0,17.99,10.38,2.0,0.0
1,20.57,17.77,2.0,1.0
2,19.69,21.25,2.0,1.0
3,11.42,20.38,0.0,1.0
4,20.29,14.34,2.0,0.0
5,12.45,15.7,1.0,0.0
6,18.25,19.98,2.0,1.0
7,13.71,20.83,1.0,1.0
8,13.0,21.82,1.0,1.0
9,12.46,24.04,1.0,1.0


In [3]:
# c. Selecione duas features contínuas do dataset (podem ser as mesmas do item (b)) e realize a discretização dessas variáveis por meio de quantização por bins variáveis. Apresente os resultados.

# Loop pelas features
for feature in features_to_discretize:
    # Discretização com quantis
    variable = pd.qcut(df[feature], q=4, duplicates='drop')
    df_fixed_bins[f'{feature}_bin_variable'] = variable.cat.codes
    getIntervals(variable.cat.categories, feature)

df_fixed_bins[[*features_to_discretize,'mean radius_bin_variable', 'mean texture_bin_variable']].head(10).style.format("{:.2f}")

 mean radius
  Bin 0: (6.9799999999999995, 11.7]
  Bin 1: (11.7, 13.37]
  Bin 2: (13.37, 15.78]
  Bin 3: (15.78, 28.11]
 mean texture
  Bin 0: (9.709000000000001, 16.17]
  Bin 1: (16.17, 18.84]
  Bin 2: (18.84, 21.8]
  Bin 3: (21.8, 39.28]


Unnamed: 0,mean radius,mean texture,mean radius_bin_variable,mean texture_bin_variable
0,17.99,10.38,3.0,0.0
1,20.57,17.77,3.0,1.0
2,19.69,21.25,3.0,2.0
3,11.42,20.38,0.0,2.0
4,20.29,14.34,3.0,0.0
5,12.45,15.7,1.0,0.0
6,18.25,19.98,3.0,2.0
7,13.71,20.83,2.0,2.0
8,13.0,21.82,1.0,3.0
9,12.46,24.04,1.0,3.0


### Questão 3 - Considere o breast cancer dataset.

In [4]:
# a. Defina uma função personalizada que realize uma normalização nas features do dataset usando FunctionTransformer do sklearn. Apresente os resultados obtidos.
from sklearn.preprocessing import FunctionTransformer

# Função de normalização Z-Score
def z_score_scaling(df):
    return (df - df.mean(axis=0)) / df.std(axis=0)

# Criando o transformer
min_max_transformer = FunctionTransformer(z_score_scaling)

# Aplicando a transformação
X_minmax = min_max_transformer.fit_transform(df)

# Convertendo para DataFrame
X_minmax_df = pd.DataFrame(X_minmax, columns=df.columns)

X_minmax_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,1.0961,-2.071512,1.268817,0.98351,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312,-1.296535
1,1.828212,-0.353322,1.684473,1.90703,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.14662,1.086129,-0.243675,0.280943,-1.296535
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052,1.36228,2.03544,0.938859,-0.397658,...,-0.023953,1.346291,1.455004,0.526944,1.08198,0.854222,1.953282,1.151242,0.201214,-1.296535
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,0.133866,-0.24972,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672,-1.296535
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.61264,0.728618,-0.86759,-0.396751,-1.296535


In [5]:
# b. Utilize o PowerTransform para normalizar as features do dataset. Apresente os resultados obtidos.
from sklearn.preprocessing import PowerTransformer

# Instanciando o PowerTransformer
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True) # aceita valores negativos

# Aplicando a transformação
X_power = power_transformer.fit_transform(df)

# Convertendo em DataFrame
X_power_df = pd.DataFrame(X_power, columns=df.columns)

X_power_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,1.134881,-2.678666,1.259822,1.126421,1.504114,2.165938,1.862988,1.848558,1.953067,1.886398,...,-1.488367,1.810506,1.65221,1.282792,1.942737,1.730182,1.935654,2.197206,1.723624,-1.297676
1,1.619346,-0.264377,1.528723,1.633946,-0.820227,-0.384102,0.291976,0.820609,0.102291,-0.956577,...,-0.288382,1.430616,1.610022,-0.32508,-0.29658,0.070746,1.101594,-0.121997,0.537179,-1.297676
2,1.464796,0.547806,1.454664,1.461645,0.963977,1.163977,1.403673,1.683104,0.985668,-0.292433,...,0.071406,1.321941,1.425307,0.580301,1.209701,1.005512,1.722744,1.218181,0.453955,-1.297676
3,-0.759262,0.357721,-0.514886,-0.836238,2.781494,2.197843,1.642391,1.423004,2.360528,2.708925,...,0.228089,-0.03948,-0.43686,2.857821,2.282276,1.675087,1.862378,3.250202,2.517606,-1.297676
4,1.57126,-1.23352,1.58334,1.59512,0.343932,0.762392,1.407479,1.410929,0.090964,-0.511332,...,-1.637882,1.316582,1.309486,0.284367,-0.131829,0.817474,0.807077,-0.943554,-0.279402,-1.297676


### Questão 4 - Considere o breast cancer dataset.

In [6]:
# a. Faça uma normalização das features do dataset usando o MinMaxScaler. Apresente os resultados obtidos.

# Função de normalização min-max
def min_max_scaling(df):
    return (df - df.min(axis=0)) / (df.max(axis=0) - df.min(axis=0))

# Criando o transformer
min_max_transformer = FunctionTransformer(min_max_scaling)

# Aplicando a transformação
X_minmax = min_max_transformer.fit_transform(df)

# Convertendo para DataFrame
X_minmax_df = pd.DataFrame(X_minmax, columns=df.columns)

X_minmax_df.head()



Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864,0.0
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878,0.0
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433,0.0
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711,0.0
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595,0.0


In [7]:
# b. Faça o escalonamento padrão das features do dataset usando o StandardScaler. Apresente os resultados obtidos.
from sklearn.preprocessing import StandardScaler

# Aplicando StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
X_scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

X_scaled_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015,-1.297676
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119,-1.297676
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391,-1.297676
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501,-1.297676
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971,-1.297676


In [8]:
# c. Faça a regularização das features do dataset usando a norma L2. Apresente os resultados obtidos.
from sklearn.preprocessing import Normalizer

# Aplicando normalização L2
l2_normalizer = Normalizer(norm='l2')
X_l2 = l2_normalizer.fit_transform(df)
X_l2_df = pd.DataFrame(X_l2, columns=df.columns)

X_l2_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,0.007925,0.004573,0.054099,0.440986,5.2e-05,0.000122,0.000132,6.5e-05,0.000107,3.5e-05,...,0.007635,0.081325,0.889462,7.1e-05,0.000293,0.000314,0.000117,0.000203,5.2e-05,0.0
1,0.008666,0.007486,0.055988,0.558619,3.6e-05,3.3e-05,3.7e-05,3e-05,7.6e-05,2.4e-05,...,0.009862,0.066899,0.824026,5.2e-05,7.9e-05,0.000102,7.8e-05,0.000116,3.8e-05,0.0
2,0.009367,0.010109,0.061842,0.572276,5.2e-05,7.6e-05,9.4e-05,6.1e-05,9.8e-05,2.9e-05,...,0.012145,0.072545,0.812984,6.9e-05,0.000202,0.000214,0.000116,0.000172,4.2e-05,0.0
3,0.016325,0.029133,0.110899,0.551922,0.000204,0.000406,0.000345,0.00015,0.000371,0.000139,...,0.037881,0.141333,0.811515,0.0003,0.001238,0.000982,0.000368,0.000949,0.000247,0.0
4,0.009883,0.006985,0.065808,0.631774,4.9e-05,6.5e-05,9.6e-05,5.1e-05,8.8e-05,2.9e-05,...,0.00812,0.074137,0.767189,6.7e-05,0.0001,0.000195,7.9e-05,0.000115,3.7e-05,0.0
