In [112]:
import pandas as pd
import matplotlib.pyplot as plt

In [113]:
# Load the three CSV files of the breast-cancer dataset in one pass:
# the labelled training set, the test set, and the example solution file.
train_data, test_data, solution = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./datasets_kaggle/breast-cancer/breast-cancer-diagnostic.shuf.lrn.csv",
        "./datasets_kaggle/breast-cancer/breast-cancer-diagnostic.shuf.tes.csv",
        "./datasets_kaggle/breast-cancer/breast-cancer-diagnostic.shuf.sol.ex.csv",
    )
)

# Notes
* Train a decision tree with and without normalization and compare the results

## Training data set

In [114]:
# Report the dimensions of the training frame (row count, then column count).
print(f"Samples: {train_data.shape[0]}")
print(f"Features: {train_data.shape[1]}")

# Walk every column and decode raw bytes values to str where they occur.
for column in train_data.columns:
    # Only object-dtype columns (usually strings) can need decoding; skip the rest.
    if train_data[column].dtype != 'object':
        continue
    train_data[column] = train_data[column].apply(
        lambda value: value.decode() if isinstance(value, bytes) else value
    )


Samples: 285
Features: 32


In [115]:
# Display the training frame as the cell output (the notebook renders a truncated view).
train_data

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,886452,True,13.96,17.05,91.43,602.4,0.10960,0.12790,0.09789,0.05246,...,16.39,22.07,108.10,826.0,0.1512,0.3262,0.3209,0.13740,0.3068,0.07957
1,84348301,True,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.91,26.50,98.87,567.7,0.2098,0.8663,0.6869,0.25750,0.6638,0.17300
2,9012795,True,21.37,15.10,141.30,1386.0,0.10010,0.15150,0.19320,0.12550,...,22.69,21.84,152.10,1535.0,0.1192,0.2840,0.4024,0.19660,0.2730,0.08666
3,894326,True,18.22,18.87,118.70,1027.0,0.09746,0.11170,0.11300,0.07950,...,21.84,25.00,140.90,1485.0,0.1434,0.2763,0.3853,0.17760,0.2812,0.08198
4,867387,False,15.71,13.93,102.00,761.7,0.09462,0.09462,0.07135,0.05933,...,17.50,19.25,114.30,922.8,0.1223,0.1949,0.1709,0.13740,0.2723,0.07071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,911150,False,14.53,19.34,94.25,659.7,0.08388,0.07800,0.08817,0.02925,...,16.30,28.39,108.10,830.5,0.1089,0.2649,0.3779,0.09594,0.2471,0.07463
281,857156,False,13.49,22.30,86.91,561.0,0.08752,0.07698,0.04751,0.03384,...,15.15,31.82,99.00,698.8,0.1162,0.1711,0.2282,0.12820,0.2871,0.06917
282,8910251,False,10.60,18.95,69.28,346.4,0.09688,0.11470,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.2940,0.07587
283,8910499,False,13.59,21.84,87.16,561.0,0.07956,0.08259,0.04072,0.02142,...,14.80,30.04,97.66,661.5,0.1005,0.1730,0.1453,0.06189,0.2446,0.07024


### Feature information

In [116]:
# Print a per-feature table of min, max and max/min ratio, and record each
# feature's (min, max) pair for the min-max normalization done in later cells.
# The header is built by concatenation (not print's space separator) so that it
# lines up with the data rows, which are concatenated the same way.
print("Column".ljust(30) + "Min".ljust(15) + "Max".ljust(10) + "Max/Min".ljust(10))
print("=" * 65)
# Maps column name -> (min, max) observed in the training data.
# The annotation is a string so it is never evaluated at runtime.
data_normalize_values: "dict[str, tuple[float, float]]" = {}
for column in train_data.columns:
    # 'ID' and 'class' are excluded from the range table and from normalization.
    if column not in ['ID', 'class']:
        min_val = train_data[column].min()
        max_val = train_data[column].max()
        # The ratio is undefined for a minimum of 0; show 'inf' instead of dividing.
        quot = 'inf'
        if min_val != 0:
            quot = max_val / min_val
        data_normalize_values[column] = (min_val, max_val)
        print(f"{column.ljust(30)}{str(min_val).ljust(15)}{str(max_val).ljust(10)}{str(quot).ljust(10)}")

Column                         Min             Max        Max/Min   
radiusMean                    7.691          25.73     3.3454687296840464
 textureMean                  9.71           39.28     4.045314109165808
 perimeterMean                47.98          174.2     3.6306794497707378
 areaMean                     170.4          2010.0    11.795774647887324
 smoothnessMean               0.06251        0.1634    2.613981762917933
 compactnessMean              0.01938        0.3454    17.82249742002064
 concavityMean                0.0            0.4264    inf       
 concavePointsMean            0.0            0.1913    inf       
 symmetryMean                 0.106          0.304     2.8679245283018866
 fractalDimensionMean         0.04996        0.09744   1.9503602882305846
 radiusStdErr                 0.1115         1.37      12.286995515695068
 textureStdErr                0.3871         3.896     10.064582795143373
 perimeterStdErr              0.873          11.07     12.6804

### Normalize Values

In [117]:
# Min-max scale a copy of the training frame: each feature column becomes
# (value - min) / (max - min), using the extremes stored in data_normalize_values.
norm_train_data = train_data.copy()
for column in norm_train_data.columns:
    # Identifier and label columns are left unscaled.
    if column in ['ID', 'class']:
        continue
    min_val, max_val = data_normalize_values[column]
    norm_train_data[column] = (norm_train_data[column] - min_val) / (max_val - min_val)
# Last expression of the cell: show the scaled frame.
norm_train_data

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,886452,True,0.347525,0.248225,0.344240,0.234834,0.466746,0.332863,0.229573,0.274229,...,0.315393,0.305936,0.306676,0.200106,0.494871,0.285128,0.274274,0.472165,0.296136,0.159958
1,84348301,True,0.206719,0.360839,0.234511,0.117254,0.792844,0.811361,0.566135,0.549922,...,0.254867,0.440791,0.253876,0.114304,0.909445,0.812734,0.587094,0.884880,1.000000,0.773459
2,9012795,True,0.758301,0.182279,0.739344,0.660796,0.372584,0.405251,0.453096,0.656038,...,0.573041,0.298935,0.558378,0.435623,0.268482,0.243904,0.343932,0.675601,0.229495,0.206514
3,894326,True,0.583680,0.309773,0.560292,0.465645,0.346417,0.283173,0.265009,0.415578,...,0.538279,0.395129,0.494308,0.419014,0.439689,0.236382,0.329316,0.610309,0.245662,0.175783
4,867387,False,0.444537,0.142712,0.427983,0.321429,0.318267,0.230783,0.167331,0.310141,...,0.360788,0.220091,0.342143,0.232261,0.290414,0.156865,0.146068,0.472165,0.228115,0.101779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,911150,False,0.379123,0.325668,0.366582,0.265982,0.211815,0.179805,0.206778,0.152901,...,0.311713,0.498326,0.306676,0.201601,0.195614,0.225246,0.322991,0.329691,0.178431,0.127520
281,857156,False,0.321470,0.425769,0.308430,0.212329,0.247894,0.176676,0.111421,0.176895,...,0.264682,0.602740,0.254619,0.157853,0.247259,0.133616,0.195043,0.440550,0.257295,0.091667
282,8910251,False,0.161262,0.312479,0.168753,0.095673,0.340668,0.292375,0.149789,0.138108,...,0.130950,0.332420,0.136091,0.066835,0.283339,0.212156,0.163761,0.272371,0.270899,0.135662
283,8910499,False,0.327014,0.410213,0.310410,0.212329,0.168996,0.193884,0.095497,0.111971,...,0.250368,0.548554,0.246954,0.145462,0.136187,0.135472,0.124188,0.212680,0.173502,0.098693


## Test data

In [118]:
# Report the dimensions of the test frame (row count, then column count).
print("Samples: " + str(test_data.shape[0]))
print("Features: " + str(test_data.shape[1]))

# Iterate over all columns of the *test* frame. The earlier version looped over
# train_data here, so the test set was never processed.
for column in test_data.columns:
    # Convert bytes to string for each column if necessary
    if test_data[column].dtype == 'object':  # Check if column dtype is object (usually indicates strings)
        test_data[column] = test_data[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x)

Samples: 284
Features: 31


In [119]:
# Display the test frame as the cell output (the notebook renders a truncated view).
test_data

Unnamed: 0,ID,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,symmetryMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,9012568,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,...,16.20,15.73,104.50,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766
1,844981,13.00,21.82,87.50,519.8,0.12730,0.19320,0.18590,0.09353,0.2350,...,15.49,30.73,106.20,739.3,0.1703,0.5401,0.5390,0.20600,0.4378,0.10720
2,8813129,13.27,17.02,84.55,546.4,0.08445,0.04994,0.03554,0.02456,0.1496,...,15.14,23.60,98.84,708.8,0.1276,0.1311,0.1786,0.09678,0.2506,0.07623
3,881094802,17.42,25.56,114.50,948.0,0.10060,0.11460,0.16820,0.06597,0.1308,...,18.07,28.07,120.40,1021.0,0.1243,0.1793,0.2803,0.10990,0.1603,0.06818
4,88350402,13.64,15.60,87.38,575.3,0.09423,0.06630,0.04705,0.03731,0.1717,...,14.85,19.05,94.11,683.4,0.1278,0.1291,0.1533,0.09222,0.2530,0.06510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,9010598,12.76,18.84,81.87,496.6,0.09676,0.07952,0.02688,0.01781,0.1759,...,13.75,25.99,87.82,579.7,0.1298,0.1839,0.1255,0.08312,0.2744,0.07238
280,862485,11.60,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.03350,0.1620,...,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756
281,8910506,12.87,16.21,82.38,512.2,0.09425,0.06219,0.03900,0.01615,0.2010,...,13.90,23.64,89.27,597.5,0.1256,0.1808,0.1992,0.05780,0.3604,0.07062
282,8812877,15.75,20.25,102.60,761.3,0.10250,0.12040,0.11470,0.06462,0.1935,...,19.56,30.29,125.90,1088.0,0.1552,0.4480,0.3976,0.14790,0.3993,0.10640


### Normalize
Normalize the test set using the same per-feature minimum and maximum values that were computed from the training data.

In [120]:
# Min-max scale a copy of the test frame with the (min, max) pairs stored in
# data_normalize_values, i.e. the same values used for the training frame.
# Because the extremes are not taken from the test data itself, scaled test
# values are not guaranteed to fall inside [0, 1].
norm_test_data = test_data.copy()
for column in norm_test_data.columns:
    # Identifier and label columns are left unscaled.
    if column in ['ID', 'class']:
        continue
    min_val, max_val = data_normalize_values[column]
    norm_test_data[column] = (norm_test_data[column] - min_val) / (max_val - min_val)
# Last expression of the cell: show the scaled frame.
norm_test_data

Unnamed: 0,ID,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,symmetryMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,9012568,0.415710,0.118363,0.393519,0.294303,0.169690,0.153242,0.079573,0.138892,0.333838,...,0.307623,0.112938,0.286082,0.197814,0.221790,0.136156,0.116410,0.281031,0.181585,0.081752
1,844981,0.294307,0.409537,0.313104,0.189933,0.642185,0.533157,0.435976,0.488918,0.651515,...,0.278587,0.569559,0.295807,0.171306,0.629996,0.494080,0.460684,0.707904,0.554416,0.341388
2,8813129,0.309274,0.247210,0.289732,0.204392,0.217465,0.093737,0.083349,0.128385,0.220202,...,0.264273,0.352511,0.253704,0.161175,0.327909,0.094541,0.152650,0.332577,0.185331,0.138026
3,881094802,0.539331,0.536016,0.527016,0.422701,0.377540,0.292068,0.394465,0.344851,0.125253,...,0.384099,0.488584,0.377038,0.264882,0.304563,0.141626,0.239573,0.377663,0.007295,0.085166
4,88350402,0.329785,0.199188,0.312153,0.220102,0.314402,0.143918,0.110342,0.195034,0.331818,...,0.252413,0.214003,0.226646,0.152737,0.329324,0.092588,0.131026,0.316907,0.190063,0.064942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,9010598,0.281002,0.308759,0.268499,0.177321,0.339479,0.184467,0.063039,0.093100,0.353030,...,0.207427,0.425266,0.190664,0.118290,0.343474,0.146120,0.107265,0.285636,0.232256,0.112745
280,862485,0.216697,0.105851,0.208842,0.131659,0.270790,0.171370,0.098405,0.175118,0.282828,...,0.179208,0.156469,0.162863,0.095967,0.437566,0.147292,0.164274,0.290344,0.237776,0.212424
281,8910506,0.287100,0.219817,0.272540,0.185801,0.314600,0.131311,0.091463,0.084422,0.479798,...,0.213561,0.353729,0.198959,0.124203,0.313760,0.143092,0.170256,0.198625,0.401814,0.101189
282,8812877,0.446754,0.356442,0.432736,0.321211,0.396372,0.309858,0.268996,0.337794,0.441919,...,0.445035,0.556164,0.408501,0.287138,0.523169,0.404111,0.339829,0.508247,0.478509,0.336135
