In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from google.colab import drive
drive.mount('/content/drive')

#Set header to none since there is no header in this data
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/pima-indians-diabetes.data.csv", header=None)
#Print the features of the first 5 samples.
df.head()

Mounted at /content/drive


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
#Calculate and print the correlation between the features (columns)
correlation_matrix = df.corr()
correlation_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
1,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
2,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
3,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
4,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548


In [3]:
#Rescale all 9 features into the range of (0,1) and print the scaled value of the features of the first 5 samples.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df)
scaled_data_df = pd.DataFrame(scaled_data)

# Display the scaled values of the first 5 samples
print(scaled_data_df.head())


          0         1         2         3         4         5         6  \
0  0.352941  0.743719  0.590164  0.353535  0.000000  0.500745  0.234415   
1  0.058824  0.427136  0.540984  0.292929  0.000000  0.396423  0.116567   
2  0.470588  0.919598  0.524590  0.000000  0.000000  0.347243  0.253629   
3  0.058824  0.447236  0.540984  0.232323  0.111111  0.418778  0.038002   
4  0.000000  0.688442  0.327869  0.353535  0.198582  0.642325  0.943638   

          7    8  
0  0.483333  1.0  
1  0.166667  0.0  
2  0.183333  1.0  
3  0.000000  0.0  
4  0.200000  1.0  


In [4]:
#Calculate and print the correlation of the features for the rescaled data.
scaled_correlation_matrix = scaled_data_df.corr()
scaled_correlation_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
1,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
2,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
3,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
4,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548


Have the correlations changed? (Please answer this question in a Text box inside your program.)

No, scaling changes the units but not the underlying relationships. Correlation refers to the relative positioning of the data points.

In [5]:
#Group data by ‘class’.
class_grouped = df.groupby(df.columns[-1])
for name, group in class_grouped:
    print(f"Class: {name}")
    print(group.head())  # Print the first 5 rows of each class

Class: 0
     0    1   2   3   4     5      6   7  8
1    1   85  66  29   0  26.6  0.351  31  0
3    1   89  66  23  94  28.1  0.167  21  0
5    5  116  74   0   0  25.6  0.201  30  0
7   10  115   0   0   0  35.3  0.134  29  0
10   4  110  92   0   0  37.6  0.191  30  0
Class: 1
   0    1   2   3    4     5      6   7  8
0  6  148  72  35    0  33.6  0.627  50  1
2  8  183  64   0    0  23.3  0.672  32  1
4  0  137  40  35  168  43.1  2.288  33  1
6  3   78  50  32   88  31.0  0.248  26  1
8  2  197  70  45  543  30.5  0.158  53  1


In [6]:
#Calculate the correlation of the features (columns) over samples in ‘class’=0
#Calculate the correlaƟon of the features (columns) over samples in ‘class’=1
correlations = {}
for class_value, group in class_grouped:
    print(f"Correlation matrix for class {class_value}:")
    grouped_correlation_matrix = group.drop(df.columns[-1], axis=1).corr()
    print(grouped_correlation_matrix)
    correlations[class_value] = grouped_correlation_matrix

Correlation matrix for class 0:
          0         1         2         3         4         5         6  \
0  1.000000  0.098685  0.133096 -0.118340 -0.131986  0.016495 -0.079951   
1  0.098685  1.000000  0.192795  0.016015  0.352957  0.131749  0.095548   
2  0.133096  0.192795  1.000000  0.187072  0.074626  0.363178  0.027292   
3 -0.118340  0.016015  0.187072  1.000000  0.412790  0.438606  0.095181   
4 -0.131986  0.352957  0.074626  0.412790  1.000000  0.254202  0.227385   
5  0.016495  0.131749  0.363178  0.438606  0.254202  1.000000  0.070664   
6 -0.079951  0.095548  0.027292  0.095181  0.227385  0.070664  1.000000   
7  0.572776  0.228018  0.214694 -0.163788 -0.149234  0.036070  0.041665   

          7  
0  0.572776  
1  0.228018  
2  0.214694  
3 -0.163788  
4 -0.149234  
5  0.036070  
6  0.041665  
7  1.000000  
Correlation matrix for class 1:
          0         1         2         3         4         5         6  \
0  1.000000 -0.054591  0.126963 -0.079165 -0.078563 -0.1590

Are the correlations equal to that of the whole data (item (3) above)?  (Please answer this question in
a Text box inside your program.) (Hint: You may see NaN for some of the results. Do not worry. This is
because the value of one of the features for all of the samples are equal; 0 in this case).

No, see extra code below where i compare the correlations


In [7]:
are_correlations_equal = True
for class_value, group in df.groupby(df.columns[-1]):
    group_corr = group.drop(df.columns[-1], axis=1).corr()
    if not group_corr.equals(correlation_matrix.drop(df.columns[-1], axis=1)):
        are_correlations_equal = False
        break

print(f"Are the correlations within each class equal to the whole dataset's correlations? {are_correlations_equal}")

Are the correlations within each class equal to the whole dataset's correlations? False


Are the correlations between features in class 1 and class 0 the same? (Please answer this question
in a Text box inside your program.)

No, see extra code below where i compare the correlations

In [8]:
class_0_data = df[df[df.columns[-1]] == 0].drop(df.columns[-1], axis=1)
class_1_data = df[df[df.columns[-1]] == 1].drop(df.columns[-1], axis=1)

# Calculate the correlation matrix for each class
class_0_corr = class_0_data.corr()
class_1_corr = class_1_data.corr()

# Compare the correlation matrices between class 1 and class 0
# This is a strict comparison, in practice you might consider a tolerance for floating-point comparisons
are_correlations_equal = class_0_corr.equals(class_1_corr)

print(f"Are the correlations between features in class 1 and class 0 the same? {are_correlations_equal}")

Are the correlations between features in class 1 and class 0 the same? False
