<a href="https://colab.research.google.com/github/JessicaaaJe/Jesscia_Data_Mining_Project/blob/main/introduction_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime; the dataset CSV is read from
# beneath this mount point in a later cell.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)


Mounted at /content/drive


In [None]:
# import
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [None]:
# Load the diabetes CSV from the mounted Drive folder into a DataFrame
# (one row per observation, one column per attribute) and preview it.
dataset_path = '/content/drive/MyDrive/share_folders_csc373/Data/diabetes_dataset.csv'
data = pd.read_csv(dataset_path)

preview = data.head()
print(preview)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [None]:
# List of attributes to scale
attributes_to_scale = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction"]

# Rescale each listed column with MinMaxScaler (default feature range [0, 1]).
# The scaler is fitted exactly once; the resulting array is reused for both
# dataframes below instead of re-fitting on the same data a second time.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data[attributes_to_scale])

# get a dataframe that only includes scaled attributes.
# Reusing data's index keeps its rows aligned with `data` / `data_cp`.
only_scaled_df = pd.DataFrame(scaled_data, columns=attributes_to_scale, index=data.index)
print(only_scaled_df.head())

# get a dataframe that contains 6 scaled attributes and 3 original attributes.
# `data` itself is left untouched; only the copy receives the scaled columns.
data_cp = data.copy()
data_cp[attributes_to_scale] = scaled_data
print(data_cp.head())

    Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0  0.743719       0.590164       0.353535  0.000000  0.500745   
1  0.427136       0.540984       0.292929  0.000000  0.396423   
2  0.919598       0.524590       0.000000  0.000000  0.347243   
3  0.447236       0.540984       0.232323  0.111111  0.418778   
4  0.688442       0.327869       0.353535  0.198582  0.642325   

   DiabetesPedigreeFunction  
0                  0.234415  
1                  0.116567  
2                  0.253629  
3                  0.038002  
4                  0.943638  
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0            6  0.743719       0.590164       0.353535  0.000000  0.500745   
1            1  0.427136       0.540984       0.292929  0.000000  0.396423   
2            8  0.919598       0.524590       0.000000  0.000000  0.347243   
3            1  0.447236       0.540984       0.232323  0.111111  0.418778   
4            0  0.688442       0.327869

In [None]:
# calculate Euclidean Distance between two observations
# this function will perform (x-y)^2 on each component in the vector of x and y, sum them up,
# and take the square root of that sum.
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two observations.

    Parameters
    ----------
    x, y : array-like of numbers
        The two observation vectors. Lists, tuples and NumPy arrays are
        accepted; both are converted to float arrays before subtracting.

    Returns
    -------
    numpy.float64
        sqrt(sum((x_i - y_i) ** 2)) over all components.
    """
    # Convert to float first: subtracting/squaring in a narrow or unsigned
    # integer dtype wraps around silently, and plain Python lists do not
    # support element-wise subtraction at all.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.sqrt(np.sum((x - y) ** 2))

In [None]:
# cosine similarity is calculated by inner product between x and y over the product of magnitude of x and
# y.
def cosine_similarity(x, y):
    """Return the cosine similarity between two observation vectors.

    Parameters
    ----------
    x, y : array-like of numbers
        The two observation vectors. Lists, tuples and NumPy arrays are
        accepted; both are converted to float arrays before use.

    Returns
    -------
    float
        dot(x, y) / (||x|| * ||y||), clipped to [-1, 1]. NaN is returned when
        either vector has zero magnitude, because the similarity is undefined
        in that case.
    """
    # Work in float so the inner product cannot overflow a narrow integer dtype.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    inner_product = np.dot(x, y)
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)

    # A zero vector has no direction; return NaN explicitly rather than
    # dividing 0 by 0 and emitting a RuntimeWarning.
    if norm_product == 0:
        return np.nan

    # Floating-point rounding can push the ratio marginally past +/-1.
    return np.clip(inner_product / norm_product, -1.0, 1.0)

In [None]:
def format_array(arr):
    """Render each element of `arr` with three decimal places, joined by ', '."""
    pieces = []
    for value in arr:
        pieces.append(format(value, ".3f"))
    return ', '.join(pieces)

In [None]:

# Interactive comparison loop.
# Repeatedly asks for two row indices into only_scaled_df, then prints both
# observations together with their Euclidean distance and cosine similarity.
# Entering a negative index at either prompt writes data_cp to Drive and stops.

# Loop-invariant values are computed once up front.
last_idx = len(only_scaled_df) - 1
first_prompt = f"Enter the index of the first observation (0 to {last_idx} or a negative number to save and exit): "
second_prompt = f"Enter the index of the second observation (0 to {last_idx} or a negative number to save and exit): "
# Save the updated data dataframe to Google Drive with '_out' appended
output_filename = "/content/drive/MyDrive/Chen_Jessica_CSC_373/Introduction and Data/diabetes_dataset_out.csv"

while True:
    try:
        # Get user input for indices. A negative first index already means
        # "save and exit", so the second prompt is skipped in that case.
        x_idx = int(input(first_prompt))
        exit_requested = x_idx < 0
        if not exit_requested:
            y_idx = int(input(second_prompt))
            exit_requested = y_idx < 0

        if exit_requested:
            data_cp.to_csv(output_filename, index=False)
            print(f"Data saved to {output_filename}. Exiting program.")
            break

        # get observations based on indices from only_scaled_df
        # (.iloc raises IndexError for positions past the last row)
        x = only_scaled_df.iloc[x_idx].values
        y = only_scaled_df.iloc[y_idx].values

        # Compute the metrics
        eu_dist = euclidean_distance(x, y)
        cos_sim = cosine_similarity(x, y)

        # Display results
        print(f"\nObservation {x_idx}: {format_array(x)}")
        print(f"Observation {y_idx}: {format_array(y)}")

        print(f"Euclidean Distance: {eu_dist:.3f}")
        print(f"Cosine Similarity: {cos_sim:.3f}\n")

    except ValueError:
        # int() could not parse what the user typed.
        print("Please enter a valid integer index.")
    except IndexError:
        print("Index out of range. Please enter a valid index.")



Enter the index of the first observation (0 to 767 or a negative number to save and exit): 1
Enter the index of the second observation (0 to 767 or a negative number to save and exit): 4

Observation 1: 0.427, 0.541, 0.293, 0.000, 0.396, 0.117
Observation 4: 0.688, 0.328, 0.354, 0.199, 0.642, 0.944
Euclidean Distance: 0.949
Cosine Similarity: 0.767

Enter the index of the first observation (0 to 767 or a negative number to save and exit): 3
Enter the index of the second observation (0 to 767 or a negative number to save and exit): 6

Observation 3: 0.447, 0.541, 0.232, 0.111, 0.419, 0.038
Observation 6: 0.392, 0.410, 0.323, 0.104, 0.462, 0.073
Euclidean Distance: 0.178
Cosine Similarity: 0.979

Enter the index of the first observation (0 to 767 or a negative number to save and exit): -1
Enter the index of the second observation (0 to 767 or a negative number to save and exit): -2
Data saved to /content/drive/MyDrive/Chen_Jessica_CSC_373/Introduction and Data/diabetes_dataset_out.csv. E