# Preparing the dataset

In [124]:
import pandas as pd
import csv

# Location of the assignment CSV.
# NOTE(review): this is a machine-specific absolute path; the cell only runs where this file exists.
url = r"C:\Users\macna\OneDrive\School\UNIC\Spring 2023\BLOC-514\Assignment 1\Part B1 Data.csv"

# The file has no header row, and every '?' cell is read as NaN.
# In the provided data these are User A => Address #2 and User D => Address #2.
df = pd.read_csv(url, na_values=['?'], header=None)

# Label the seven columns 'Address 1' ... 'Address 7'.
# The row index (leftmost column of the printout) identifies the user:
# 0 = User A, 1 = User B, 2 = User C, and so on...
df.columns = [f'Address {n}' for n in range(1, 8)]

# Set the pandas display width to 120 characters before printing the table
pd.options.display.width = 120

print(df)

   Address 1  Address 2  Address 3  Address 4  Address 5  Address 6  Address 7
0          5        NaN          0          6          0          4          0
1          3        0.0          3          0          0          0          3
2          0        0.0          0          0          6          0          8
3          4        NaN          0          7          0          3          0
4          3        0.0          4          0          0          0          0
5          0        0.0          0          0          0          9          7
6          0        0.0          0          0          7         10          0
7          0        0.0          0          0          0          8          9
8         24       87.0         15         21          0         24         32
9          6       16.0          1          5          0          3          2


In [125]:
# Swap rows and columns so that each address becomes a row and each user a column;
# this is the orientation that is handed to the KNNImputer class.
df_tranposed = df.transpose()
print(df_tranposed)

             0    1    2    3    4    5     6    7     8     9
Address 1  5.0  3.0  0.0  4.0  3.0  0.0   0.0  0.0  24.0   6.0
Address 2  NaN  0.0  0.0  NaN  0.0  0.0   0.0  0.0  87.0  16.0
Address 3  0.0  3.0  0.0  0.0  4.0  0.0   0.0  0.0  15.0   1.0
Address 4  6.0  0.0  0.0  7.0  0.0  0.0   0.0  0.0  21.0   5.0
Address 5  0.0  0.0  6.0  0.0  0.0  0.0   7.0  0.0   0.0   0.0
Address 6  4.0  0.0  0.0  3.0  0.0  9.0  10.0  8.0  24.0   3.0
Address 7  0.0  3.0  8.0  0.0  0.0  7.0   0.0  9.0  32.0   2.0


# Finding the missing values

In [118]:
# Estimate the missing entries with the KNNImputer class from scikit-learn.
# The imputer is configured first (neighbour count, weighting, distance metric)
# and then applied to the data.
from numpy import isnan
from sklearn.impute import KNNImputer

# Settings judged best for the assignment data: 5 nearest neighbours,
# neighbours weighted by distance, NaN-aware Euclidean distance as the metric
imputer = KNNImputer(metric='nan_euclidean', weights='distance', n_neighbors=5)

# Fit on the transposed dataset and fill in its missing values in a single step
Xtrans = imputer.fit_transform(df_tranposed)

# Labels for the imputed table: one column per user (A-J), one row per address (1-7)
columns = [f'User {letter}' for letter in 'ABCDEFGHIJ']
rows = [f'Address {n}' for n in range(1, 8)]
Xtrans_df = pd.DataFrame(Xtrans, columns=columns, index=rows)

# The imputed values are real numbers, so round them to whole transaction counts before printing
print(Xtrans_df.round().astype(int))

           User A  User B  User C  User D  User E  User F  User G  User H  User I  User J
Address 1       5       3       0       4       3       0       0       0      24       6
Address 2       3       0       0       3       0       0       0       0      87      16
Address 3       0       3       0       0       4       0       0       0      15       1
Address 4       6       0       0       7       0       0       0       0      21       5
Address 5       0       0       6       0       0       0       7       0       0       0
Address 6       4       0       0       3       0       9      10       8      24       3
Address 7       0       3       8       0       0       7       0       9      32       2
