In [10]:
import pandas as pd
import numpy as np

In [None]:
# Reload both rating tables with explicit dtypes (ISBNs stay text) and drop
# incomplete rows.
# NOTE(review): the displayed outputs suggest purely numeric ISBNs in the CSVs
# have already lost their leading zeros; dtype=str cannot restore them -
# confirm against the upstream export.
def _load_ratings(csv_path):
    """Read one ratings CSV and return only its complete rows.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file with ``userid``, ``isbn`` and ``rating`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows in which none of the three columns is missing. ``userid`` is a
        plain integer column, ``isbn`` is read as text, ``rating`` as float.
    """
    # ``userid`` is parsed as nullable ``Int64`` first: with a plain ``int``
    # dtype ``read_csv`` raises on a blank user id, so the defensive ``dropna``
    # below would never get the chance to discard that row.
    ratings = pd.read_csv(csv_path, dtype={'userid': 'Int64', 'isbn': str, 'rating': float})
    ratings = ratings.dropna(subset=['userid', 'isbn', 'rating'])
    # No missing ids remain, so restore the plain integer dtype used downstream.
    return ratings.astype({'userid': int})


trainSet = _load_ratings('rating10user91_trainset.csv')
testSet  = _load_ratings('rating10user91_testset.csv')


In [12]:
# Preview the first five training rows
trainSet.head(n=5)

Unnamed: 0,userid,isbn,rating
0,6251,60392452,10.0
1,6251,61009059,7.0
2,6251,140067477,10.0
3,6251,375727345,6.0
4,6251,380789035,7.0


In [13]:
# (number of rows, number of columns) of the training table
trainSet.shape

(1140, 3)

In [14]:
# User-item rating matrix: one row per userid, one column per ISBN.
# Cells without a rating are NaN; pivot_table's default aggregation (mean)
# applies if a (userid, isbn) pair occurs more than once.
user_item = trainSet.pivot_table(values='rating', index='userid', columns='isbn')
user_item

isbn,014028009X,014029628X,034538475X,043935806X,044021145X,044022165X,044023722X,044651652X,059035342X,067976402X,...,671003755,671027360,671041789,679781587,743418174,786868716,804106304,805063897,842329129,971880107
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6251,,,,,,,,,,8.0,...,,,,,,,,,,
6575,,,,,,,,,,,...,,,,,,,,,,
7346,,8.0,,,,,,,,,...,,,,,,7.0,9.0,,,
11676,8.0,7.0,6.0,,1.0,,8.0,8.0,10.0,10.0,...,1.0,,5.0,,8.0,9.0,,,9.0,6.0
13552,,,,,,8.0,,8.0,,,...,,,,,10.0,9.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261829,,,,,,,,,,8.0,...,,9.0,,,,,,,,
265115,7.0,9.0,,,,,,,,,...,,,,,,,,,,
270713,,7.0,,,,,,,,,...,,,,10.0,,10.0,8.0,8.0,,
271448,,,,10.0,,,10.0,,,,...,,,,,,,,,,2.0


In [15]:
# Smallest number of co-rated items for which a PCC is computed. The slides
# don't specify a threshold, but n >= 2 is sensible so that variance can be nontrivial.
min_common = 2

# User ids in the row order of the user-item matrix
users = user_item.index.tolist()
n_users = len(users)

# Result accumulator: one (user_id_1, user_id_2, pcc) tuple per user pair
rows = []

In [16]:
# Pairwise Pearson correlation (PCC) for every unordered user pair (i < j),
# computed only over the ISBNs that both users rated.
#
# Start from an empty list in this cell: re-running it must not append a second
# copy of every pair to a list left over from an earlier execution.
rows = []  # tuples: (user_id_1, user_id_2, pcc)

# Extract the ratings once instead of calling `.loc` inside the O(n^2) loop.
# Row k holds the ratings of users[k]; NaN means "not rated".
ratings = user_item.loc[users].to_numpy(dtype=float)
rated = ~np.isnan(ratings)

for i in range(n_users):
    ui = users[i]
    for j in range(i + 1, n_users):
        uj = users[j]

        # Co-rated mask and count
        common_mask = rated[i] & rated[j]
        n_common = int(common_mask.sum())

        if n_common < min_common:
            pcc = 0.0  # convention for too little overlap; alternatively use np.nan or skip the pair
        else:
            a_vals = ratings[i, common_mask]
            b_vals = ratings[j, common_mask]

            # Center by the mean over common items (matches the slide formula)
            a_center = a_vals - a_vals.mean()
            b_center = b_vals - b_vals.mean()

            denom = np.linalg.norm(a_center) * np.linalg.norm(b_center)
            if denom == 0.0:
                # Undefined: at least one user's co-rated ratings are constant
                # (zero variance on the common set); use the same 0.0 convention.
                pcc = 0.0
            else:
                pcc = float((a_center * b_center).sum() / denom)

        # Round PCC to 4 decimal places before appending
        rows.append((ui, uj, round(pcc, 4)))

# Every unordered pair must appear exactly once.
assert len(rows) == n_users * (n_users - 1) // 2
len(rows)

4095

In [17]:
# Assemble the collected pair tuples into a three-column table
pair_columns = ["user_id_1", "user_id_2", "pcc"]
pcc_df = pd.DataFrame(data=rows, columns=pair_columns)

# Peek at the first ten pairs as a sanity check
pcc_df.head(n=10)

Unnamed: 0,user_id_1,user_id_2,pcc
0,6251,6575,0.0
1,6251,7346,0.0
2,6251,11676,0.0
3,6251,13552,0.0
4,6251,16795,1.0
5,6251,17950,0.0
6,6251,21014,0.0
7,6251,23872,0.0
8,6251,23902,0.0
9,6251,28634,0.0


In [18]:
# Write the (user_id_1, user_id_2, pcc) table to the submission CSV, leaving out the row index
pcc_df.to_csv(path_or_buf="P2Part1_1PCC_Group4.csv", index=False)

In [None]:
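# NOTE: intentionally disabled. `pcc_df` is built with only the columns user_id_1,
# user_id_2 and pcc, so the "with_counts" file below would be identical to the
# submission file unless a co-rated-item count is added to each tuple in `rows` first.
# # save a stripped version for submission, and keep the richer file for yourself
# pcc_df[["user_id_1","user_id_2","pcc"]].to_csv("P2Part1_1PCC_Group4.csv", index=False)
# pcc_df.to_csv("P2Part1_1PCC_Group4_with_counts.csv", index=False)  # for your k-NN work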