In [22]:
import pandas as pd
import numpy as np

reviews_data = pd.read_csv("../../data/raw/Books_rating.csv")

In [23]:
reviews_data.sample(5)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
567160,956291965X,The Picture of Dorian Gray,,A3BM8D5WBUT5ZM,marisa haspel,0/0,5.0,1357689600,excellent,"Free of charge and very fast downloading, the ..."
1742952,0849324793,"CRC Standard Mathematical Tables and Formulae,...",,A1ZTEY1Z1SMKPP,"""jawaljoe""",9/10,4.0,998870400,Good Reference for the abstract based mathmatican,The reference covers many topics covered in hi...
2493699,B000K6K0O0,Thicker Than Water,,AQEIISWW9F967,G-Spot,2/2,5.0,1136764800,Three-In-One Delight,My book club members are big fans of Kendra No...
2695527,B00005J6VH,Bridget Jones's Diary,,A2268885WASAML,Io,1/2,2.0,1091491200,Mindless drivel,"I purchased this book a while back, but can't ..."
262304,B000JXV7IM,SCHOOL OF NATURAL HEALING,,A2U70CT2WDSAXS,Terri Emery,12/13,5.0,1315267200,Natural Healing,"The book is very informative, very interesting..."


In [24]:
useful_columns = [
    "Id",
    "Title",
    "User_id",
    "review/score",
    "review/time",
]
reviews_data = reviews_data[useful_columns]

In [25]:
reviews_data = reviews_data[reviews_data.User_id.notna()]

In [26]:
M = reviews_data.Id.unique().shape[0]
print(f"The number of items M = {M}")

The number of items M = 216023


In [27]:
N = reviews_data.User_id.unique().shape[0]
print(f"The number of users N = {N}")

The number of users N = 1008972


In [28]:
print(f"The size of the sparse weight matrix is MxN, which contains {M * N} cells")

The size of the sparse weight matrix is MxN, which contains 217961158356 cells


In [29]:
def calculate_array_size(data_type, num_items):
    bytes_per_item = np.dtype(data_type).itemsize
    total_bytes = bytes_per_item * num_items
    total_gb = total_bytes / (1024**3)
    return total_gb

# Usage example
data_type = np.int8
num_items = M*N
array_size_in_gb = calculate_array_size(data_type, num_items)

print(f"The array will be approximately {array_size_in_gb:.2f} GB")

The array will be approximately 202.99 GB


We have a lot of items and users. We will need an efficient way of performing the calculations

Possible Filters for a pair of i and i' users:
- Threshold of common ratings: If two users have less than a certain amount of common reviewed items, they won't be considered.
- Threshold of weights: If the absolute value of the weight for two users is less than a threshold, it won't be considered.
- Theshold of neighbors: Only the top absolute value weights for a user will be considered.

We still don't know the distributions of many of these variables, but we can get an upper bound using the threshold of neighbors

In [30]:
pd.DataFrame(
    [(n,calculate_array_size(data_type, N * n)) for n in range(1,520,20)],
    columns=["neighbors", "size (GB)"]
)

Unnamed: 0,neighbors,size (GB)
0,1,0.00094
1,21,0.019733
2,41,0.038527
3,61,0.05732
4,81,0.076114
5,101,0.094908
6,121,0.113701
7,141,0.132495
8,161,0.151288
9,181,0.170082


We can see that the use of this threshold already gives us a size that we can work on

## Eliminating multiple reviews

In [31]:
reviews_data.groupby(["User_id", "Id"]).size().max()

28

We can see that a user can have many reviews per book. We only consider the last review.

In [32]:
reviews_data = reviews_data.sort_values('review/time', ascending=False)
reviews_data = reviews_data.drop_duplicates(subset=['Id', 'User_id'], keep='first')

In [33]:
reviews_data.groupby(["User_id", "Id"]).size().max()

1

## Removing users with repeating scores

In [34]:
reviews_data

Unnamed: 0,Id,Title,User_id,review/score,review/time
960831,1850891648,Frankenstein (Isis Large Print Fiction),A3SFQL1UXFOQY,5.0,1362355200
101821,0833512420,Red Storm Rising (Turtleback School & Library ...,AKY9VFVYY5YTT,5.0,1362355200
960830,1850891648,Frankenstein (Isis Large Print Fiction),A2AIRA5U5QG3EB,2.0,1362355200
2572566,0140860428,Jane Eyre (Penguin Classics),AI437UN6Z43GK,5.0,1362355200
2232589,0786114347,Twenty Thousand Leagues Under the Sea (Library...,A6L9KOTK81C0X,5.0,1362355200
...,...,...,...,...,...
2971452,B000G167FA,Silver Pennies,A186DM3LK926XA,5.0,-1
75750,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",AE3SEXFJCQLJQ,1.0,-1
75749,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",A1DFXP0TM9UKGG,1.0,-1
2152018,B000KPX7RI,"McKeachie's Teaching Tips, Strategies Research...",A1AY4QM3FDINBQ,5.0,-1


In [35]:
distinct_score_per_user = reviews_data.groupby("User_id")["review/score"].nunique()
distinct_score_per_user

User_id
A00109803PZJ91RLT7DPN    1
A00117421L76WVWG4UX95    1
A0015610VMNR0JC9XVL1     2
A002258237PFYJV336T05    1
A00264602WCXBHHFPLTQ4    1
                        ..
AZZZELE3I0CKD            1
AZZZJY3RMN57G            1
AZZZT14MS21I6            1
AZZZYCR4NZADZ            1
AZZZZW74AAX75            1
Name: review/score, Length: 1008972, dtype: int64

In [36]:
reviews_data = reviews_data[reviews_data.User_id.isin(distinct_score_per_user[distinct_score_per_user>1].index)]

In [37]:
reviews_data

Unnamed: 0,Id,Title,User_id,review/score,review/time
2572566,0140860428,Jane Eyre (Penguin Classics),AI437UN6Z43GK,5.0,1362355200
717118,0786197196,"Bloody Ground (The Starbuck Chronicles, Volume...",AI437UN6Z43GK,4.0,1362355200
528155,089693358X,Be Dynamic (Acts 1-12): Experience the Power o...,A221TMACCF8K6S,2.0,1362355200
2130022,0896211460,Rage,AKZMQ0OB9DDWC,4.0,1362355200
2806366,1840323027,Morbid Taste for Bones,A35O6DX8L99DW0,5.0,1362355200
...,...,...,...,...,...
2971448,B000G167FA,Silver Pennies,AAFZZHA2I598B,5.0,-1
2971446,B000G167FA,Silver Pennies,AZUNT3QP2CWTL,5.0,-1
75748,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",AMB2Z4FMDBWQY,2.0,-1
75745,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",A3LL5TMGX00LA1,2.0,-1


In [38]:
reviews_data = reviews_data[reviews_data.Title.notna()]

In [39]:
reviews_data.to_csv("../../data/cleaned_data/cleaned_reviews.csv", index=False)

## Standardizing reviews by user

In [16]:
mean_score = reviews_data.groupby("User_id")["review/score"].mean()

In [17]:
reviews_data = pd.merge(
    reviews_data,
    mean_score.reset_index(name="user_average"),
    how="inner"
)

In [18]:
reviews_data["dev"] = reviews_data["review/score"] - reviews_data.user_average

## Working with a sample

In this section we will work with a sample of users with the highest count of reviews, to minimize the sparcity of the weight matrix.

In [19]:
top_active_users = reviews_data.User_id.value_counts().head(int(N/100)).index

In [20]:
top_active_users

Index(['A14OJS0VWMOSWO', 'AFVQZQ8PW0L', 'AHD101501WCN1', 'A1X8VZWTOG8IS6',
       'A1K1JW1C5CUSUZ', 'A1S3C5OFU508P3', 'A1N1YEMTI9DJ86', 'A2OJW07GQRNJUT',
       'A1D2C0WDCSHUWZ', 'A1G37DFO8MQW0M',
       ...
       'A20OMCGB73X5LV', 'A18MBO1U4DPY20', 'A3A48XEYWLWH7T', 'AXW5RUGDQVZ6Q',
       'A11I1I9QLMAM1A', 'AP0HLE8WIXI3V', 'A1D2ZN57YG2NMH', 'A13YXU5QJQWZ4C',
       'A2DGHN8YXJTPD', 'ARU5FJM96YGGA'],
      dtype='object', name='User_id', length=10089)

In [21]:
sample_user = "AHD101501WCN1"

This helps us to find which users reviewed a book

In [22]:
reviewed_books = reviews_data[reviews_data.User_id == sample_user].Id

In [23]:
N_INTERSECTION_CANDIDATES = 1000

In [24]:
highest_intersection_users = (
    reviews_data[reviews_data.Id.isin(reviewed_books) & ~reviews_data.User_id.str.fullmatch(sample_user)]
    .User_id
    .value_counts()
    .nlargest(N_INTERSECTION_CANDIDATES)
)
highest_intersection_users.shape

(1000,)

In [25]:
relevant_dev_matrix = reviews_data[
    reviews_data.User_id.isin(highest_intersection_users.index.tolist() + [sample_user]) &
    reviews_data.Id.isin(reviewed_books)
][
    ["User_id", "Id", "dev"]
].pivot(index="Id",columns="User_id", values="dev")

In [26]:
relevant_dev_matrix

User_id,A10A1S5NAQBT21,A10R1WAP6HFFEP,A10T0OW97SFBB,A113KA21MQG9W4,A114YQ7ZT9Y1W5,A115MZZUS4VVM5,A11DCTGTPS7M0C,A11NMT4P8EPWKW,A11RU07SQDXVLP,A11YOT86X3M4GU,...,AYYAIPS6M3HB8,AYZMJYCRIZWIO,AYZN94ZQWIA4F,AZ216MK9KO1S0,AZ2YK1S0QSY60,AZ4UA2B3Q0WNP,AZ85B5Q1UEH5U,AZCR709KP5K34,AZI0O32W4ZYGH,AZJ1N5LS6Q0FD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0002558718,,,,,,,,,,,...,,,,,,,,,,
0003252507,,,,,,,,,,,...,,,,,,,,,,
0006510345,,,,,,,,,,,...,,,,,,,,,,
0020715609,,,,,,,,,,,...,,,,,,,,,,
0028631919,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B000TNELP8,,,,,,,,,,,...,,,,,,,,,,
B000U2CL5A,,,,,,,,,,,...,,,,,,,,,,
B000U2EW84,,,,,0.747934,,,,,,...,,,,,,,,,,
B000UDFI6I,,,,,,,,,,,...,,,,,,,,,,


In [27]:
relevant_dev_matrix.dropna(subset=[sample_user]).corrwith(relevant_dev_matrix[sample_user], method='pearson')

User_id
A10A1S5NAQBT21         NaN
A10R1WAP6HFFEP         NaN
A10T0OW97SFBB    -0.207184
A113KA21MQG9W4   -0.784863
A114YQ7ZT9Y1W5    0.396050
                    ...   
AZ4UA2B3Q0WNP          NaN
AZ85B5Q1UEH5U     0.634155
AZCR709KP5K34     0.831186
AZI0O32W4ZYGH     1.000000
AZJ1N5LS6Q0FD          NaN
Length: 1001, dtype: float64

In [28]:
weights = relevant_dev_matrix.corrwith(relevant_dev_matrix[sample_user])

In [29]:
relevant_dev_matrix[["ATIKC4HAP3XWV",sample_user]].dropna()

User_id,ATIKC4HAP3XWV,AHD101501WCN1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0791051323,0.96,-1.697328
1561034371,-0.04,0.302672
1582343632,-0.04,0.302672
1596009772,-0.04,0.302672
1596009780,-0.04,0.302672
1596009799,-0.04,0.302672
5556068054,-0.04,0.302672
9562910458,-0.04,0.302672
B0000CMSVK,-0.04,0.302672
B0006QFTGS,-0.04,0.302672


In [30]:
weights[weights.abs() > 0.8]

User_id
A11RU07SQDXVLP    0.966458
A12AG1OXXG1ZNO    1.000000
A13G1TKIKHGV3F    1.000000
A14CM5XTMI0UXE    1.000000
A15YXYYH2D1CEM   -0.954130
                    ...   
ARYY84WIKKLXD     0.814316
ATIKC4HAP3XWV    -1.000000
AVV7VCHCFY3BI     0.861640
AZCR709KP5K34     0.831186
AZI0O32W4ZYGH     1.000000
Length: 76, dtype: float64

Problem with just using the correlation coefficients is that some correlations have greater weight than others. We will proceed to use the confidence intervals, specifically, the lower bound.

In [31]:
from scipy import stats

In [32]:
correlations = {}
confidence_intervals = {}

for col in relevant_dev_matrix.columns:
    if col != sample_user:
        non_nulls = relevant_dev_matrix[[col, sample_user]].dropna()
        
        constant_col = non_nulls[col].unique().shape[0] == 1
        constant_target = non_nulls[sample_user].unique().shape[0] == 1
        if len(non_nulls) > 2 and not constant_col and not constant_target:
            corr, p_value = stats.pearsonr(non_nulls[col], non_nulls[sample_user])
            
            abs_corr = abs(corr)

            z = 1.96  # Z-score for 95% confidence interval
            n = len(non_nulls)
            ci = z * ((abs_corr * (1 - abs_corr)) / (n - 1))**0.5  # Formula for confidence interval
            
            if abs_corr - ci > 0.1:
                correlations[col] = abs_corr
                confidence_intervals[col] = (abs_corr - ci, abs_corr + ci)

In [33]:
weight_df = pd.DataFrame(
    [(k,v[0]) for k,v in confidence_intervals.items()],
    columns=["User_id", "weight"]
)
weight_df.sample(20)

Unnamed: 0,User_id,weight
22,A1ARA52HB238HT,0.467355
80,A1VPXCFO261VWH,0.458732
37,A1DW700HEBQC4P,0.479963
283,AGHIV0V0ON7MO,0.334918
32,A1DGSUJ89KN91K,1.0
262,A57JIBFWM3L0H,0.269374
279,AELXFPTLB6P2L,0.223429
9,A13E0ARAXI6KJW,0.316787
265,A5ZVJ9A6H1YL1,0.201899
158,A2TKY19LMCCPLG,0.662877


In [39]:
relevant_reviews = reviews_data[reviews_data.User_id.isin(weight_df.User_id)][["Id","User_id","dev"]].copy()
relevant_reviews.sample(5)

Unnamed: 0,Id,User_id,dev
116104,B000FAL22U,A2WB4OWBUH2VQX,-0.14094
18970,B0007F2EN4,A2AYSFGUP5VTY3,0.912698
51384,0394299175,A1RAUVCWYHTQI4,-0.305164
460472,0679725393,A1T17LMQABMBN5,0.626888
500467,B0002X7W3I,A1X8VZWTOG8IS6,-0.604096


In [46]:
a = pd.merge(
    relevant_reviews,
    weight_df,
    how="inner",
)
a

Unnamed: 0,Id,User_id,dev,weight
0,0806127406,A3RFXU3P0XKKF4,0.593939,0.262942
1,0631183140,A3RFXU3P0XKKF4,0.593939,0.262942
2,B0007G5NJU,A3RFXU3P0XKKF4,-0.406061,0.262942
3,0394436237,A3RFXU3P0XKKF4,-0.406061,0.262942
4,B0007FLPAC,A3RFXU3P0XKKF4,-0.406061,0.262942
...,...,...,...,...
57258,B000H0BNU4,A32ZLHLBW99OQY,-0.583333,0.422927
57259,0812966937,A32ZLHLBW99OQY,-1.583333,0.422927
57260,B000GNVAW8,A32ZLHLBW99OQY,-0.583333,0.422927
57261,B0006E082U,A32ZLHLBW99OQY,0.416667,0.422927


In [47]:
a["exp_val_lb"] = a.dev * a.weight
a

Unnamed: 0,Id,User_id,dev,weight,exp_val_lb
0,0806127406,A3RFXU3P0XKKF4,0.593939,0.262942,0.156172
1,0631183140,A3RFXU3P0XKKF4,0.593939,0.262942,0.156172
2,B0007G5NJU,A3RFXU3P0XKKF4,-0.406061,0.262942,-0.106770
3,0394436237,A3RFXU3P0XKKF4,-0.406061,0.262942,-0.106770
4,B0007FLPAC,A3RFXU3P0XKKF4,-0.406061,0.262942,-0.106770
...,...,...,...,...,...
57258,B000H0BNU4,A32ZLHLBW99OQY,-0.583333,0.422927,-0.246707
57259,0812966937,A32ZLHLBW99OQY,-1.583333,0.422927,-0.669634
57260,B000GNVAW8,A32ZLHLBW99OQY,-0.583333,0.422927,-0.246707
57261,B0006E082U,A32ZLHLBW99OQY,0.416667,0.422927,0.176220


In [50]:
a = a.groupby("Id")[["weight","exp_val_lb"]].sum()

In [52]:
a["score"] = a.exp_val_lb / a.weight
a

Unnamed: 0_level_0,weight,exp_val_lb,score
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0001047604,0.735053,0.353391,0.480769
0001047825,0.267778,-0.023377,-0.087302
0001050079,0.768605,0.324824,0.422616
0001050087,0.768605,0.324824,0.422616
0001052950,1.575542,0.591715,0.375563
...,...,...,...
B000UVGLZM,0.191014,0.151794,0.794677
B000UYAY0C,0.135905,-0.147044,-1.081967
B000UYFER0,0.195990,0.087316,0.445513
B000UYIM2E,0.179968,-0.252369,-1.402299


In [59]:
titles = reviews_data[["Id", "Title"]].drop_duplicates()
titles

Unnamed: 0,Id,Title
0,0140860428,Jane Eyre (Penguin Classics)
1,0786197196,"Bloody Ground (The Starbuck Chronicles, Volume..."
2,089693358X,Be Dynamic (Acts 1-12): Experience the Power o...
3,B0006AHXZ4,"Gawayne and the Green knight;: A fairy tale,"
4,0896211460,Rage
...,...,...
1003039,B000MXH3LY,Year of the Wild Boar: An American Woman in Ja...
1003059,B000MU91MQ,Julia Kristeva : Readings of Exile and Estrang...
1003131,0026215357,The Math Kit: A Three-Dimensional Tour Through...
1003132,0256125392,Marketing: Creating Value for Customers


In [61]:
top = a.sort_values("score", ascending=False).head(15)
pd.merge(
    titles,
    top,
    left_on="Id",
    right_index=True,
    how="inner",
)

Unnamed: 0,Id,Title,weight,exp_val_lb,score
5012,1932100385,The China Study: The Most Comprehensive Study ...,0.16485,0.415871,2.522727
26971,0899421423,Saint Joseph Daily Prayerbook (St. Joseph),0.440409,0.917518,2.083333
38262,0806111917,Techniques of the Selling Writer,0.330434,0.694543,2.101911
40772,0891414843,Moltke on the Art of War: Selected Writings,0.138152,0.291415,2.109375
45616,0743471652,Tinker,0.330434,0.694543,2.101911
61170,1590302907,Bushido: The Spirit of the Samurai (Shambhala ...,0.301389,0.614369,2.038462
87312,0345401026,Ancient Inventions,0.301389,0.614369,2.038462
132920,B0007DZMEY,Five women,0.138152,0.291415,2.109375
155125,B0006Y5XRK,The Four Lords of the Diamond: Lilith: A Snake...,0.330434,0.694543,2.101911
159091,1580543707,"Ephedra Fact and Fiction: How Politics, the Pr...",0.330434,0.694543,2.101911
