<div>
    <font face="Times New Roman" style="text-align: center;">
        <h1>Beyond Reviews</h1>
        <h2>Validating online consumer reviews using unsupervised machine learning methods.</h2>
        <p><strong>Date:</strong> June 15, 2024</p>
        <p><strong>Student:</strong> Maurits Christiaan Graaf</p>
        <p><strong>Student number:</strong> 660509</p>
        <p><strong>Supervisor:</strong> Dr. D.J. (David) Kusterer</p>
        <p><strong>Second Reader:</strong> Dr. M. (Maciej) Szymanowski</p>
        <p><strong>Department:</strong> Marketing</p>
        <p><strong>University:</strong> Rotterdam School of Management</p>
    </font>
</div>

# Importing Packages
This code block imports the necessary packages.

In [13]:
#General packages
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.spatial.distance import cosine
import os

# Dataset

Import the dataset created and prepared in the notebook 'Fake reviews notebook ~ data preparation'.

In [14]:
# Load the prepared review dataset from its pickle (pkl) file.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# were produced by your own data-preparation step.
df = pd.read_pickle('concatenated_df.pkl')

# Sanity check of the import: the shape first, then the column index
print(df.shape, df.columns, sep='\n')

(149880, 48)
Index(['Review_Rating', 'Review_Title', 'Review_Text', 'Review_Images',
       'Product_ASIN', 'Parent_Product_ASIN', 'User_ID', 'Review_Timestamp',
       'Helpful_Votes', 'Verified_Purchase', 'Main_Category', 'Product_Title',
       'Product_Average_Rating', 'Count_product_ratings', 'Product_Features',
       'Product_Description', 'Product_Price', 'Product_Images',
       'Product_Videos', 'Product_Store', 'Product_Categories',
       'Product_Details', 'Products_Bought_Together', 'Product_Subtitle',
       'Product_Author', 'overall_category', 'Review_Title_Length',
       'Review_Text_Length', 'Rating_Difference', 'Review_Rank',
       'Extreme_Rating', 'Average_Text_Length', 'Text_Length_Difference',
       'Review_Count', 'Singular_Review', 'Review_Words', 'Review_Word_Count',
       'Fully_Capitalized_Words_Count', 'Fully_Capitalized_Words_Proportion',
       'Capital_Letters_Excluding_Start_Count',
       'Capital_Letters_Excluding_Start_Percentage',
       'First

# Fitting the SpEagle

In [15]:
##### ------ Feature Extraction ------ #####
# --- MNR: maximum number of reviews a user wrote on one day (min-max scaled) ---
df['Review_Date'] = pd.to_datetime(df['Review_Timestamp'])
if 'Review_ID' not in df.columns:
    # No review identifier column yet: use the DataFrame index as the ID.
    # NOTE(review): this only yields unique IDs if the index is unique — confirm.
    df['Review_ID'] = df.index
df['Reviews_Per_Day'] = (
    df.groupby(['User_ID', df['Review_Date'].dt.date])['Review_ID']
      .transform('count')
)
df['Max_Reviews_Per_Day'] = (
    df.groupby('User_ID')['Reviews_Per_Day'].transform('max')
)
df['MNR'] = MinMaxScaler().fit_transform(df[['Max_Reviews_Per_Day']])

# --- BST: reviewing burstiness ---
# tau is the window length in days: an activity span of tau days or more gives
# BST = 0, a span of zero days gives BST = 1.
tau = 28
df['First_Review_Date'] = df.groupby('User_ID')['Review_Date'].transform('min')
df['Last_Review_Date'] = df.groupby('User_ID')['Review_Date'].transform('max')
df['Activity_Span'] = df['Last_Review_Date'].sub(df['First_Review_Date']).dt.days
df['BST'] = 1 - df['Activity_Span'].div(tau).clip(upper=1)

# --- RFR: share of a user's reviews that were the first review of a product ---
df['Is_First_Review'] = (
    df.groupby('Product_ASIN')['Review_Timestamp'].rank(method='first').eq(1)
)
df['First_Reviews'] = df.groupby('User_ID')['Is_First_Review'].transform('sum')
df['Total_Reviews'] = df.groupby('User_ID')['Review_ID'].transform('count')
df['RFR'] = df['First_Reviews'].div(df['Total_Reviews'])

# --- duplicate: 1 when the cosine similarity is at least 0.9, otherwise 0 ---
df['duplicate'] = df['Cosine_Similarity'].ge(0.9).astype(int)

In [16]:
##### ------ Graph Construction ------ #####
# Build the undirected user-review-product graph: every review row links its
# author to the review ('writes') and the review to the product ('belongs_to').
# NOTE(review): all three node kinds share one key space, so an identifier that
# occurs as two different kinds would collapse into a single node — confirm
# that user IDs, review IDs and ASINs never coincide.
G = nx.Graph()

# Walk only the three identifier columns. DataFrame.iterrows() would build a
# full Series for every row, which is needlessly slow on a large frame; the
# node and edge insertion order below is the same as before.
for user_id, review_id, product_asin in zip(df['User_ID'], df['Review_ID'], df['Product_ASIN']):
    G.add_node(user_id, type='user')
    G.add_node(review_id, type='review')
    G.add_node(product_asin, type='product')
    G.add_edge(user_id, review_id, relation='writes')
    G.add_edge(review_id, product_asin, relation='belongs_to')

In [17]:
##### ------ Prior Estimation ------ #####
# Function to estimate priors using all relevant features
def estimate_priors(df):
    """Estimate a prior score for every user and every review in ``df``.

    For each row, a user score is computed as the arithmetic mean of the
    user-level feature columns and a review score as the arithmetic mean of
    the review-level feature columns. Row scores are then averaged per
    ``User_ID`` and per ``Review_ID``, so each identifier is paired with the
    scores of its own rows. (Indexing the feature matrix by the position of
    an ID in the list of unique IDs, as was done previously, pairs a user
    with an unrelated row as soon as any user has more than one review.)

    NOTE(review): no scaling is applied here, so the priors only lie in
    [0, 1] if every input column already does — confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``User_ID``, ``Review_ID`` and the feature columns
        listed in ``user_cols`` and ``review_cols`` below.

    Returns
    -------
    tuple of (dict, dict)
        ``(user_priors, review_priors)``: identifier -> prior score, in order
        of first appearance in ``df``.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    user_cols = ['MNR', 'BST', 'RFR', 'Review_Count', 'Average_Text_Length',
                 'First_Person_Pronouns_Ratio', 'Exclamation_Sentence_Ratio']
    review_cols = ['duplicate', 'Cosine_Similarity', 'Extreme_Rating',
                   'Rating_Difference', 'Sentiment_Scores']

    def _mean_score_by(key_col, feature_cols):
        # One score per row: the mean over that row's feature columns
        row_scores = pd.Series(df[feature_cols].to_numpy(dtype=float).mean(axis=1))
        # Group positionally (plain array key) so a non-unique DataFrame index
        # cannot cause misalignment; sort=False keeps first-appearance order.
        keys = df[key_col].to_numpy()
        return row_scores.groupby(keys, sort=False).mean().to_dict()

    user_priors = _mean_score_by('User_ID', user_cols)
    review_priors = _mean_score_by('Review_ID', review_cols)
    return user_priors, review_priors

# Compute both prior dictionaries once; later cells look nodes up in them
user_priors, review_priors = estimate_priors(df)

In [18]:
##### ------ Loopy Belief Propagation ------ #####
def loopy_belief_propagation(G, user_priors, review_priors, max_iter=100, epsilon=1e-3):
    """Compute one message per edge of the user-review-product graph.

    Every edge starts with a message of 1.0. In each sweep:

    * a user-review edge gets ``user_priors[user] * review_priors[review]``;
    * a review-product edge gets ``review_priors[review]``;
    * any other edge keeps its current message.

    Endpoints are identified by their ``'type'`` node attribute rather than
    by their position in the edge tuple. An undirected graph may report an
    edge as ``(product, review)`` or ``(review, user)``, and such edges must
    receive a message too instead of silently staying at 1.0.

    Note that each message is computed from the priors only, not from
    neighbouring messages, so a second sweep reproduces the first and the
    loop then stops for any positive ``epsilon``.

    Parameters
    ----------
    G : graph-like
        Must expose ``G.edges`` (iterable of 2-tuples) and ``G.nodes[n]``
        returning a mapping with a ``'type'`` entry.
    user_priors, review_priors : mapping
        Prior score per user node / review node.
    max_iter : int
        Maximum number of sweeps over all edges.
    epsilon : float
        Stop when the summed absolute message change falls below this value.

    Returns
    -------
    dict
        Edge tuple (as yielded by ``G.edges``) -> message value.

    Raises
    ------
    KeyError
        If a user or review node has no entry in the matching prior mapping
        (for plain ``dict`` priors), or a node lacks the ``'type'`` attribute.
    """
    # Initialize messages
    messages = {edge: 1.0 for edge in G.edges}
    prev_messages = messages.copy()

    for _ in range(max_iter):
        for edge in G.edges:
            # Map node type -> node so the edge orientation does not matter
            endpoints = {G.nodes[node]['type']: node for node in edge}
            if 'review' not in endpoints:
                continue
            review = endpoints['review']
            if 'user' in endpoints:
                messages[edge] = user_priors[endpoints['user']] * review_priors[review]
            elif 'product' in endpoints:
                messages[edge] = review_priors[review]

        # Check for convergence
        delta = sum(abs(messages[edge] - prev_messages[edge]) for edge in G.edges)
        if delta < epsilon:
            break
        prev_messages = messages.copy()

    return messages

# Run LBP
# NOTE(review): in the cells visible here `messages` is not read again; the
# final spam probabilities are taken straight from the priors. Confirm whether
# the messages were meant to feed into them.
messages = loopy_belief_propagation(G, user_priors, review_priors)

In [19]:
##### ------ Final Class Probabilities ------ #####
# Attach a 'spam_prob' attribute to every user and review node, taken directly
# from the matching prior dictionary. Nodes of any other type (e.g. products)
# are left without the attribute.
for node, attrs in G.nodes(data=True):
    node_type = attrs['type']
    if node_type == 'review':
        attrs['spam_prob'] = review_priors[node]
    elif node_type == 'user':
        attrs['spam_prob'] = user_priors[node]

In [20]:
# Collect the per-node spam probabilities, split by node type
user_spam_prob, review_spam_prob = {}, {}
for node, attrs in G.nodes(data=True):
    if attrs['type'] == 'user':
        user_spam_prob[node] = attrs['spam_prob']
    elif attrs['type'] == 'review':
        review_spam_prob[node] = attrs['spam_prob']

# Turn each mapping into a one-column DataFrame indexed by node identifier
user_spam_df = pd.DataFrame.from_dict(
    user_spam_prob, orient='index', columns=['User_Spam_Probability']
)
review_spam_df = pd.DataFrame.from_dict(
    review_spam_prob, orient='index', columns=['Review_Spam_Probability']
)

In [21]:
##### ------ Merge and Save Combined DataFrame ------ #####
# Left-join the user-level and then the review-level spam probabilities onto
# the review rows, matching each key column against the index of its frame.
for key_column, prob_frame in (('User_ID', user_spam_df), ('Review_ID', review_spam_df)):
    df = df.merge(prob_frame, left_on=key_column, right_index=True, how='left')

In [22]:
# Move the identifier and spam-probability columns to the front; all other
# columns follow in their existing order
leading_columns = ['Review_ID', 'User_ID', 'User_Spam_Probability', 'Review_Spam_Probability']
remaining_columns = [col for col in df.columns if col not in leading_columns]
df = df[leading_columns + remaining_columns]

# Write SpEagle Output to CSV

In [23]:
# Destination folder and file for the SpEagle results
output_dir = 'Output_Data'
output_file_path = os.path.join(output_dir, 'SpEagle_Output.csv')

# Create the folder if it is missing (no error when it already exists)
os.makedirs(output_dir, exist_ok=True)

# Write the complete DataFrame as CSV, leaving out the index column
df.to_csv(output_file_path, index=False)