<div>
    <font face="Times New Roman" style="text-align: center;">
        <h1>Beyond Reviews</h1>
        <h2>Validating online consumer reviews using unsupervised machine learning methods.</h2>
        <p><strong>Date:</strong> June 15, 2024</p>
        <p><strong>Student:</strong> Maurits Christiaan Graaf</p>
        <p><strong>Studentnumber:</strong> 660509</p>
        <p><strong>Supervisor:</strong> Dr. D.J. (David) Kusterer</p>
        <p><strong>Second Reader:</strong> Dr. M. (Maciej) Szymanowski</p>
        <p><strong>Department:</strong> Marketing</p>
        <p><strong>University:</strong> Rotterdam School of Management</p>
    </font>
</div>

In [1]:
##### ------ Importing Packages ------ #####
#General packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score
from scipy.stats import beta
import json
import os


# Dataset

Import the dataset created and prepared in 'Fake reviews notebook ~ data preparation'

In [2]:
# Read from pickle (pkl)
df = pd.read_pickle('concatenated_df.pkl')

#Check whether import functioned correctly
print(df.shape)

print(df.columns)

(149880, 48)
Index(['Review_Rating', 'Review_Title', 'Review_Text', 'Review_Images',
       'Product_ASIN', 'Parent_Product_ASIN', 'User_ID', 'Review_Timestamp',
       'Helpful_Votes', 'Verified_Purchase', 'Main_Category', 'Product_Title',
       'Product_Average_Rating', 'Count_product_ratings', 'Product_Features',
       'Product_Description', 'Product_Price', 'Product_Images',
       'Product_Videos', 'Product_Store', 'Product_Categories',
       'Product_Details', 'Products_Bought_Together', 'Product_Subtitle',
       'Product_Author', 'overall_category', 'Review_Title_Length',
       'Review_Text_Length', 'Rating_Difference', 'Review_Rank',
       'Extreme_Rating', 'Average_Text_Length', 'Text_Length_Difference',
       'Review_Count', 'Singular_Review', 'Review_Words', 'Review_Word_Count',
       'Fully_Capitalized_Words_Count', 'Fully_Capitalized_Words_Proportion',
       'Capital_Letters_Excluding_Start_Count',
       'Capital_Letters_Excluding_Start_Percentage',
       'First

# Fitting the ASM

In [3]:
# Ensure Review_Date is in datetime format
df['Review_Date'] = pd.to_datetime(df['Review_Timestamp'])

# Function to compute the max number of reviews per day
def max_number_of_reviews(df):
    df['Review_Date'] = pd.to_datetime(df['Review_Timestamp'])
    if 'Review_ID' not in df.columns:
        df['Review_ID'] = df.index  # Create a unique identifier for review_id
    df['Reviews_Per_Day'] = df.groupby(['User_ID', df['Review_Date'].dt.date])['Review_ID'].transform('count')
    df['Max_Reviews_Per_Day'] = df.groupby('User_ID')['Reviews_Per_Day'].transform('max')
    df['MNR'] = MinMaxScaler().fit_transform(df[['Max_Reviews_Per_Day']])
    return df

# Function to compute reviewing burstiness
def reviewing_burstiness(df, tau=28):
    df['First_Review_Date'] = df.groupby('User_ID')['Review_Date'].transform('min')
    df['Last_Review_Date'] = df.groupby('User_ID')['Review_Date'].transform('max')
    df['Activity_Span'] = (df['Last_Review_Date'] - df['First_Review_Date']).dt.days
    df['BST'] = 1 - (df['Activity_Span'] / tau).clip(upper=1)
    return df

# Function to compute the ratio of first reviews
def ratio_of_first_reviews(df):
    df['Is_First_Review'] = df.groupby('Product_ASIN')['Review_Timestamp'].rank(method='first') == 1
    df['First_Reviews'] = df.groupby('User_ID')['Is_First_Review'].transform('sum')
    df['Total_Reviews'] = df.groupby('User_ID')['Review_ID'].transform('count')
    df['RFR'] = df['First_Reviews'] / df['Total_Reviews']
    return df

# Add a column for duplicate/near-duplicate based on cosine similarity
df['duplicate'] = (df['Cosine_Similarity'] >= 0.7).astype(int)

# Apply the missing feature computations
df = max_number_of_reviews(df)
df = reviewing_burstiness(df)
df = ratio_of_first_reviews(df)

In [4]:
print(df.columns)

Index(['Review_Rating', 'Review_Title', 'Review_Text', 'Review_Images',
       'Product_ASIN', 'Parent_Product_ASIN', 'User_ID', 'Review_Timestamp',
       'Helpful_Votes', 'Verified_Purchase', 'Main_Category', 'Product_Title',
       'Product_Average_Rating', 'Count_product_ratings', 'Product_Features',
       'Product_Description', 'Product_Price', 'Product_Images',
       'Product_Videos', 'Product_Store', 'Product_Categories',
       'Product_Details', 'Products_Bought_Together', 'Product_Subtitle',
       'Product_Author', 'overall_category', 'Review_Title_Length',
       'Review_Text_Length', 'Rating_Difference', 'Review_Rank',
       'Extreme_Rating', 'Average_Text_Length', 'Text_Length_Difference',
       'Review_Count', 'Singular_Review', 'Review_Words', 'Review_Word_Count',
       'Fully_Capitalized_Words_Count', 'Fully_Capitalized_Words_Proportion',
       'Capital_Letters_Excluding_Start_Count',
       'Capital_Letters_Excluding_Start_Percentage',
       'First_Person_Prono

In [5]:
# Define the Optimized Author Spamicity Model
class OptimizedAuthorSpamicityModel:
    def __init__(self, n_clusters=2):
        self.n_clusters = n_clusters
        self.cluster_model = KMeans(n_clusters=self.n_clusters)
        self.logistic_model = LogisticRegression()
        self.author_spamicity = None
        self.review_spamicity = None

    def fit(self, df):
        # Select relevant features for clustering
        features = df[['MNR', 'BST', 'RFR', 'duplicate', 
                       'Extreme_Rating', 'Rating_Difference', 'Review_Rank', 'Cosine_Similarity']]
        
        # Standardize features
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features)
        
        # Fit clustering model
        self.cluster_model.fit(scaled_features)
        
        # Assign cluster labels to authors
        df['Cluster_Label'] = self.cluster_model.labels_
        
        # Compute spamicity score directly based on cluster characteristics
        cluster_spamicity = df.groupby('Cluster_Label')['Cosine_Similarity'].mean().to_dict()
        df['Author_Spamicity'] = df['Cluster_Label'].map(cluster_spamicity)
        
        # Fit a logistic regression model to predict spamicity
        df['Spam'] = (df['Cosine_Similarity'] >= 0.7).astype(int)
        self.logistic_model.fit(scaled_features, df['Spam'])
        df['Review_Spamicity'] = self.logistic_model.predict_proba(scaled_features)[:, 1]
        
        # Flag reviews with spamicity probability greater than 0.5
        df['spam_flag_05'] = df['Review_Spamicity'] > 0.5
        
        # Store the computed spamicity
        self.author_spamicity = df['Author_Spamicity']
        self.review_spamicity = df['Review_Spamicity']
        
        return df

# Instantiate the OptimizedAuthorSpamicityModel
optimized_asm = OptimizedAuthorSpamicityModel(n_clusters=10)

In [6]:
# Fit the model to the dataset
result_df = optimized_asm.fit(df)


  super()._check_params_vs_input(X, default_n_init=10)


# Write ASM Output To CSV

In [7]:
#Output the CSV
# Ensure the Output_Data directory exists
output_dir = 'Output_Data'
os.makedirs(output_dir, exist_ok=True)

# Define path
output_file_path = os.path.join(output_dir, 'ASM_Output.csv')

result_df.to_csv(output_file_path, index=False)
