In [1]:
# libraries
%matplotlib inline

import pandas as pd
import numpy as np
import json
import copy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from scipy import spatial
import matplotlib.pyplot as plt

## Data Wrangling

### Import Dataset

In [None]:
# pd.read_json cannot cope with a file this large, so the line-delimited JSON
# is parsed one record at a time into a column-oriented dict of lists.
REVIEW_FIELDS = ['business_id', 'date', 'review_id', 'stars', 'text', 'user_id']
review_data = {field: [] for field in REVIEW_FIELDS}

# JSON text is UTF-8; state it explicitly instead of relying on the platform default.
with open('yelp_academic_dataset_review.json', 'r', encoding='utf-8') as infile:
    for line in infile:
        row = json.loads(line)
        # Every field is required: a record lacking one raises KeyError.
        # (The previous `if key != ''` test was always true, so its fallback
        # branch that appended "" could never run.)
        for field in REVIEW_FIELDS:
            review_data[field].append(row[field])

# Show only how many records were parsed; echoing the whole dict floods the output.
len(review_data['review_id'])

In [None]:
# Convert the column-oriented dict of lists into a DataFrame (one row per parsed record)
review_data = pd.DataFrame(data=review_data)

### Modify Dataset

In [None]:
# Add a column with the number of words in each review.
# A "word" is a match of the regex [\w']+ (a run of word characters and
# apostrophes). Series.str.count applies the pattern to every row at once and
# does not depend on the frame having a default 0..n-1 index, unlike the
# previous element-by-element loop over text[i].
import re  # no longer used in this cell; kept because cells outside this view may rely on it

review_data["review_len"] = review_data["text"].str.count(r"[\w']+")

In [None]:
# Add a column holding the review month as an integer of the form YYYYMM.
# Assumes every date string starts with "YYYY-MM" (e.g. "2016-05-28").
# Slicing the first seven characters, rather than trimming a fixed "-DD"
# suffix, gives the same result for "YYYY-MM-DD" and keeps working if the
# string carries a trailing time component.
review_data["date_drop_day"] = (
    review_data["date"]
    .str[:7]
    .str.replace("-", "", regex=False)
    .astype(int)
)

### Create New Dataset with Unique `business_id`
business_id, date_drop_day, stars, text, all reviews, number of reviews

In [None]:
# Distinct business ids found in the review table
unique_bus_id = review_data["business_id"].unique()

In [None]:
# Work on a trimmed copy without the columns the analysis does not need.
# DataFrame.drop returns a new frame, so review_data is left untouched and the
# cell can be re-run safely. The previous version aliased review_data and
# dropped in place, which mutated the source frame and raised KeyError on a
# second run.
new_review_data = review_data.drop(['date', 'review_id', 'user_id'], axis=1)

## Main 
Assumptions of Fake Reviews: 
1.	Since the purpose of fake reviews is to increase both the star rating and the apparent quality of the comments, there will be a significant increase in star rating during the months in which fake reviewers are hired. 
2.	Fake reviewers may write comments based on certain templates, and the same fake reviewer may have written fake reviews for a number of different businesses, so businesses that hired fake reviewers are highly likely to have very similar reviews.
3.	Only a very small portion of businesses hired fake reviewers.


### 1. Filter Possible Fake Reviews Based on Change of Star Ratings and Number of Reviews

In [None]:
# unique_review_count: number of reviews per business id, in a single 'count' column
unique_review_count = (
    new_review_data
    .groupby(["business_id"])
    .size()
    .to_frame(name='count')
)

In [None]:
# Number of reviews per (business id, month); the group keys are turned back
# into ordinary columns next to 'count'
unique_review_count2 = (
    new_review_data
    .groupby(["business_id", "date_drop_day"])
    .size()
    .reset_index(name='count')
)

In [None]:
# Flag businesses whose review volume jumps sharply between consecutive rows
# of unique_review_count2 (one row per business and month that has reviews).
# A business becomes a suspect when one of its rows has more than
# MIN_MONTHLY_REVIEWS reviews and its next row has at least GROWTH_FACTOR
# times as many.
MIN_MONTHLY_REVIEWS = 30
GROWTH_FACTOR = 1.5

# Non-mutating copy: the previous in-place drop on an alias removed
# date_drop_day from unique_review_count2 itself and made the cell fail on re-run.
unique_review_count3 = unique_review_count2.drop('date_drop_day', axis=1)

# Count of the following row of the same business. The last row of each
# business gets NaN, which makes the comparison below evaluate to False there.
next_count = unique_review_count3.groupby('business_id')['count'].shift(-1)
is_spike = (
    (unique_review_count3['count'] > MIN_MONTHLY_REVIEWS)
    & (unique_review_count3['count'] * GROWTH_FACTOR <= next_count)
)

# Each flagged business once, in order of first appearance
suspects = unique_review_count3.loc[is_spike, 'business_id'].drop_duplicates().tolist()

### 2. Find if There Is Very Similar Text Between Different Restaurants

In [None]:
# Keep only the reviews of suspect businesses and renumber the rows from 0
suspect_df = (
    new_review_data[new_review_data["business_id"].isin(suspects)]
    .reset_index(drop=True)
)

In [None]:
# Pull the review texts out of the frame as a plain Python list
text = suspect_df["text"].tolist()
type(text)

In [None]:
# Bag-of-words vectorisation of the suspect reviews
from sklearn.feature_extraction.text import CountVectorizer

# min_df=0.001: ignore terms that occur in fewer than 0.1% of the documents
vectorizer = CountVectorizer(min_df=0.001)
# Learn the vocabulary and build the document-term matrix in one pass, then
# densify it because the following cells index rows of x directly.
x = vectorizer.fit_transform(text).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print("Transformed text vector is \n{}".format(x))
print("")
print("Words for each feature:")
print(feature_names)

In [None]:
# Trial run of the pairwise comparison on the first two suspect businesses
temp_suspects = [suspects[0], suspects[1]]

storage = []
# NOTE(review): this executes the contents of suspects.txt as Python code, and
# nothing in the visible notebook writes that file. Kept as in the original,
# except that the file handle is now closed; consider storing the data as JSON
# and loading it with json.load instead of exec.
with open('suspects.txt') as suspects_file:
    exec(suspects_file.read())

# Pair each id only with the ids that come after it, so every unordered pair of
# businesses is visited exactly once. The previous loop removed items from the
# list it was iterating over, which makes Python skip every other id as soon as
# the list holds more than two entries.
for position, id1 in enumerate(temp_suspects[:-1]):
    later_ids = temp_suspects[position + 1:]
    print(len(later_ids))
    # Row labels of this business's reviews; they are used as row positions in x
    index1 = list(suspect_df.text[suspect_df.business_id == id1].index)
    vector_storage1 = x[index1]

    for id2 in later_ids:
        index2 = list(suspect_df.text[suspect_df.business_id == id2].index)
        vector_storage2 = x[index2]
        # Cosine similarity (1 - cosine distance) of every review of id1
        # against every review of id2
        temp_storage = [
            1 - spatial.distance.cosine(vec1, vec2)
            for vec1 in vector_storage1
            for vec2 in vector_storage2
        ]
        storage.append(temp_storage)
storage

In [None]:
# Empty square table with one row and one column per suspect business
cos_score = pd.DataFrame(columns=suspects, index=suspects)

In [None]:
# Cosine similarity between the reviews of every pair of different suspect businesses
# NOTE(review): this executes the contents of suspects.txt as Python code, and
# nothing in the visible notebook writes that file. Kept as in the original,
# except that the file handle is now closed; consider storing the data as JSON
# and loading it with json.load instead of exec.
with open('suspects.txt') as suspects_file:
    exec(suspects_file.read())

# Pair each id only with the ids that come after it, so every unordered pair of
# businesses is visited exactly once and `suspects` itself is left intact. The
# previous loop removed items from the list it was iterating over, which skips
# every other id and empties out `suspects` as a side effect.
for position, id1 in enumerate(suspects[:-1]):
    later_ids = suspects[position + 1:]
    print(len(later_ids))
    # Row labels of this business's reviews; they are used as row positions in x
    index1 = list(suspect_df.text[suspect_df.business_id == id1].index)
    vector_storage1 = x[index1]

    for id2 in later_ids:
        index2 = list(suspect_df.text[suspect_df.business_id == id2].index)
        vector_storage2 = x[index2]
        # Cosine similarity (1 - cosine distance) of every review of id1
        # against every review of id2
        temp_storage = [
            1 - spatial.distance.cosine(vec1, vec2)
            for vec1 in vector_storage1
            for vec2 in vector_storage2
        ]
        # .at is the scalar accessor, so the whole list is stored as one cell
        # value; .loc may instead try to spread a list value over the selection.
        cos_score.at[id1, id2] = temp_storage

### 3. Cluster Comments 
Computing the cosine similarity directly between every pair of comments is computationally very expensive. If we assume that similar comments will be clustered into the same group, we can reduce the cost by clustering the comments first and then computing and comparing the cosine similarity only between pairs of comments that belong to different `business_id`s within each group. 

In [None]:
# Cluster the review vectors so that later comparisons can be restricted to
# reviews that fall in the same cluster.
# random_state fixes the centroid initialisation, which makes the labels
# reproducible across kernel restarts.
cluster = KMeans(n_clusters=20, random_state=0)
label = cluster.fit_predict(x)

# Independent copy of the suspect reviews with the cluster label attached
df_pivot = suspect_df.copy()
df_pivot['label'] = label

In [None]:
# Within each cluster, compare the reviews of every pair of different
# businesses and record the cosine similarity of each review pair.
# NOTE(review): df_pivot_200 and x_matrix.txt.npy are not created anywhere in
# the visible notebook (the clustering cell builds df_pivot with 20 clusters);
# presumably they come from a separate 200-cluster run — confirm before re-running.
N_GROUPS = 200
STORAGE_COLUMNS = ['business1', 'business2', 'group', 'angle']
x = np.load('x_matrix.txt.npy')

# Rows are gathered in a plain list and converted to a DataFrame in bulk;
# growing a DataFrame one row at a time with .loc is far slower.
pair_rows = []
rows_saved = 0
for i in range(N_GROUPS):
    print('---->', i)
    df_pivot_temp = df_pivot_200[df_pivot_200.label == i]
    temp_suspects = list(df_pivot_temp.business_id.unique())
    print('suspect length', len(temp_suspects))
    print('total length of this group:', len(df_pivot_temp))

    # Pair each id only with the ids after it so that every unordered pair is
    # visited exactly once. The previous loop removed items from the list it
    # was iterating over, which silently skipped every other business.
    for position, id1 in enumerate(temp_suspects[:-1]):
        later_ids = temp_suspects[position + 1:]
        print('suspects in the current group', len(later_ids))
        # Row labels of this business's reviews; they are used as row positions in x
        index1 = list(df_pivot_temp.text[df_pivot_temp.business_id == id1].index)
        vector_storage1 = x[index1]

        for id2 in later_ids:
            index2 = list(df_pivot_temp.text[df_pivot_temp.business_id == id2].index)
            vector_storage2 = x[index2]

            for vec1 in vector_storage1:
                for vec2 in vector_storage2:
                    pair_rows.append(
                        (id1, id2, i, 1 - spatial.distance.cosine(vec1, vec2))
                    )

    # Checkpoint once per cluster when it produced new rows. The previous
    # `k % 2000 == 0` test was only evaluated between businesses, so it almost
    # never fired after k = 0.
    if len(pair_rows) > rows_saved:
        print('Saving k at k =', len(pair_rows))
        pd.DataFrame(pair_rows, columns=STORAGE_COLUMNS).to_csv('df_200_storage.csv')
        rows_saved = len(pair_rows)

# Final result, also written to disk so the saved file is complete
k = len(pair_rows)
df_200_storage = pd.DataFrame(pair_rows, columns=STORAGE_COLUMNS)
df_200_storage.to_csv('df_200_storage.csv')

## Determine Threshold 

In [None]:
# Histogram of the pairwise similarity scores in 10 bins
fig, ax = plt.subplots()
ax.hist(df_200_storage.angle, bins=10)
plt.show()

In [None]:
# Box plot of the pairwise similarity scores
fig, ax = plt.subplots()
ax.boxplot(df_200_storage.angle)
plt.show()

In [None]:
# Summary statistics of the similarity scores; the third quantile (75% row) is 0.744664
df_200_storage["angle"].describe()

In [None]:
# Use the third quartile of the similarity scores as the threshold.
# The cutoff is computed from the data rather than pasted from an earlier
# describe() output (0.744664 in the original run), so it stays correct when
# the upstream results change.
angle_scores = df_200_storage.angle.astype(float)
ANGLE_THRESHOLD = angle_scores.quantile(0.75)
final_suspects_detail = df_200_storage[angle_scores >= ANGLE_THRESHOLD]
final_suspects_detail.tail()

In [None]:
# Write the review pairs at or above the threshold to disk
final_suspects_detail.to_csv(path_or_buf='final_suspects_detail.csv')