In [None]:
import csv
import os
import re
import time

import joblib
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from googletrans import Translator
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from model_dependency_script import text_preprocessing, sparse_to_dense

Ensure that **model_dependency_script** and the **joblib** model file are located in the same folder as this notebook so that they load properly.

### Function to Retrieve User Review Page Links
Input : Book Reviews dataframe
- Extracts user_id and display_name
- Then concatenates them into the string format for goodreads user review pages

Output : list, containing user review page links

In [20]:
def get_user_revpage_href(br):
    """Build the Goodreads review-page URL for every distinct user in ``br``.

    Parameters
    ----------
    br : pandas.DataFrame
        Must contain the columns ``user_id`` and ``display_name``.

    Returns
    -------
    list of str
        One URL per distinct ``user_id``, in order of first appearance, of the
        form ``https://www.goodreads.com/review/list/<user_id>-<display_name>``
        followed by the query string that selects the review view.
    """
    # Keep the first row of each user_id. Masking with
    # duplicated(keep=False) would instead discard *every* row of a user that
    # appears more than once, so those users would never be scraped.
    unique_users = br[['user_id', 'display_name']].drop_duplicates(subset='user_id')

    url_template = 'https://www.goodreads.com/review/list/{}-{}?order=d&sort=review&view=reviews'
    return [
        url_template.format(user_id, display_name)
        for user_id, display_name in zip(unique_users['user_id'], unique_users['display_name'])
    ]

### Function to Scroll through User Review 
Input: user review page link
- Firstly, it ensures the page is in fact a review page by checking the arguments in the url after loading it via the headless browser
- Secondly, if the page is valid, the total number of reviews is determined to see whether the amount of information to scroll through will surpass what is scrapable
- Lastly, the page is scrolled through until all reviews or maximum reviews possible are loaded and the html for the final result of the page is returned

Output: loaded html containing all (or maximum potential to be scraped) reviews

In [21]:
def Scroll_for_Reviews_getHTML(link, chromedriver_path=None):
    """Load a user's review page in headless Chrome, scroll it, and return the HTML.

    Parameters
    ----------
    link : str
        URL of the Goodreads user review page.
    chromedriver_path : str, optional
        Path to the chromedriver executable. When omitted, the original
        hard-coded local path is used.

    Returns
    -------
    str or int
        The page source after scrolling, or ``0`` when the URL the browser
        ended up on does not contain the review-page query string (the caller
        treats ``0`` as "nothing to scrape").

    Raises
    ------
    Whatever selenium raises if the review counter element is missing, and
    ``IndexError`` if its text holds no space-delimited number. The browser is
    shut down in every case.
    """
    if chromedriver_path is None:
        chromedriver_path = r"C:\Users\marty\OneDrive - The George Washington University\Documents\CSCI 4443\Project\chromedriver_win32\chromedriver.exe"

    # Above this many reviews the page is scrolled a fixed number of times
    # instead of until the bottom is reached.
    max_reviews = 1200
    # NOTE(review): 38 scrolls is presumably what it takes to load ~1200
    # reviews -- confirm against the page's batch size.
    capped_scrolls = 38
    scroll_pause = 1.5
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"

    chrome_options = Options()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--ignore-ssl-errors')
    chrome_options.add_argument('--headless')
    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # try/finally guarantees the browser process is closed even when an
    # element lookup or the parsing below raises.
    try:
        # Load goodreads page
        driver.get(link)
        time.sleep(3)

        # The query string is checked on the URL the browser ended up on; if it
        # is absent the page is not an accessible review page.
        if 'order=d&sort=review&view=reviews' not in driver.current_url:
            return 0

        # Text of the status element that reports the number of reviews
        totalReviews = driver.find_element(By.XPATH, "//div[@class = 'buttons clearFloats uitext']/div[@id = 'infiniteStatus']")
        text = totalReviews.text

        # First run of digits that has whitespace on both sides, as an int
        totrevs = int(re.findall(r'(?<=\s)\d+(?=\s)', text)[0])

        if totrevs > max_reviews:
            # Too many reviews to load completely: scroll a fixed number of times
            for _ in range(capped_scrolls):
                driver.execute_script(scroll_to_bottom)
                time.sleep(scroll_pause)
        else:
            # Scroll until the page height stops growing
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script(scroll_to_bottom)
                time.sleep(scroll_pause)

                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    # Same height after a scroll: no more content was loaded
                    break
                last_height = new_height

        # HTML of the page in its fully loaded state
        return driver.page_source
    finally:
        driver.quit()

### Function Retrieves User Reviews for the loaded html retrieved from Scroll_for_Reviews_getHTML()
Input: Loaded HTML
- uses BeautifulSoup to create bs4.elements from loaded html and searches for the relevant user information

Output: pandas dataframe, containing user reviews (and relevant information related)

In [22]:
def retrieve_info_fromHTML(html, link):
    """Parse a loaded review page into a DataFrame with one row per review.

    Parameters
    ----------
    html : str
        Page source returned by ``Scroll_for_Reviews_getHTML``.
    link : str
        URL the page was loaded from; the first run of digits in it is used
        as the user ID for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``User ID``, ``Review ID``, ``Title``, ``Author``, ``Rating``,
        ``Review`` and ``Date Added``. The frame is empty (columns only) when
        the page has no ``<tbody id="booksBody">`` or that body has no rows.
    """
    columns = ['User ID', 'Review ID', 'Title', 'Author', 'Rating', 'Review', 'Date Added']

    soup = BeautifulSoup(html, "lxml")

    # every <tr> under <tbody id="booksBody"> is treated as one review
    books_body = soup.find('tbody', attrs={'id': 'booksBody'})
    if books_body is None:
        return pd.DataFrame(columns=columns)

    review_rows = books_body.find_all('tr')
    if not review_rows:
        return pd.DataFrame(columns=columns)

    # The user ID is the same for every row, so it is extracted once.
    user_id = re.findall(r'\d+', link)[0]

    records = []
    for review_row in review_rows:
        cells = review_row('td')
        records.append((
            user_id,
            # row id with its first 7 characters removed
            # (presumably a "review_" prefix -- TODO confirm)
            review_row['id'][7:],
            cells[3]('a')[0]['title'],           # book title
            cells[4]('a')[0].text,               # author name
            cells[9]('div')[0].text.strip(),     # rating
            cells[15]('div')[0].text,            # review text
            cells[22]('div')[0].text.strip(),    # date added
        ))

    # Building the frame from Python tuples avoids a fixed-width numpy string
    # array in which every cell is padded to the length of the longest review.
    return pd.DataFrame(records, columns=columns)

### Function Combines Scroll_for_Reviews_getHTML() and retrieve_info_fromHTML() Into One 
- This streamlines the process of running in the notebook 
- If Scroll_for_Reviews_getHTML() finds a private user (or a similar condition), it returns zero instead of the html and nothing is processed, as no reviews are scrapable

In [23]:
def getUserReviews(link):
    """Scrape one user's review page and return its reviews.

    Returns the DataFrame produced by ``retrieve_info_fromHTML``, or ``None``
    when ``Scroll_for_Reviews_getHTML`` returned ``0`` for ``link``.
    """
    page_html = Scroll_for_Reviews_getHTML(link)

    # 0 is the "not a scrapable review page" value of the scrolling step
    return None if page_html == 0 else retrieve_info_fromHTML(page_html, link)

### Iterating through the list of users with flagged spam reviews, as determined by the Classifier
Input : user_hrefs_list (list of user review pages links created from flagged_spammers.csv)
- flagged_spammers.csv is a file with the saved user_id and display_name of users who were flagged as spam by the spam_classifier
- This file was created by saving the results from the arango query of the goodreads database collection, spam_reviews

- The loop works by maintaining a current state (index) of how many users have been scraped (or attempted) and then appending to the csv file, spammer_reviews.csv
- every iteration creates a small list of links, linklst, whose elements are subsection of user_hrefs_list where the index is [state:state+niters]
- These are then used when calling the function, getUserReviews()

Output : spammers_reviews.csv file (contains book reviews made by flagged users if they are not private)

In [None]:
# Index into the list of user review-page links at which the scraping cell below starts.
# Run this cell once; re-running it resets the position to 0, so do not reset it
# until the following cell has iterated through every link.
state = 0

In [125]:
# Number of user review pages to attempt per run of this cell.
niters = 5

# loading dataframe containing flagged spammers user_id and display_name
flagged_spammers = pd.read_csv('flagged_spammers.csv')

# name of the file the scraped reviews are appended to
filename = 'spammer_reviews.csv'

# creating list of links to user review pages
user_hrefs_list = get_user_revpage_href(flagged_spammers)

# linklst is the next batch of (at most) niters links, starting at the current state
linklst = user_hrefs_list[state:state + niters]

# iterating over each user href to get reviews if available (None means nothing was scrapable)
for link in linklst:
    revs = getUserReviews(link)
    if revs is not None:
        # write the header only when the file is being created
        revs.to_csv(filename, index=False, mode='a', header=not os.path.isfile(filename))

    # Advance once per processed link rather than once per batch: if a later
    # link raises, re-running this cell resumes at the failing link instead of
    # re-scraping (and appending duplicate rows for) the links already saved.
    state += 1

### Insights
Initially, the flagged users from the arango query results numbered 134. However, upon inspection of duplicates, it was found that 3 users had reviewed at least 2 of the 3 books that the spam_classifier was tested on. This lowered the number of unique users flagged for spam to 131.

The selection also contained 44 private users, leading to the final file containing reviews for 87 out of the 131 flagged users

### Loading saved spammer_review file from previous scrape and cleaning before post-processing

In [None]:
# Read the saved spammer_reviews file produced by the finished scrape.
spammer_reviews = pd.read_csv(r"C:\Users\marty\OneDrive - The George Washington University\Documents\Applied Machine Learning Analytics\Project\spammer_reviews.csv")

# Normalise the column names: lower-case, with spaces turned into underscores.
spammer_reviews = spammer_reviews.rename(columns=lambda name: name.lower().replace(' ', '_'))

# A review with no text is stored as '\nNone\n\n'; swap it for '123' so that
# language detection labels it 'unknown' (digits carry no language features).
spammer_reviews['review'] = [
    '123' if review == '\nNone\n\n' else review
    for review in spammer_reviews['review']
]

### Determining the language of each review and creating column in dataframe that houses outcome

In [None]:
# langdetect is non-deterministic by default; fixing the seed makes the
# language labels reproducible from one run of the notebook to the next.
DetectorFactory.seed = 0


def detect_language(text, default_language='unknown'):
    """Return the language code langdetect assigns to ``text``.

    Parameters
    ----------
    text : str
        Text to classify.
    default_language : str, optional
        Value returned when no language can be determined.

    Returns
    -------
    str
        The detected language code, or ``default_language`` when ``text`` is
        not a string (e.g. a missing value read from the csv) or langdetect
        reports "No features in text".

    Raises
    ------
    LangDetectException
        Re-raised for any langdetect failure other than "No features in text".
    """
    if not isinstance(text, str):
        return default_language
    try:
        return detect(text)
    except LangDetectException as e:
        if "No features in text" in str(e):
            return default_language
        raise


lang = [detect_language(review) for review in spammer_reviews['review']]
spammer_reviews['lang'] = lang

### Iterating through spammer_reviews and translating reviews to english
- only initialize cell below once and then continue to run second cell to completion 

In [None]:
# Initial state for the translation cell below: an empty list that collects the
# translated reviews and the index of the next review to translate.
# Run once; re-running discards the translations collected so far.
translated_rev = []
state = 0

In [None]:
# Number of reviews, niters, to translate per run of this cell.
translator = Translator()
niters = 2000

# Next batch of reviews, starting at the current state.
lst = spammer_reviews['review'].iloc[state:state + niters]
for text in lst:
    translated_rev.append(translator.translate(text, dest='en').text)

    # Advance once per translated review rather than once per batch: if a
    # request fails part-way through, re-running this cell resumes at the
    # failing review instead of translating (and appending) the earlier
    # reviews a second time.
    state += 1

In [None]:
# Replace the original reviews with their English translations.
# Run only after the translation cell has been run to completion, so that
# translated_rev holds one entry per row of spammer_reviews.
spammer_reviews['review'] = translated_rev

In [7]:
# Load the complete dataset and discard the 'Unnamed: 0' column stored in the csv.
spammer_reviews_c_path = r"C:\Users\marty\OneDrive - The George Washington University\Documents\Applied Machine Learning Analytics\Project\spammer_reviews_c.csv"
spammer_reviews_c = pd.read_csv(spammer_reviews_c_path).drop(columns='Unnamed: 0')

### Saving version of the file without empty reviews

In [8]:
# Keep only the processed reviews whose language label is not 'unknown'
# (i.e. reviews that have text) and save that version.
has_text = spammer_reviews_c['lang'].ne('unknown')
full_revs = spammer_reviews_c[has_text]
full_revs.to_csv(r"C:\Users\marty\OneDrive - The George Washington University\Documents\Applied Machine Learning Analytics\Project\spammer_reviews_cfull.csv")

### Applying Spam Classifier to all applicable reviews from spammer_reviews

In [None]:
# Reviews with text, minus the 'Unnamed: 0' column stored in the csv.
cfull_path = r"C:\Users\marty\OneDrive - The George Washington University\Documents\Applied Machine Learning Analytics\Project\spammer_reviews_cfull.csv"
a = pd.read_csv(cfull_path).drop(columns='Unnamed: 0')

# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load a spam_detector.joblib that comes from a trusted source.
spam_detector = joblib.load('spam_detector.joblib')

In [None]:
label = []

# Label every review. A review that possesses no text features (ex. '5.0')
# makes the detector raise "No features in text" and is labelled 'n/a'.
for review in a['review']:
    try:
        # predict() returns a sequence; its first element is the label.
        # Unwrapping here, rather than indexing the whole column afterwards,
        # keeps the 'n/a' fallback intact (indexing the string 'n/a' with [0]
        # would truncate it to 'n').
        label.append(spam_detector.predict(review)[0])
    except LangDetectException as e:
        if "No features in text" in str(e):
            label.append('n/a')
        else:
            raise

# column in the dataframe holding the spam labels
a['spam'] = label

In [11]:
# Load the final, labelled dataset saved from the dataframe processed in the
# previous cell, dropping the 'Unnamed: 0' column stored in the csv.
cdone_path = r"C:\Users\marty\OneDrive - The George Washington University\Documents\Applied Machine Learning Analytics\Project\spammer_reviews_cdone.csv"
a = pd.read_csv(cdone_path).drop(columns='Unnamed: 0')
a.head()

Unnamed: 0,user_id,review_id,title,author,rating,review,date_added,lang,spam
0,6470167,500297253,أثر الفراشة,"Darwish, Mahmoud",4.03,[image]\nDid you know that there is a scientif...,"Jan 08, 2013",ar,0
1,6470167,483767778,مشكلة الأفكار في العالم الإسلامي,"ابن نبي, مالك",4.1,\nلا ينقضي عجبي بعد أنهيت هذا الكتاب ، فحينما ...,"Dec 23, 2012",ar,1
2,6470167,513086746,زمن الخيول البيضاء,"Nasrallah, Ibrahim",4.43,\n\n [image]\nزمن الخيول البيضاء..لا أعلم كيف...,"Jan 21, 2013",ar,1
3,6470167,254394165,رياض الصالحين,"النووي, يحيى بن شرف",4.65,May God have mercy on the learned and ascetic ...,"Jan 01, 2012",ar,0
4,6470167,359020135,الطريق إلى القرآن,"السكران, إبراهيم عمر",4.24,[image]\n“The path to the Qur’an”.... its name...,"Jul 01, 2012",ar,0


#### File Recap
- spammer_reviews.csv : original scraped dataset of flagged users spam reviews
- spammer_reviews_c.csv : dataset with translated reviews and additional column with label for the language of the original review
- spammer_reviews_cfull.csv : dataset stripped of any empty reviews 
- spammer_reviews_cdone.csv : dataset with additional column with spam label applied via spam_detector.joblib

spammer_reviews_cdone.csv is the final copy for upload into the arango database

File versions are maintained in an effort to keep whole dataset at each step if further analysis proves useful