In [60]:
import os
import re

import pandas as pd
from bs4 import BeautifulSoup

In [339]:
# Read one saved profile page and parse it into the notebook-level `soup`.
# os.path.join builds the same "profiles" + separator + file name path the
# original hard-coded with a backslash, but also works on kernels where the
# path separator is "/".
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm which encoding the saved pages actually have.
with open(os.path.join("profiles", "vicecity_profile_1.html"), 'r') as temp:
    profile_html = temp.read()

soup = BeautifulSoup(profile_html, "html.parser")

In [401]:

def get_profiles_info(profiles):
    """Parse saved vendor profile pages into four tables and write them to CSV.

    Each file name in ``profiles`` is read from the ``profiles`` directory and
    parsed with BeautifulSoup. Four per-profile tables are extracted (vendor
    summary, product listings, profile discussions, buyer feedback), the
    per-profile tables are concatenated, and each combined table is written to
    a ``VICECITY_*_info.csv`` file in the current working directory.

    Parameters
    ----------
    profiles : iterable of str
        File names (not full paths) of saved profile pages inside the
        ``profiles`` directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_vendors, df_products, df_discussions, df_feedback)``. Every
        frame is empty when ``profiles`` is empty. Row labels such as
        ``"product 1"`` restart for every profile, so they repeat in the
        combined frames; the ``IDENTIFIER`` column (the vendor name) tells
        the profiles apart.

    Raises
    ------
    AttributeError, IndexError, ValueError
        Pages that lack one of the expected elements are not handled
        specially; the failing lookup or conversion propagates.
    """

    def first_word(text):
        # str.split() with no argument already treats the non-breaking space
        # (U+00A0) as whitespace, so no explicit replacement is needed first.
        return text.split()[0]

    def rows_to_frame(label, rows):
        # Key each row "<label> 1", "<label> 2", ... and transpose so the keys
        # become the index and the row-dict keys become the columns.
        keyed = {label + " " + str(i + 1): row for i, row in enumerate(rows)}
        return pd.DataFrame(keyed).T

    #####################
    ###### PGP-key ######
    #####################
    def get_pgp_key(soup):
        """Return the text of the ``pgp_box`` div without the armor lines."""
        begin_tag = "-----BEGIN PGP PUBLIC KEY BLOCK-----"
        end_tag   = "-----END PGP PUBLIC KEY BLOCK-----"

        pgp_text = soup.find("div", {"class": "pgp_box"}).text.strip()

        # str.strip(chars) removes any of the given *characters*, not a
        # prefix/suffix, so it could eat the ends of the key itself and did
        # nothing when the text was wrapped in newlines. Remove the exact
        # markers instead.
        if pgp_text.startswith(begin_tag):
            pgp_text = pgp_text[len(begin_tag):]
        if pgp_text.endswith(end_tag):
            pgp_text = pgp_text[:-len(end_tag)]

        return pgp_text.strip()

    #####################
    ###### Vendor #######
    #####################
    def get_vendor_info(soup):
        """Return a one-row frame (index ``"Vendor"``) describing the vendor."""
        profile_div = soup.find("div", {"class": "profile_image center"})
        vendor_name = profile_div.find("span").text

        # Exactly two spans are expected here; any other count fails the
        # unpacking with ValueError.
        col_4_spans = soup.find("div", {"class": "col-4"}).find_all("span")
        last_seen, join_date = [span.text for span in col_4_spans]

        bio = soup.find("div", {"class": "bubble"}).find("p").text

        # The value sits two siblings after its <label>; for the feedback
        # score the *second* label containing "Feedback" is the one used.
        sales = int(soup.select("label:contains(Sales)")[0].next_sibling.next_sibling.text)
        feedback_score = int(soup.select("label:contains(Feedback)")[1].next_sibling.next_sibling.text)

        vendor_info = {"Vendor": {"Name": vendor_name,
                                  "Last seen": last_seen,
                                  "Join date": join_date,
                                  "Sales": sales,
                                  "Feedback Score": feedback_score,
                                  "Bio": bio,
                                  "PGP-key": get_pgp_key(soup)}}

        return pd.DataFrame(vendor_info).T

    #####################
    ###### Products #####
    #####################
    def get_products(soup, identifier):
        """Return one row per product listing, tagged with ``identifier``."""
        # The number of "wLfRight" containers decides how many rows are built.
        products = soup.find_all("div", {"class": "wLfRight"})
        product_names = [div.get_text() for div in soup.find_all("div", {"class": "wLfName"})]

        # Text before "USD", thousands separators removed. Only the part
        # before the decimal point is kept, so cents are dropped.
        price_texts = [div.get_text().split("USD")[0] for div in soup.find_all("div", {"class": "wLfPrice"})]
        product_prices = [int(float(text.split(".")[0].replace(",", ""))) for text in price_texts]

        vendor_names = [first_word(div.get_text()) for div in soup.find_all("div", {"class": "wLfVendor"})]

        rows = [{"Product": product_names[i],
                 "Price": product_prices[i],
                 "Vendor": vendor_names[i],
                 "IDENTIFIER": identifier} for i in range(len(products))]

        return rows_to_frame("product", rows)

    #####################
    #Profile Discussions#
    #####################
    def get_discussion_info(soup, identifier):
        """Return one row per profile discussion post, tagged with ``identifier``."""
        discussion_bodies = soup.find_all("div", {"class": "discussion_body"})
        discussion_texts = [body.find("p").text for body in discussion_bodies]

        header_blocks = soup.find_all("div", {"class": "discussionHeaderBlock"})

        # Header blocks without a link / span are skipped.
        # NOTE(review): skipping can shift posters or dates relative to the
        # bodies if only some headers lack the element -- confirm against the
        # page structure.
        posters = [first_word(header.find("a").text.split("<span ")[0])
                   for header in header_blocks if header.find("a") is not None]
        post_dates = [header.find_all("span")[1].text.strip()
                      for header in header_blocks if header.find("span") is not None]

        # The row count comes from the discussion bodies themselves. It used
        # to come from a name `products` that is not defined in this function
        # and only resolved if the notebook happened to hold such a global.
        rows = [{"Poster": posters[i],
                 "Text": discussion_texts[i],
                 "Date": post_dates[i],
                 "IDENTIFIER": identifier} for i in range(len(discussion_bodies))]

        return rows_to_frame("Discussion", rows)

    #####################
    ### Buyer Feedback ##
    #####################
    def get_feedback(soup, identifier):
        """Return one row per buyer feedback entry, tagged with ``identifier``."""
        feedbacks = soup.find_all("div", {"class": "feedback"})
        feedback_subs = soup.find_all("div", {"class": "feedback_subheader"})

        feedback_products = [feedback.find_all("a")[0].text.strip() for feedback in feedbacks]
        posted_by = [first_word(feedback.find_all("a")[2].text) for feedback in feedbacks]
        # NOTE(review): splitting on "â" looks like it targets a mis-decoded
        # multi-byte character; whether it matches depends on the encoding the
        # file was opened with -- confirm before changing either.
        posted_on = [feedback.find_all("span")[0].text.split("â")[0] for feedback in feedbacks]

        # A feedback entry without any <p> gets an empty text.
        paragraphs = [feedback.find_all("p") for feedback in feedbacks]
        feedback_texts = [(found[0].text if len(found) > 0 else "") for found in paragraphs]

        # First token of the right-floated div, minus its first character,
        # parsed as int (the first character is presumably a currency symbol).
        approx_amounts = [int(sub.find("div", {"style": "float:right"}).text.strip().split()[0][1:])
                          for sub in feedback_subs]

        rows = [{"Posted by": posted_by[i],
                 "Posted on": posted_on[i],
                 "Product": feedback_products[i],
                 "Feedback text": feedback_texts[i],
                 "Approximate amount (USD)": approx_amounts[i],
                 "IDENTIFIER": identifier} for i in range(len(feedbacks))]

        return rows_to_frame("Feedback", rows)

    def combine(frames):
        # pd.concat raises on an empty list, so an empty `profiles` input
        # still yields (and writes) empty tables.
        return pd.concat(frames) if frames else pd.DataFrame()

    vendor_frames = []
    product_frames = []
    discussion_frames = []
    feedback_frames = []

    for profile in profiles:
        # NOTE(review): no encoding is passed, so the platform default is used.
        with open(os.path.join("profiles", profile), 'r') as temp:
            profile_html = temp.read()

        soup = BeautifulSoup(profile_html, "html.parser")

        # Parse the vendor block once and reuse its name as the identifier
        # for the other three tables.
        vendor_frame = get_vendor_info(soup)
        identifier = vendor_frame["Name"].iloc[0]

        vendor_frames.append(vendor_frame)
        product_frames.append(get_products(soup, identifier))
        discussion_frames.append(get_discussion_info(soup, identifier))
        feedback_frames.append(get_feedback(soup, identifier))

    # Concatenate once per table instead of growing a frame inside the loop.
    df_vendors = combine(vendor_frames)
    df_products = combine(product_frames)
    df_discussions = combine(discussion_frames)
    df_feedback = combine(feedback_frames)

    df_vendors.to_csv("VICECITY_vendor_info.csv")
    df_products.to_csv("VICECITY_products_info.csv")
    df_discussions.to_csv("VICECITY_discussions_info.csv")
    df_feedback.to_csv("VICECITY_feedback_info.csv")

    return df_vendors, df_products, df_discussions, df_feedback
       
    
    
    
    

In [402]:
# File names of the saved profile pages to scrape.
profiles = [
    "vicecity_profile_1.html",
    "vicecity_profile_2.html",
]

# One DataFrame per table, unpacked in the order get_profiles_info returns them.
vendors, products, discussions, feedback = get_profiles_info(profiles)

In [407]:
# Display the discussions table unpacked from get_profiles_info's return value.
discussions

Unnamed: 0,Poster,Text,Date,IDENTIFIER
Discussion 1,t****7,Hi I wanted to buy your 7days tutorial listing,November 2021,Fraudbuddy
Discussion 2,fraudbuddy,PROOF PROOF PROOF,May 2021,Fraudbuddy
Discussion 3,a****2,He buys from himself to gives himself feedbac...,May 2021,Fraudbuddy
Discussion 4,bajie1,careful with your words so as not to be the on...,May 2021,Fraudbuddy
Discussion 5,a****2,Do not buy he is a professional scammers\nHes ...,May 2021,Fraudbuddy
Discussion 6,fraudbuddy,please before you come here and be sending fud...,February 2021,Fraudbuddy
Discussion 7,d****9,Hey he has a good REPUTATION WITH ME SENT HIM ...,February 2021,Fraudbuddy
Discussion 8,S****o,aint nothing more important then the mula,January 2021,Fraudbuddy
Discussion 9,S****3,Hey looking for a stable partner.,December 2020,Fraudbuddy
Discussion 10,fraudbuddy,lol your so funny,August 2020,Fraudbuddy


'Fraudbuddy'