# 01 Webscraping Clean

#### Rens og filtrér hentet data

In [138]:
import csv
import glob
import os
import pickle
import random
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import bs4
import pandas as pd
import requests
from tqdm.notebook import tqdm

# Shared by all worker threads: multithreadwritecsvfile appends one entry for
# each review it has to skip; the run cell only reports the list's length.
reviewErrorCount = [] # used in multithreadwritecsvfile
# Directory holding the .dump inputs and the generated .csv files. The trailing
# slash matters because several paths are built by plain string concatenation.
dataDir = './data/'

### Functions

In [145]:
def dumpFileNames():
    """Return the base names of all ``.dump`` files located directly in ``dataDir``.

    Both the directory part and the ``.dump`` extension are removed, so
    ``./data/boksen.dump`` is reported as ``'boksen'``.

    Returns:
        list[str]: The base names, sorted alphabetically.
    """
    pattern = os.path.join(dataDir, '*.dump')
    # basename/splitext cope with whichever path separator glob returns and
    # strip only the final extension; plain str.replace on the path did neither.
    # glob yields matches in arbitrary order, hence the explicit sort.
    return sorted(
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(pattern)
    )

def loadRawData(*args):
    """Load the pickled dumps of the given shops and concatenate their contents.

    Args:
        *args: Base names (no directory, no ``.dump`` extension) of the dump
            files inside ``dataDir`` that should be loaded.

    Returns:
        list: The items of every loaded dump, in the order the names were
        given. An empty list when no names are passed.
    """
    allPagesRequest = []
    for arg in args:
        # SECURITY: unpickling can execute arbitrary code. Only load dump
        # files that were produced by this project itself.
        with open(dataDir + arg + '.dump', 'rb') as file:
            # Extend in place instead of building a new list with `+` for
            # every file, which copied all previously loaded items each time.
            allPagesRequest.extend(pickle.load(file))
    return allPagesRequest

In [141]:
# Strip emojis and other pictographic symbols from a string.
def remove_emojis(data):
    """Return ``data`` with every run of characters from the emoji class removed.

    NOTE(review): the character class is far wider than emoji. The range
    U+24C2..U+1F251 alone also spans e.g. the CJK ideograph blocks, so such
    text is deleted as well. Confirm that this is intended.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # (same range repeated; harmless)
        u"\U000024C2-\U0001F251"  # very broad catch-all range
        u"\U0001f926-\U0001f937"  # face palm .. shrug
        u"\U00010000-\U0010ffff"  # every supplementary-plane code point
        u"\u2640-\u2642"  # gender signs
        u"\u2600-\u2B55"  # misc. symbols .. heavy large circle
        u"\u200d"  # zero width joiner
        u"\u23cf"  # eject symbol
        u"\u23e9"  # fast-forward symbol
        u"\u231a"  # watch
        u"\ufe0f"  # variation selector-16
        u"\u3030"  # wavy dash
    )
    emoji_pattern = re.compile("[" + emoji_ranges + "]+", re.UNICODE)
    return emoji_pattern.sub('', data)

# Lowercase the text, drop commas and double quotes, and normalise whitespace.
def data_cleaning(data):
    """Return a lowercased, single-line version of ``data`` for the CSV export.

    Commas are replaced by a space and double quotes are removed. Every run
    of whitespace (spaces, tabs, ``\\r``, ``\\n``) is then collapsed to a
    single space, and leading/trailing whitespace is stripped.

    Args:
        data (str): Raw review text.

    Returns:
        str: The cleaned text; an empty string if nothing but whitespace,
        commas and quotes was present.
    """
    data = data.lower()
    data = data.replace(',', ' ')
    data = data.replace('"', '')
    # Collapse instead of delete: removing double spaces outright glued the
    # words on either side of every ", " together ("good, fast" -> "goodfast").
    return ' '.join(data.split())

# Derive the shop name used for file names from a page's URL.
def createFiles(url):
    """Return one dot-separated piece of ``url.url`` to be used as a file name.

    The URL string is split on ``'.'``. With exactly four pieces, the third
    path segment of the third piece is returned; otherwise the fourth piece
    is returned unchanged.

    NOTE(review): presumably tuned to review URLs of the form
    ``.../review/www.<shop>.<tld>`` and ``.../review/<shop>.<tld>``; other
    URL shapes may yield an unexpected name or raise IndexError -- confirm.

    Args:
        url: Object whose ``url`` attribute holds the request URL.
    """
    pieces = str(url.url).split('.')

    if len(pieces) != 4:
        return pieces[3]

    path_segments = pieces[2].split('/')
    return path_segments[2]

# Check whether a file already exists, so that it is not appended to again.
def checkFileExists(filename):
    """Report whether ``filename`` can be opened for reading.

    Args:
        filename: Path of the file to probe.

    Returns:
        int: ``0`` if the file could be opened, ``1`` if opening failed
        (note the inverted sense: 0 means "exists").
    """
    try:
        with open(filename, 'rb'):
            return 0
    except OSError:
        # Only I/O failures (missing file, directory, no permission) count as
        # "not there"; a bare except would also hide KeyboardInterrupt and
        # genuine programming errors.
        return 1

In [142]:
# Serialises the CSV appends: several worker threads can be handling pages of
# the same shop -- and therefore the same output file -- at the same time.
_csvWriteLock = threading.Lock()


# Parse one downloaded page with bs4 and append its reviews to the CSV file
# named after the shop found in the request URL.
def multithreadwritecsvfile(page):
    """Extract the rated reviews of one page and append them to the shop's CSV.

    Meant to be mapped over the pages by a thread pool. Relies on the
    module-level names ``loadingbar`` (progress bar), ``dataDir``,
    ``createFiles``, ``remove_emojis``, ``data_cleaning`` and
    ``reviewErrorCount``.

    Every written row is ``['__label__<rating> ', <cleaned review text>]``,
    where ``<rating>`` is the third word of the rating image's alt text.
    A review without a text paragraph, or with usable text but no usable
    rating, is skipped and recorded in ``reviewErrorCount``. A review whose
    text is at most two characters long, or empty after cleaning, is skipped
    without being recorded.

    Args:
        page: Response-like object exposing ``url`` and ``content``.
    """
    loadingbar.update(1)

    filename = dataDir + createFiles(page) + '.csv'
    soup = bs4.BeautifulSoup(page.content, 'html.parser')

    rows = []
    for content in soup.find_all('section', attrs={'class': 'styles_reviewContentwrapper__zH_9M'}):
        label = None
        ratingError = None
        try:
            altWords = content.find('img', alt=True).get('alt').split()
            label = '__label__' + altWords[2] + " "
        except (AttributeError, IndexError) as e:
            # No <img alt=...> in this section, or an alt text with fewer
            # than three words.
            ratingError = e
            print('No rating found. Skipping...', e)

        paragraph = content.find('p', attrs={'class': 'typography_typography__QgicV typography_body__9UBeQ typography_color-black__5LYEn typography_weight-regular__TWEnf typography_fontstyle-normal__kHyN3'})
        if paragraph is None:
            reviewErrorCount.append(AttributeError('review text paragraph not found'))
            continue

        review = paragraph.text
        if len(review) <= 2:
            continue

        review = data_cleaning(remove_emojis(review))
        if not review:
            continue

        if label is None:
            # There is text but nothing to label it with: count it as skipped
            # explicitly instead of relying on a TypeError from None[2].
            reviewErrorCount.append(ratingError)
            continue

        rows.append([label, review])

    # Write all rows of this page in one locked open/append/close cycle so
    # that rows from different threads cannot interleave inside the file.
    with _csvWriteLock:
        with open(filename, 'a', newline='', encoding='utf-8') as output_file:
            csv.writer(output_file).writerows(rows)

In [143]:
# List the CSV files that are available in dataDir.
def csvFileNames():
    """Return the names of all ``.csv`` files located directly in ``dataDir``.

    The directory part is removed but the ``.csv`` extension is kept, so
    ``./data/boksen.csv`` is reported as ``'boksen.csv'``.

    Returns:
        list[str]: The file names, sorted alphabetically.
    """
    pattern = os.path.join(dataDir, '*.csv')
    # basename works with whichever path separator glob returns, unlike
    # stripping the dataDir prefix with str.replace. glob yields matches in
    # arbitrary order, hence the explicit sort.
    return sorted(os.path.basename(path) for path in glob.glob(pattern))

def mergeCsvFiles(fileNamesInput):
    """Concatenate the given per-shop CSV files into ``combined_csv.csv``.

    The input files are read from ``dataDir`` and the combined file is
    written to ``dataDir`` as well. The list of merged file names is printed.

    Args:
        fileNamesInput: Iterable of base names (without ``.csv``) to merge.

    Raises:
        FileNotFoundError: If one of the CSV files does not exist.
        ValueError: If ``fileNamesInput`` is empty (nothing to concatenate).
    """
    fileNames = [name + '.csv' for name in fileNamesInput]

    # Address the files by path instead of os.chdir-ing into dataDir: the
    # working directory is process-wide, and a failing read used to leave it
    # changed, which broke every later relative './data/' path.
    combinedCSV = pd.concat(
        [pd.read_csv(os.path.join(dataDir, f)) for f in fileNames]
    )
    print(fileNames)

    # export to csv
    combinedCSV.to_csv(
        os.path.join(dataDir, 'combined_csv.csv'),
        index=False,
        encoding='utf-8',
    )

### Run

In [147]:
# Show which shops have a .dump file available in dataDir.
dumpFileNames()

['boksen',
 'contourdesign',
 'cphbusiness',
 'eboligskoedet',
 'gamecastle',
 'isports',
 'jyskmobelfabrik',
 'komplett',
 'sas',
 'tapeconnection',
 'thyrep']

In [149]:
# Load the pickled dumps of the listed shops into one flat list of pages.
allPagesRequest = loadRawData(
 'boksen',
 'contourdesign',
 'cphbusiness',
 'eboligskoedet',
 'gamecastle',
 'isports',
 'jyskmobelfabrik',
 'komplett',
 'sas',
 'tapeconnection',
 'thyrep'
)

In [150]:
# Start every shop's CSV from scratch with just the header row. Many pages
# map to the same file, so collect the distinct paths first instead of
# truncating and rewriting the same file once per page.
csvPaths = {dataDir + createFiles(shop) + '.csv' for shop in allPagesRequest}
for filename in sorted(csvPaths):
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        csv.writer(output_file).writerow(['rating', 'review'])

# Reset the counter so that re-running this cell does not add to the
# previous run's total.
reviewErrorCount.clear()

with tqdm(total=len(allPagesRequest)) as loadingbar:
    with ThreadPoolExecutor(8) as ex:
        # Consume the result iterator: an exception raised inside a worker is
        # only re-raised when its result is retrieved, so an unconsumed map()
        # would hide failures silently.
        list(ex.map(multithreadwritecsvfile, allPagesRequest))

print('Done. Skipped reviews:', len(reviewErrorCount))


HBox(children=(FloatProgress(value=0.0, max=509.0), HTML(value='')))


Done. Skipped reviews: 875


In [152]:
# Show which CSV files are currently present in dataDir.
csvFileNames()

['boksen.csv',
 'combined_csv.csv',
 'contourdesign.csv',
 'cphbusiness.csv',
 'eboligskoedet.csv',
 'gamecastle.csv',
 'isports.csv',
 'jyskmobelfabrik.csv',
 'komplett.csv',
 'sas.csv',
 'sklearn.csv',
 'tapeconnection.csv',
 'thyrep.csv']

In [154]:
# Merge the per-shop CSV files into combined_csv.csv inside dataDir.
mergeCsvFiles([
 'boksen',
 'contourdesign',
 'cphbusiness',
 'eboligskoedet',
 'gamecastle',
 'isports',
 'jyskmobelfabrik',
 'komplett',
 'sas',
 'tapeconnection',
 'thyrep'
])

['boksen.csv', 'contourdesign.csv', 'cphbusiness.csv', 'eboligskoedet.csv', 'gamecastle.csv', 'isports.csv', 'jyskmobelfabrik.csv', 'komplett.csv', 'sas.csv', 'tapeconnection.csv', 'thyrep.csv']
