# Fragrantica 웹 크롤러

## 1. 사전작업

### 1-1. 필요한 패키지 설치

In [1]:
# 이 부분은 처음 한 번만 실행하면 됨.
# 코드 수정 - "The reason is that the last Ubuntu update update supports chromium driver just via snap."
# 최근 우분투 업데이트에서 크롬 드라이버 설치를 snap을 이용해서만 하도록 바뀜
# 고로 snap 없이 설치하는 아래 우회 코드로 변경
# 출처 : https://colab.research.google.com/drive/1cbEvuZOhkouYLda3RqiwtbM-o9hxGLyC
# 출처2 : https://stackoverflow.com/questions/75155063/selenium-use-chrome-on-colab-got-unexpectedly-exited

%%shell
# Installs chromium and chromium-driver from the Debian buster repositories,
# then installs the selenium and selenium-stealth Python packages.
#
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
# Each entry is tied (signed-by) to one of the keyring files written in the
# "Add keys" step below.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
# Fetch the three signing keys from the Ubuntu keyserver.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file referenced by the matching
# signed-by= option in debian.list above.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
# Priorities: chromium* from deb.debian.org = 700, release a=eoan = 500,
# every other package from deb.debian.org = 300.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# NOTE(review): no -y flag is passed to apt-get install; confirm the cell does
# not stop at a confirmation prompt in the target runtime.
apt-get update
apt-get install chromium chromium-driver

# Install selenium
# NOTE(review): neither package is version-pinned, so the installed Selenium
# API may differ between runs.
pip install selenium
pip install selenium-stealth

Executing: /tmp/apt-key-gpghome.BH2F1nKaiZ/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.lrUjGb4KuX/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.cdf4i5xkXD/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://security.ubuntu.com/ubuntu



### 1-2. 필요한 라이브러리 import

In [2]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
import os
from datetime import datetime, timedelta
import pandas as pd

### 1-3. 구글 드라이브 마운트

In [3]:
# Mount Google Drive at /content/drive; the CSV and image paths used by the
# rest of the notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1-4. 크롤링 결과를 저장할 파일 생성

In [4]:
# Header rows for the two result files (one list per CSV row).
fragrantica_perfumes = [["perfume_no", "perfume", "brand", "gender", "review_count", "avg_rating", "review_votes", "accords", "review_avg", "notes", "LSGP", "description"]]
fragrantica_reviews = [["user_name", "perfume_no", "rating", "spring", 'summer', 'fall', 'winter', 'day', 'night']]


def _create_csv_if_missing(path, rows):
  """Write `rows` to a new CSV file at `path` unless something already exists there.

  The same `path` is used for both the existence check and the write, so the
  two can never point at different spellings of the Drive folder.

  Args:
    path: Destination file path.
    rows: Iterable of rows (each an iterable of cell values) to write.

  Raises:
    OSError: If the file has to be created but cannot be opened for writing
      (for example, the parent folder does not exist).
  """
  if os.path.exists(path):
    print("파일이 있습니다.")
    return
  with open(path, mode='w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows(rows)
  print('파일이 없어서 생성했음')


_create_csv_if_missing('/content/drive/My Drive/fragrantica/fragrantica_perfumes.csv', fragrantica_perfumes)
_create_csv_if_missing('/content/drive/My Drive/fragrantica/fragrantica_reviews.csv', fragrantica_reviews)

파일이 있습니다.
파일이 있습니다.


### 1-4. Selenium 설정

In [5]:
# Chrome options used for every driver created from `options` in this notebook.
options = webdriver.ChromeOptions()
options.add_argument('--headless')        # run Chrome without a visible window
options.add_argument('--no-sandbox')      # disable Chrome's sandbox
options.add_argument('--disable-dev-shm-usage')  # do not use /dev/shm for shared memory
# Pass only `options`, by keyword. Newer Selenium 4 releases no longer accept a
# driver path as the first positional argument, and the install cell does not
# pin a Selenium version. The chromedriver binary is then looked up by
# Selenium itself (it must be discoverable, e.g. on PATH).
driver = webdriver.Chrome(options=options)

### 1-5. review 이모티콘과 매칭할 변수 생성

In [6]:
# Load the reference markup for each review icon. Row 0 of every column is
# parsed as HTML and its first <svg> element is kept for later comparison.
review_class = pd.read_csv('/content/drive/My Drive/fragrantica/fragrantica_review_class.csv')

(love, like, ok, dislike, hate,
 winter, spring, summer, fall,
 day, night) = (
  BeautifulSoup(review_class[label][0], "html.parser").find('svg')
  for label in (
    'love', 'like', 'ok', 'dislike', 'hate',
    'winter', 'spring', 'summer', 'fall',
    'day', 'night',
  )
)

### 1-6. 향수 URL 파일 불러오기

In [19]:
# Alternative input (disabled): the full fragrantica list, read with explicit
# column names because that file is loaded with names=[...] rather than a header.
# urls = pd.read_csv('/content/drive/My Drive/fragrantica/fragrantica.csv', names=['brand', 'name', 'url']) # fragrantica
# Active input: the "base note top 1000" matched list. The crawl loop reads the
# 'url', 'frag_no', 'name' and 'brand' columns from this table.
urls = pd.read_csv('/content/drive/My Drive/fragrantica/base_note_top_1000_matched.csv') # base note top 1000

## 2. 실행 전 설정해야 할 값


### 2-1.  시작 Index, 종료 Index 설정

In [32]:
# Row range of `urls` to crawl; the crawl loop iterates range(start, end),
# so `start` is included and `end` is excluded.
# `start` is set by hand (e.g. to resume after an interrupted run).
start = 129
end = len(urls)

## 3. 크롤링 

In [9]:
# Close whichever browser session `driver` currently refers to (e.g. the one
# from the Selenium setup cell); the crawl loop opens its own driver for each page.
driver.quit()

### 3-1. 크롤링 진행하면서 각 페이지당 결과 저장됨

In [33]:
def get_driver(options):
  """Create and return a new Chrome WebDriver session.

  Args:
    options: The ChromeOptions instance to start the browser with.

  Returns:
    A new `webdriver.Chrome` instance. The caller is responsible for calling
    `quit()` on it.
  """
  # Pass only `options`, by keyword: newer Selenium 4 releases no longer accept
  # a driver path as the first positional argument. The chromedriver binary is
  # then looked up by Selenium itself (it must be discoverable, e.g. on PATH).
  return webdriver.Chrome(options=options)

# Crawl every perfume page in urls[start:end]. For each page this appends one
# row to the perfumes CSV, one row per usable review to the reviews CSV, saves
# the perfume image, and prints a summary.
#
# To crawl the full fragrantica.csv list instead of the "base note top 1000"
# list, use `i` in place of urls['frag_no'][i] and write to
# fragrantica_perfumes.csv / fragrantica_reviews.csv instead of the
# *_top_1000.csv files.

# Imported once here (not on every iteration) so this cell runs on its own.
import urllib.request

# (reference icon, index in user_review, value stored there).
# user_review layout: [name, perfume number, rating, spring, summer, fall, winter, day, night]
# The first matching entry wins, exactly like an if/elif chain.
icon_slots = [
  (love, 2, 5),
  (like, 2, 4),
  (ok, 2, 3),
  (dislike, 2, 2),
  (hate, 2, 1),
  (winter, 6, "winter"),
  (spring, 3, "spring"),
  (summer, 4, "summer"),
  (fall, 5, "fall"),
  (day, 7, "day"),
  (night, 8, "night"),
]

for i in range(start, end):
  driver = get_driver(options)
  # try/finally guarantees the browser is closed even when a page is missing an
  # expected element and parsing raises; the exception still propagates so the
  # run stops at the failing index.
  try:
    stealth(driver,
          languages=["en-US", "en"],
          vendor="Google Inc.",
          platform="Win32",
          webgl_vendor="Intel Inc.",
          renderer="Intel Iris OpenGL Engine",
          fix_hairline=True,
          )

    driver.get(urls['url'][i] + "#all-reviews")
    time.sleep(10)  # fixed wait before reading the page source

    perfume_no = urls['frag_no'][i]

    # Parse the page as currently rendered by the browser.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    gender = soup.find("div", {"id" : "toptop"}).find("small").text

    # Main accords: [label, 4th style declaration value, 2nd style declaration value]
    # (printed in the captured output as e.g. ['floral', '100%', 'rgb(255, 95, 141)']).
    accords = []
    for accord_box in soup.find_all("div", {"class" : "cell accord-box"}):
      accord_bar = accord_box.find("div", {"class" : "accord-bar"})
      style_parts = accord_bar['style'].split(';')
      accords.append([accord_bar.text, style_parts[3].split(':')[1].strip(), style_parts[1].split(':')[1].strip()])

    # Overall rating and vote count; both stay 0 when the page has no aggregateRating block.
    review_rating = 0
    review_votes = 0
    if soup.find('div', {'itemprop' : 'aggregateRating'}) is not None:
      info_spans = soup.find('p', {'class' : 'info-note'}).find_all('span')
      review_rating = info_spans[0].text
      review_votes = info_spans[2].text.replace(',', '')

    # Per-category vote bars: [label, 4th style declaration value], or ['', '']
    # when the bar's style attribute has fewer than four ';'-separated parts.
    review_avg = []
    for vote_cell in soup.find_all('div', {'class' : 'grid-x grid-margin-x grid-margin-y'})[3].find_all('div', {'style' : "display: flex; flex-direction: column; justify-content: space-around; cursor: pointer;"}):
      bar_style = vote_cell.find('div', {'class' : 'voting-small-chart-size'}).find('div').find('div')['style'].split(';')
      if len(bar_style) > 3:
        label = vote_cell.find('div', {'style' : 'display: flex; justify-content: center;'}).find('span').text
        review_avg.append([label, bar_style[3].split(':')[1].strip()])
      else:
        review_avg.append(['', ''])

    description = soup.find('div', {'itemprop' : 'description'}).find('p').text.strip()

    # Notes pyramid: one inner list per note row; each note is identified by
    # the third '.'-separated piece of its image URL.
    notes = []
    pyramid = soup.find('div', {'id' : 'pyramid'})
    if pyramid is not None:
      for note_row in pyramid.find_all('div', {'style' : 'display: flex; justify-content: center; text-align: center; flex-flow: row wrap; align-items: flex-end; padding: 0.5rem;'}):
        note = []
        for note_div in note_row.find_all('div'):
          note_img = note_div.find('img')
          # Only divs that contain both an <img> and a nested <div> are counted.
          if note_img is not None and note_div.find('div') is not None:
            note.append(note_img['src'].split('.')[2])
        notes.append(note)

    # Vote panels: [panel title, [name, legend, name, legend, ...]]
    LSGP = []
    for panel in soup.find_all('div', {'style' : 'padding: 2rem 1.5rem 0px;'}):
      panel_votes = []
      for vote_row in panel.find_all('div', {'class' : 'grid-x grid-margin-x'}):
        panel_votes.append(vote_row.find('span', {'class' : 'vote-button-name'}).text)
        panel_votes.append(vote_row.find('span', {'class' : 'vote-button-legend'}).text)
      LSGP.append([panel.find('span', {'style' : 'font-size: small;'}).text, panel_votes])

    # User reviews: keep only reviews in which at least one icon was recognised.
    review_list = []
    for review in soup.find('div', {'class' : 'grid-x grid-padding-x grid-margin-y'}).find_all('div', {'itemprop' : 'review'}):
      user_name = review.find('b').text
      user_review = [user_name, perfume_no, '', '', '', '', '', '', '']

      for vote_item in review.find('div', {'class' : 'perfume-vote-box'}):
        target = vote_item.find('svg')
        for icon, slot, value in icon_slots:
          if target == icon:
            user_review[slot] = value
            break

      if any(field != '' for field in user_review[2:9]):
        review_list.append(user_review)

    # Save results.
    # NOTE: rows are appended before the image is downloaded, so if the download
    # below fails, restarting from the same index appends these rows again.
    result = [perfume_no, urls['name'][i], urls['brand'][i], gender, len(review_list), review_rating, review_votes, accords, review_avg, notes, LSGP, description] # base note top 1000

    with open('/content/drive/My Drive/fragrantica/fragrantica_perfumes_top_1000.csv', mode='a', newline='', encoding='utf-8') as file:
      csv.writer(file).writerow(result)

    with open('/content/drive/My Drive/fragrantica/fragrantica_reviews_top_1000.csv', mode='a', newline='', encoding='utf-8') as file:
      csv.writer(file).writerows(review_list)

    # Save the perfume image, named after the perfume number.
    img_url = soup.find('img', {'itemprop': 'image'})['src']
    urllib.request.urlretrieve(img_url, '/content/drive/My Drive/fragrantica/img/' + str(perfume_no) + ".png")

    # Progress log. The timestamp is the local clock shifted by +9 hours
    # (presumably UTC -> KST; confirm the runtime clock is UTC).
    print('===========================', i, 'of', end, '==========================', (datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S"))
    print('no :', perfume_no)
    print('brand :', urls['brand'][i])
    print('name :', urls['name'][i])
    print('gender :', gender)
    print('main accord :', accords)
    print('review rating :', review_rating)
    print('review votes :', review_votes)
    print('review avg :', review_avg)
    print('description :', description)
    print('notes :', notes)
    print('LSGP :', LSGP)
    print('review_count :', len(review_list))
    print(review_list)
    print()
  finally:
    driver.quit()

  # Pause between pages.
  time.sleep(30)



no : 8089
brand : Creed
name : love in white
gender : for women
main accord : [['floral', '100%', 'rgb(255, 95, 141)'], ['citrus', '92.387%', 'rgb(249, 255, 82)'], ['yellow floral', '67.2482%', 'rgb(255, 220, 16)'], ['white floral', '63.5121%', 'rgb(237, 242, 251)'], ['powdery', '61.7474%', 'rgb(238, 221, 204)'], ['iris', '59.945%', 'rgb(183, 167, 215)'], ['sweet', '59.5977%', 'rgb(238, 54, 59)'], ['savory', '58.4994%', 'rgb(227, 200, 130)'], ['woody', '53.6368%', 'rgb(119, 68, 20)'], ['green', '53.2802%', 'rgb(14, 140, 29)']]
review rating : 3.70
review votes : 2395
review avg : [['love', '83.2579%'], ['like', '100%'], ['ok', '15.3846%'], ['dislike', '66.7421%'], ['hate', '5.54299%'], ['winter', '25.8544%'], ['spring', '91.8276%'], ['summer', '63.5958%'], ['fall', '34.1753%'], ['day', '100%'], ['night', '22.734%']]
description : Love in White by Creed is a Amber Floral fragrance for women. Love in White was launched in 2005. Love in White was created by Olivier Creed and Erwin Creed. 

AttributeError: ignored