In [2]:
import requests
from bs4 import BeautifulSoup
import re

from config import gaishi, minshuu, vorkers
from selenium import webdriver
import os
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

import json
from pprint import pprint

%matplotlib inline

In [1]:
# Companies to scrape, keyed by display name.
#   "name":    ASCII slug substituted into the output-file name formats
#   "gaishi":  company number substituted into the gaishishukatsu.com study-page URL
#   "vorkers": company id substituted into the vorkers.com ``m_id`` query parameter
company_dics = {
    "三菱商事": {
        "name": "mitsubishi_shouji",
        "gaishi": "37",
        "vorkers": "a0910000000Frj0",
    },
}

In [None]:
# Output-path templates; ``%s`` is replaced with the company's "name" slug.
SR_SAVE_FORMAT = "./data/selection_reports/%s.json"
RV_SAVE_FORMAT = "./data/reviews/%s.json"


def _save_json(obj, path):
    """Write ``obj`` to ``path`` as JSON, creating the parent directory if needed.

    Args:
        obj: JSON-serialisable object.
        path: Destination file path; an existing file is overwritten.
    """
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    # The file must be opened for writing: json.dump() writes to the handle.
    with open(path, "w") as f:
        json.dump(obj, f)


# NOTE: GaishiDriver and VorkersDriver are defined in cells further down this
# notebook; those cells have to be executed before this one.
# NOTE: both drivers are left running after the loops so they can be reused;
# call ``<driver>.driver.quit()`` when finished to stop the browser process.

# gaishishukatsu.com: selection reports
gaishi_driver = GaishiDriver()
for company, company_dic in company_dics.items():
    print(company)
    name = company_dic["name"]
    # The path depends on the company, so it is built inside the loop.
    SR_SAVE_PATH = SR_SAVE_FORMAT % name
    gaishi_driver.access_sr_list_page(company_number=company_dic["gaishi"])
    sr_urls = gaishi_driver.get_sr_urls()
    srs = gaishi_driver.get_each_sr(sr_urls)
    _save_json(srs, SR_SAVE_PATH)

# Vorkers: scores and reviews
vorkers_driver = VorkersDriver()
for company, company_dic in company_dics.items():
    print(company)
    name = company_dic["name"]
    RV_SAVE_PATH = RV_SAVE_FORMAT % name
    vorkers_driver.access_top_page(company_id=company_dic["vorkers"])
    vorkers_driver.get_score()
    vorkers_driver.access_review_page(company_id=company_dic["vorkers"])
    dic = vorkers_driver.get_reviews()
    _save_json(dic, RV_SAVE_PATH)

## 外資就活ドットコム

In [None]:
# URL template of a company's selection-report list page; ``%s`` is the company number.
sr_list_page = "https://gaishishukatsu.com/company/%s/study"


class GaishiDriver(object):
    """PhantomJS session that logs in to gaishishukatsu.com and scrapes selection reports.

    Typical use: ``access_sr_list_page()`` -> ``get_sr_urls()`` -> ``get_each_sr()``.
    Constructing an instance starts a browser and logs in immediately, using the
    ``gaishi`` credentials mapping imported from ``config``.
    """

    LOGIN_URL = "https://gaishishukatsu.com/login"
    # Only links starting with this prefix are treated as selection reports.
    SR_URL_PREFIX = "https://gaishishukatsu.com/selection_reports/"
    # Timeout (seconds) used by find_element_* lookups.
    IMPLICIT_WAIT_SECONDS = 1

    def __init__(self,):
        self.driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
        # The implicit wait is a session-wide setting, not a sleep, so it is
        # configured once here instead of before every navigation.
        # NOTE(review): if a pause for page rendering was intended, an explicit
        # wait is needed before reading page_source -- confirm.
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        # Unused legacy attribute, kept so existing readers of it do not break.
        self.es_list_page = None
        # Parsed list page; populated by access_sr_list_page().
        self.sr_list_page = None
        self.login()

    def login(self,):
        """Open the login page, fill in the credentials from ``gaishi`` and submit."""
        self.driver.get(self.LOGIN_URL)
        username_field = self.driver.find_element_by_name("data[GsUser][email]")
        password_field = self.driver.find_element_by_name("data[GsUser][password]")
        username_field.send_keys(gaishi["email"])
        password_field.send_keys(gaishi["password"])
        self.driver.find_element_by_css_selector("#GsUserLoginForm > div._mt-md.content-center-button > p:nth-child(1) > button").click()

    def _load_soup(self, url):
        """Navigate the browser to ``url`` and return the page parsed with lxml."""
        self.driver.get(url)
        source = self.driver.page_source.encode('utf-8')
        return BeautifulSoup(source, "lxml")

    @staticmethod
    def _cell_text(page, css_class):
        """Return the text of the first ``<td>`` with ``css_class``, or "" if absent."""
        cell = page.find("td", class_=css_class)
        return cell.text if cell is not None else ""

    def access_sr_list_page(self, company_number):
        """Load the selection-report list page of a company into ``self.sr_list_page``.

        Args:
            company_number: Value substituted into the ``sr_list_page`` URL template.
        """
        self.sr_list_page = self._load_soup(sr_list_page % company_number)

    def get_sr_urls(self,):
        """Collect selection-report URLs from the page loaded by access_sr_list_page().

        Returns:
            list: ``ng-href`` values of the report-list anchors that start with
            ``SR_URL_PREFIX``, in page order. Anchors without an ``ng-href``
            attribute are skipped.
        """
        anchors = self.sr_list_page.find_all("a", class_="_panel-content report-list")
        # .get() yields None for anchors lacking the attribute instead of raising.
        candidates = (anchor.get("ng-href") for anchor in anchors)
        return [
            url for url in candidates
            if url is not None and str(url).startswith(self.SR_URL_PREFIX)
        ]

    def get_each_sr(self, sr_urls):
        """Visit every report URL and extract its "importance" and "advice" cells.

        Args:
            sr_urls: Iterable of report URLs, e.g. the result of get_sr_urls().

        Returns:
            dict: ``{url: {"importance": str, "advice": str}}``; a field is ""
            when the corresponding table cell is not present on the page.
        """
        srs = {}
        for sr_url in sr_urls:
            sr_page = self._load_soup(sr_url)
            srs[sr_url] = {
                "importance": self._cell_text(sr_page, "impression_importance"),
                "advice": self._cell_text(sr_page, "impression_advice"),
            }
        return srs


## VORKERS

In [None]:
# URL templates; ``%s`` is the Vorkers company id (``m_id``).
top_page = "https://www.vorkers.com/company.php?m_id=%s"
review_page = "https://www.vorkers.com/company_answer.php?m_id=%s&q_no=1"


class VorkersDriver(object):
    """PhantomJS session that logs in to vorkers.com and scrapes scores and reviews.

    Typical use per company: ``access_top_page()`` -> ``get_score()`` ->
    ``access_review_page()`` -> ``get_reviews()``. Results accumulate in
    ``self.dic``. Constructing an instance starts a browser and logs in
    immediately, using the ``vorkers`` credentials mapping imported from ``config``.
    """

    LOGIN_URL = "https://www.vorkers.com/login.php"
    # Timeout (seconds) used by find_element_* lookups.
    IMPLICIT_WAIT_SECONDS = 1

    def __init__(self,):
        self.driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
        # The implicit wait is a session-wide setting, not a sleep, so it is
        # configured once here instead of before every navigation.
        # NOTE(review): if a pause for page rendering was intended, an explicit
        # wait is needed before reading page_source -- confirm.
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        self.top_page = None      # parsed company top page
        self.review_page = None   # most recently parsed review page
        self.dic = {}             # accumulated scrape results
        self.login()

    def login(self,):
        """Open the login page, fill in the credentials from ``vorkers`` and submit."""
        self.driver.get(self.LOGIN_URL)
        username_field = self.driver.find_element_by_name("_username")
        password_field = self.driver.find_element_by_name("_password")
        username_field.send_keys(vorkers["email"])
        password_field.send_keys(vorkers["password"])
        self.driver.find_element_by_css_selector("#log_in").click()

    def _load_soup(self, url):
        """Navigate the browser to ``url`` and return the page parsed with lxml."""
        self.driver.get(url)
        source = self.driver.page_source.encode('utf-8')
        return BeautifulSoup(source, "lxml")

    @staticmethod
    def _extract_reviews(page):
        """Return the whitespace-stripped text of every review answer on ``page``."""
        answers = page.find_all("dd", class_="article_answer")
        return [answer.text.strip() for answer in answers]

    def access_top_page(self, company_id):
        """Load a company's top page into ``self.top_page``.

        Args:
            company_id: Value substituted into the ``top_page`` URL template.
        """
        self.top_page = self._load_soup(top_page % company_id)

    def get_score(self,):
        """Extract scores from the page loaded by access_top_page() into ``self.dic``.

        Stores the radar-chart scores under "radar", the overall rating, and the
        monthly-overtime and paid-leave-usage figures under their Japanese keys.

        Raises:
            ValueError: If the page does not contain exactly two
                overtime/paid-leave cells.
        """
        # Radar-chart scores: one <li> per axis, <dt> = label, <dd> = score.
        score_list = self.top_page.find("ul", class_="scoreList-8")
        self.dic["radar"] = dict(
            (item.find("dt").text, item.find("dd").text)
            for item in score_list.find_all("li")
        )
        # Overall rating.
        self.dic["総合評価"] = self.top_page.find("span", property="v:average").text
        # Monthly overtime hours and paid-leave usage rate, in that order.
        cells = self.top_page.find_all("dd", class_="d-ib t-r w-85")
        if len(cells) != 2:
            raise ValueError(
                "expected exactly 2 overtime/paid-leave cells, found %d" % len(cells)
            )
        zangyou, yuukyuu = cells
        self.dic["残業時間(月間)"] = zangyou.text
        self.dic["有給休暇消化率"] = yuukyuu.text

    def access_review_page(self, company_id):
        """Load a company's first review page into ``self.review_page``.

        Args:
            company_id: Value substituted into the ``review_page`` URL template.
        """
        self.review_page = self._load_soup(review_page % company_id)

    def get_reviews(self,):
        """Collect reviews from the loaded review page and its pagination links.

        ``self.review_page`` ends up pointing at the last page visited, and the
        collected list is stored in ``self.dic["reviews"]``.

        Returns:
            dict: A shallow copy of ``self.dic``. Returning a copy keeps results
            from earlier calls intact when the driver is reused for another
            company.
        """
        review_arr = self._extract_reviews(self.review_page)
        paging_links = self.review_page.find_all("a", class_="paging_link-item")
        visited = set()
        for link in paging_links:
            url = link.get("href")
            # Skip anchors without a target and links already followed, so the
            # same page is never fetched (and its reviews never added) twice.
            # NOTE(review): hrefs are passed to the browser as-is; if the site
            # emits relative links they need resolving first -- confirm.
            if not url or url in visited:
                continue
            visited.add(url)
            self.review_page = self._load_soup(url)
            review_arr += self._extract_reviews(self.review_page)
        self.dic["reviews"] = review_arr
        return dict(self.dic)