### ---------- LinkedIn Scraper Notebook ----------

This notebook will help to scrap the job offers for a personalized search on LinkedIn.

The first part is to scrap and save, the second part is to analyze with NLP and the NLTK library

## I. Scraper

In [1]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from gui import Ui_MainWindow
from PyQt5 import QtCore, QtGui, QtWidgets
from threading import Thread
from traceback import print_exc
from datetime import datetime
from random import uniform

import time
import math
import pandas as pd
import numpy as np
import os
import sys
import json
from enum import Enum

In [2]:
from PyQt5.QtWidgets import (
    QApplication,
    QHBoxLayout,
    QVBoxLayout,
    QPushButton,
    QWidget,
    QScrollArea,
    QLabel,
    QComboBox,
    QLineEdit,
    QTabWidget,
)

from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtGui import QPixmap
import sys
import os

In [3]:
from enum import Enum
class Keys_enum(Enum):
    """
    Set of special keys codes.
    """
    NULL = '\ue000'
    CANCEL = '\ue001'  # ^break
    HELP = '\ue002'
    BACKSPACE = '\ue003'
    BACK_SPACE = BACKSPACE
    TAB = '\ue004'
    CLEAR = '\ue005'
    RETURN = '\ue006'
    ENTER = '\ue007'
    SHIFT = '\ue008'
    LEFT_SHIFT = SHIFT
    CONTROL = '\ue009'
    LEFT_CONTROL = CONTROL
    ALT = '\ue00a'
    LEFT_ALT = ALT
    PAUSE = '\ue00b'
    ESCAPE = '\ue00c'
    SPACE = '\ue00d'
    PAGE_UP = '\ue00e'
    PAGE_DOWN = '\ue00f'
    END = '\ue010'
    HOME = '\ue011'
    LEFT = '\ue012'
    ARROW_LEFT = LEFT
    UP = '\ue013'
    ARROW_UP = UP
    RIGHT = '\ue014'
    ARROW_RIGHT = RIGHT
    DOWN = '\ue015'
    ARROW_DOWN = DOWN
    INSERT = '\ue016'
    DELETE = '\ue017'
    SEMICOLON = '\ue018'
    EQUALS = '\ue019'

    NUMPAD0 = '\ue01a'  # number pad keys
    NUMPAD1 = '\ue01b'
    NUMPAD2 = '\ue01c'
    NUMPAD3 = '\ue01d'
    NUMPAD4 = '\ue01e'
    NUMPAD5 = '\ue01f'
    NUMPAD6 = '\ue020'
    NUMPAD7 = '\ue021'
    NUMPAD8 = '\ue022'
    NUMPAD9 = '\ue023'
    MULTIPLY = '\ue024'
    ADD = '\ue025'
    SEPARATOR = '\ue026'
    SUBTRACT = '\ue027'
    DECIMAL = '\ue028'
    DIVIDE = '\ue029'

    F1 = '\ue031'  # function  keys
    F2 = '\ue032'
    F3 = '\ue033'
    F4 = '\ue034'
    F5 = '\ue035'
    F6 = '\ue036'
    F7 = '\ue037'
    F8 = '\ue038'
    F9 = '\ue039'
    F10 = '\ue03a'
    F11 = '\ue03b'
    F12 = '\ue03c'

    META = '\ue03d'
    COMMAND = '\ue03d'

In [4]:
WAIT_XS = 0.05
WAIT_S = 0.5
WAIT_M = 4
WAIT_L = 10

In [5]:
class Driver(webdriver.Chrome):

    """
    ------------- Aim --------------
    Initialize the driver

    ---------- Parameters ----------
    (TYPE)       | NAME           | DESCRIPTION
    (Bool)       | run_background | Decides if the driver is executed on background
    """
    def __init__(self, run_background = False):

        # Initialize the ChromeDriver
        DRIVER_PATH = os.path.join('..', '3. Driver', 'chromedriver.exe')
        options = webdriver.ChromeOptions()
        # Make the window fullscreen at the start of the driver
        options.add_argument("--start-maximized")
        if run_background:
            # Run on background (less ressources used)
            options.add_argument("--headless")
        # Webdriver.Chrome constructor
        super().__init__(options = options, executable_path = DRIVER_PATH)
        
        # Get all the json file elements
        self.my_by_dict = {'NAME': By.NAME, 'CLASS': By.CLASS_NAME, 'ID': By.ID, 'XPATH': By.XPATH, 'CLASS_NAME': By.CLASS_NAME}
        self._load_json_elements()

        # Map the name called and the functions
        self.function_dict = {'Load': self.load_url, 'Fill': self.fill, 'Click': self.clickButton, 'Wait': self.wait, 
            'Get': self.get_page_contents, 'Scrap': self.scrap_element_on_page, 'Refresh': self.refresh_page, 
            'Export': self.export_data, 'Sleep': self.force_sleep, 'Scroll': self._scroll_on_page}

        # Initialize usefull variables
        self.attributes_scraped = set()
        self.page_content = []

    """
    ------------- Aim --------------
    Driver loads a given url.

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | url           | Url to be loaded
    """
    def load_url(self, url):
        self.get(url)

    """
    ------------- Aim --------------
    Load page elements from login.json
    """
    def _load_json_elements(self):
        f = open(os.path.join('..', '7. Config', 'login.json'))
        self.page_elements = json.load(f)['page_elements']
        f.close()

    """
    ------------- Aim --------------
    Load login credentials from a json file

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | path          | Name of the file to load
    ------------ Output ------------
    (TYPE)       | NAME        | DESCRIPTION
    (dict)       | credentials | Dict of loaded credentials
    """
    # TODO a refaire
    def _load_json_credentials(self, path):
        f = open(os.path.join('..', '7. Config', path))
        credentials = json.load(f)['credentials']
        f.close()    
        return credentials   


    """
    ------------- Aim --------------
    Maps a given action to the corresponding function and executes it with given arguments

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | action        | Action to execute
    (dict)       | args          | Arguments for the execution
    """
    def do(self, action, args):
        self.function_dict[action](**args)

    """
    ------------- Aim --------------
    Returns random value for timeout

    ------------ Output ------------
    (TYPE)       | NAME   | DESCRIPTION
    (int)        |        | Random int
    """
    def _timeout(self):
        return uniform(4, 6)

    """
    ------------- Aim --------------
    Waits for a given duration

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (int)        | duration      | Duration to wait
    """
    def force_sleep(self, duration):
        time.sleep(duration)

 
    """
    ------------- Aim --------------
    Wait for an element to be loaded on the page

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | element_name  | Name of the element to be waited
    """   
    def wait(self, element_name):
        # get the enum and tag for the element
        enum, tag = self._send_elements_info(element_name)
        try:
            element_present = EC.presence_of_element_located((enum, tag))
            WebDriverWait(self, self._timeout()).until(element_present)
        except TimeoutException:
            pass

    """
    ------------- Aim --------------
    Refreshes the current page and waits for a given element to be loaded

    ---------- Parameters ----------
    (TYPE)       | NAME             | DESCRIPTION
    (str)        | wait_for_element | Element to wait for after refreshing the page
    """
    def refresh_page(self, wait_for_element):
        self.refresh()
        self.wait(wait_for_element)

    """
    ------------- Aim --------------
    Fills a specific field with given keys

    ---------- Parameters ----------
    (TYPE)          | NAME          | DESCRIPTION
    (str)           | element_name  | Name of the field element to fill
    (str) or (lsit) | key           | Key or list of keys to be sent to the field
    """
    # TODO a refaire
    def fill(self, element_name, key):
        # get the enum and tag for the element
        enum, tag = self._send_elements_info(element_name)
        page_line_edit = self.find_element(enum, tag)
        # Clear the field (sometimes text is already input)
        page_line_edit.clear()
        # Verify if the input is a list
        if isinstance(key, list):
            for k in key:
                # Verify if the key is a specific key_enum or text
                if "Keys_enum" in k:
                    page_line_edit.send_keys(Keys_enum[k.split(".")[1]].value)
                else:
                    page_line_edit.send_keys(k)
        else:
            if "Keys_enum" in key:
                page_line_edit.send_keys(Keys_enum[key.split(".")[1]].value)
            else:
                page_line_edit.send_keys(key)

    """
    ------------- Aim --------------
    Clicks on a given button and waits for an element to be loaded on the page if specified

    ---------- Parameters ----------
    (TYPE)       | NAME             | DESCRIPTION
    (str)        | element_name     | Name of the button element to be clicked
    (str)        | wait_for_element | Element to wait for if specified
    """    
    def clickButton(self, element_name, wait_for_element = ''):
        # get the enum and tag for the button
        enum, tag = self._send_elements_info(element_name)
        self.find_element(enum, tag).click()
        time.sleep(2)
        if wait_for_element != '':
            self.wait(wait_for_element)
    
    """
    ------------- Aim --------------
    Export data to a given file. If data is not specified, exports the scraped page content

    ---------- Parameters ----------
    (TYPE)       | NAME      | DESCRIPTION
    (str)        | path      | Name of the export file
    (list)       | data      | Data to be exported
    """
    def export_data(self, path, data = None):
        if data is not None:
            with open(path, 'a+') as file:
                for row in data:
                    file.write(str(row) + '\n')
        else:
            pd.DataFrame(self.page_content, columns = ["names", "jobs", "location", "urls"]).to_excel(os.path.join('..', '2. Exports', '1. Export scraper',
                path + '.xlsx'), encoding = "utf-8", index = False, columns = ["names", "jobs", "location", "urls"])
  
    # """
    # ------------- Aim --------------
    # Use the user credentials to connect to LinkedIn.

    # ---------- Parameters ----------
    # (TYPE)       | NAME          | DESCRIPTION
    # (webdriver)  | driver        | Chrome driver object
    # (str)        | email_user    | User's email to connect to LinkedIn
    # (str)        | password_user | User's password to connect to LinkedIn

    # ------------ Output ------------
    # (TYPE)       | NAME   | DESCRIPTION
    # (webdriver)  | driver | The Chrome webdriver object logged on LinkedIn.
    # """
    def scrap_element_on_page(self, element_name, attribute_to_get = '', page_locator = '', page_button = '', page_limit = -1):
        enum, tag = self._send_elements_info(element_name)
        page_enum, page_tag = self._send_elements_info(page_locator)
        
        # number of pages detected to scrap
        number_of_pages = len(self.find_elements(page_enum, page_tag))
        print(number_of_pages)

        # Max number of pages to scrap if page limit is not specified
        if page_limit == -1:
            page_limit = 1000000

        # TODO
        number_of_pages = 3
        for current_page in range(number_of_pages):
            
            # Scroll to the first element on the list of elements to scrap
            els = self.find_elements(enum, tag)
            self.execute_script("arguments[0].scrollIntoView();", els[0])
            time.sleep(WAIT_S)
            current_view = 0

            # Scroll down on the page to let every element appear
            while current_view != len(els):
                els = self.find_elements(enum, tag)
                self.execute_script("arguments[0].scrollIntoView();", els[current_view])
                current_view += 1
                time.sleep(WAIT_XS)

            # Scrap all the wanted elements on the page
            element_scraped = self.find_elements(enum, tag)

            # Extract the attribute from the job offer
            for sub_part in element_scraped:
                attribute_data = sub_part.get_attribute(attribute_to_get)
                self.attributes_scraped.add(attribute_data)

            # Export data page by page
            self.export_data(os.path.join('..', '2. Exports', '1. Export scraper', 'elements_extracted.txt'), list(self.attributes_scraped))
            self.attributes_scraped.clear()
            
            # Verify if page limit is reached
            if current_page > page_limit - 1:
                break
            else:
                print("Trying page button")
                next_page_enum, next_page_tag = self._send_elements_info(page_button)
                
                if next_page_enum == By.XPATH:
                    
                    res = next_page_tag.split('|')
                    new_tag = "//" + res[0] + "[contains(" + res[1] + ", '" + res[2] + str(current_page+2) + "')]"
                    print(new_tag)
                else:
                    new_tag = tag
                print('New tag = ', new_tag)
                self.find_element(next_page_enum, new_tag).click()

    """
    ------------- Aim --------------
    Scraps the specified elements on the current page

    ---------- Parameters ----------
    (TYPE)          | NAME          | DESCRIPTION
    (str) or (list) | element_name  | Name or list of names of elements to be scraped on current page
    """
    def get_page_contents(self, element_name, attribute):
        current_page_element = []
        # Verify if element_name is a list
        if isinstance(element_name, list):  
            for element_on_page in element_name:
                enum, tag = self._send_elements_info(element_on_page)
                try:
                    found_element = self.find_elements(enum, tag)
                    found_elements_text = []
                    for elem in found_element:
                        found_elements_text.append(elem.get_attribute(attribute))
                    if found_elements_text:                       
                        concat_elem = " |_/- ".join(found_elements_text)
                        current_page_element.append(concat_elem)
                    else: 
                        current_page_element.append("-")

                except:
                    current_page_element.append("-")
        else:
            enum, tag = self._send_elements_info(element_on_page)
            try:
                current_page_element.append(self.find_element(enum, tag))
            except:
                current_page_element.append("-")
        current_page_element.append(self.current_url)
        self.page_content.append(current_page_element)

    """
    ------------- Aim --------------
    Not implemented yet
    """
    def _scroll_on_page(self, element_name):
        # Scroll to the first element on the list of elements to scrap
        enum, tag = self._send_elements_info(element_name)
        els = self.find_elements(enum, tag)
        self.execute_script("arguments[0].scrollIntoView();", els[0])
        time.sleep(WAIT_S)
        current_view = 0

        # Scroll down on the page to let every element appear
        while current_view != len(els):
            els = self.find_elements(enum, tag)
            self.execute_script("arguments[0].scrollIntoView();", els[current_view])
            current_view += 1
            time.sleep(WAIT_XS)

    """
    ------------- Aim --------------
    Maps the name of a given element to its tag and 'By' enumeration

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | element_name  | Name of the element to be mapped

    ------------ Output ------------
    (TYPE)       | NAME   | DESCRIPTION
    (enum)       | enum   | 'By' enumeration of the given element
    (str)        | tag    | HTML tag of the given element
    """
    def _send_elements_info(self, element_name):
        tag  = self.page_elements[element_name]['tag']
        enum = self.my_by_dict[self.page_elements[element_name]['enum']]
        return enum, tag

In [6]:
class Action():

    """
    ------------- Aim --------------
    Initialize the action

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | action_name   | Name of the action
    (str)        | to_do         | Action to execute
    (dict)       | params        | Dict of params for execution
    """
    def __init__(self, action_name, to_do, params):
        self.name   = action_name
        self.to_do  = to_do
        self.params = params
        self.possible_actions = ['Fill', 'Click', 'Wait', 'Load', 'Sleep']
        self.done = False

    """
    ------------- Aim --------------
    Execute itself through a driver

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (webdriver)  | driver        | Driver that will execute the action
    """
    def do(self, driver):
        print('Starting ' + self.name + '...')
        try:
            driver.do(self.to_do, self.params)
            self.done = True
            print('Success: ' + self.name)
        except Exception as e:
            print("Error in " + self.name)
            print(e)

    # """
    # ------------- Aim --------------
    # Maps the name of a given element to its tag and 'By' enumeration

    # ---------- Parameters ----------
    # (TYPE)       | NAME          | DESCRIPTION
    # (str)        | element_name  | Name of the element to be mapped

    # ------------ Output ------------
    # (TYPE)       | NAME   | DESCRIPTION
    # (enum)       | enum   | 'By' enumeration of the given element
    # (str)        | tag    | HTML tag of the given element
    # """
    def draw(self, task_container):
        # Create a Horizontal Box Layout to contain the action info
        action_container = QHBoxLayout()

        # Creating a container for the to_do and fill it with the current value of the Action
        to_do_container = QHBoxLayout()
        to_do_label = QLabel('Action:')
        to_do_combobox = QComboBox()
        to_do_combobox.addItems(self.possible_actions)
        to_do_combobox.setCurrentText(self.to_do)
        to_do_container.addWidget(to_do_label, 1)
        to_do_container.addWidget(to_do_combobox, 1)
        to_do_container.setObjectName('to_do_container')
        action_container.addLayout(to_do_container)

        # Creating a container for the page element concerned by the action
        try:
            field_container = QHBoxLayout()
            field_label = QLabel('Field name:')
            field_line_edit = QLineEdit()
            field_line_edit.setText(self.params['element_name'])
            field_container.addWidget(field_label, 1)
            field_container.addWidget(field_line_edit, 1)
            field_container.setObjectName('field_container')
            action_container.addLayout(field_container)
        except:
            print('Error element name for: ' + self.name)

        # Creating a container for the optional element concerned by the action
        for param in self.params:
            if param == 'element_name':
                continue
            other_container = QHBoxLayout()
            other_container.addWidget(QLabel(param), 1)
            line_edit_object = QLineEdit()
            line_edit_object.setText(str(self.params[param]))
            line_edit_object.setObjectName('line_edit_' + param)
            other_container.addWidget(line_edit_object, 1)

            action_container.addLayout(other_container)

        print('Object ' + self.name + ' drawn')

        return action_container


In [7]:

class Task():
    """
    ------------- Aim --------------
    Initialize the task, which is a list of actions to be executed

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | task_name     | Name of the task
    (webdriver)  | driver        | Driver that will execute the actions
    """
    def __init__(self, task_name, driver):
        self.name = task_name
        self.driver = driver
        self.actions = []
        self.json_elements = self._load_json_elements(self.name)
        self.done = False

    """
    ------------- Aim --------------
    Adds an action or a list of actions to the list of actions to execute

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (action)     | action        | Action or list of actions to be added
    """
    def add_action(self, action):
        if isinstance(action, list):
            self.actions += action
        else:
            self.actions.append(action)

    """
    ------------- Aim --------------
    Execute through a driver the actions in the list of actions to execute
    """
    def execute(self):
        for action in self.actions:
            action.do(self.driver)

    """
    ------------- Aim --------------
    Load the json elements of the actions in a given task

    ---------- Parameters ----------
    (TYPE)       | NAME          | DESCRIPTION
    (str)        | task_name     | Name of the task elements to load
    ------------ Output ------------
    (TYPE)       | NAME   | DESCRIPTION
    (dict)       |        | Dict of the elements loaded
    """
    def _load_json_elements(self, task_name):
        f = open(os.path.join('..', '7. Config', 'tasks.json'))
        return json.load(f)['LinkedIn'][task_name]

    def draw(self):
        task_container = QVBoxLayout()

        # Add a label to the task_container
        task_label = QLabel(self.name)
        task_label.setObjectName('task_label')
        task_container.addWidget(task_label, 1)

        # Add tasks to a vertical box stored itself in a scrollarea
        action_scroll_area = QScrollArea()
        vbox               = QVBoxLayout()
        widget             = QWidget()
        for action in self.actions:
            vbox.addLayout(action.draw())
        
        widget.setLayout(vbox)
        action_scroll_area.setWidget(widget)
        action_scroll_area.setVerticalScrollBarPolicy(QtCore.Qt.ScrollBarAlwaysOn)
        action_scroll_area.setHorizontalScrollBarPolicy(QtCore.Qt.ScrollBarAlwaysOn)
        action_scroll_area.setWidgetResizable(True)

        task_container.addWidget(action_scroll_area, 4)

        return task_container


In [8]:
class Website():
    def __init__(self, website_name):
        self.name = website_name
        self.tasks = []
        self.json_elements =  self._load_json_elements()
        self.done = False

    def add_task(self, task):
        if isinstance(task, list):
            self.tasks += task
        else:
            self.tasks.append(task)

    def execute(self):
        for task in self.tasks:
            task.execute()
            self.done = True

    def _load_json_elements(self):
        f = open(os.path.join('..', '7. Config', 'tasks.json'))
        return json.load(f)[self.name]

    def draw(self, center_layout):
        layout = QVBoxLayout()
        tabs = QTabWidget()
        tabs.addTab(self.generate_visual(), self.name)
        layout.addWidget(tabs)
        center_layout.setLayout(layout)
        
    def generate_visual(self):
        widget       = QWidget()
        center       = QScrollArea()
        vbox         = QVBoxLayout()
        
        for task in self.tasks:
            vertical_container = QVBoxLayout()
            task_elem = task.draw()
            vertical_container.addLayout(task_elem)

            vbox.addLayout(vertical_container)

        widget.setLayout(vbox)
        center.setWidget(widget)

        # Scroll Area Properties
        center.setVerticalScrollBarPolicy(QtCore.Qt.ScrollBarAlwaysOn)
        center.setHorizontalScrollBarPolicy(QtCore.Qt.ScrollBarAlwaysOn)
        center.setWidgetResizable(True)

        return center

In [9]:
f = open(os.path.join('..', '7. Config', 'login.json'))
logins = json.load(f)
f.close()
n = len(logins["credentials"])
threads = [None] * n
# drivers = [Driver()] * n

In [10]:
d = Driver()
t = Task("Connect", d)
for elem in t.json_elements:
    t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = t.json_elements.get(elem).get('params')))
t.execute()
t = Task("People Search", d)
for elem in t.json_elements:
    t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = t.json_elements.get(elem).get('params')))
t.execute()
for i in range(50):
    t = Task("Scrap People", d)
    for elem in t.json_elements:
        t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = t.json_elements.get(elem).get('params')))
    t.execute()
d.close()

t = Task("Export data", d)
for elem in t.json_elements:
    t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = t.json_elements.get(elem).get('params')))
t.execute()

# links_to_scrap = set()
# with open(os.path.join('..', '2. Exports', '1. Export scraper', 'elements_extracted.txt')) as file:
#     for line in file:
#         links_to_scrap.add(line)
# links_to_scrap = list(links_to_scrap)

# d = Driver()

# t = Task("Connect multi", d)
# for elem in t.json_elements:
#     t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = t.json_elements.get(elem).get('params')))
# t.execute()


# for link in links_to_scrap:
#     t = Task("Load url", d)
#     for elem in t.json_elements:
#         t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = {"url" : link}))
#     t.execute()
#     t = Task("Scrap multi", d)
#     for elem in t.json_elements:
#         t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = t.json_elements.get(elem).get('params')))
#     t.execute()
#     time.sleep(5)
# t = Task("Export data", d)
# for elem in t.json_elements:
#     t.add_action(Action(action_name = elem, to_do = t.json_elements.get(elem).get('to_do'), params = t.json_elements.get(elem).get('params')))
# t.execute()

# d.close()



Starting Load home page...
Success: Load home page
Starting Wait to load login...
Success: Wait to load login
Starting Fill log in...
Success: Fill log in
Starting Fill password...
Success: Fill password
Starting Force sleep...
Success: Force sleep
Starting Click Button login...
Success: Click Button login
Starting Fill search bar...
Success: Fill search bar
Starting Wait to load people button...
Success: Wait to load people button
Starting Click on People...
Success: Click on People
Starting Force to sleep &...
Success: Force to sleep &
Starting Click on Filter...
Success: Click on Filter
Starting Click on Belgium...
Success: Click on Belgium
Starting Force to sleep...
Success: Force to sleep
Starting Click on Validate...
Success: Click on Validate
Starting Scroll...
Success: Scroll
Starting Click on Next...
Success: Click on Next
Starting Force to sleep...
Success: Force to sleep
Starting Get page contents...
Success: Get page contents
Starting Get page urls...
Success: Get page urls

In [None]:
website = Website("LinkedIn")
driver = Driver()

for w_elem in website.json_elements:
    t = Task(w_elem, driver)
    for t_elem in t.json_elements:
        t.add_action(Action(action_name = t_elem, to_do = t.json_elements.get(t_elem).get('to_do'), params = t.json_elements.get(t_elem).get('params')))
    website.add_task(t)
# website.execute()

In [None]:
class Window(QWidget):
    def __init__(self):
        super(Window, self).__init__()
        self.setWindowTitle('Pimp my Scrap (PMS)')

In [None]:
if __name__ == "__main__":
    app = QApplication(sys.argv)
    window = Window()
    website.draw(window)
    window.show()
    sys.exit(app.exec())

In [12]:
df = pd.read_excel(os.path.join('..', '2. Exports', '1. Export scraper','teeest.xlsx'))
df.head()

Unnamed: 0,names,jobs,location,urls
0,Manaëlle Perchet\nView Manaëlle Perchet’s prof...,Head of Impact 🌍I WeImpact Initiator I Leader ...,"Brussels Region, Belgium |_/- Brussels Metropo...",https://www.linkedin.com/search/results/people...
1,https://www.linkedin.com/in/manaelleperchet |_...,https://www.linkedin.com/search/results/people...,,
2,Isabelle de Cambry\nView Isabelle de Cambry’s ...,Chief Sustainability Officer at Silox Group |_...,Brussels Metropolitan Area |_/- Brussels |_/- ...,https://www.linkedin.com/search/results/people...
3,https://www.linkedin.com/in/isabelledecambry |...,https://www.linkedin.com/search/results/people...,,
4,Dominique Debecker\nView Dominique Debecker’s ...,Deputy Chief Sustainability Officer |_/- Helpi...,Brussels Metropolitan Area |_/- Brussels |_/- ...,https://www.linkedin.com/search/results/people...


In [26]:
separator = " |_/- "
new_df = []
rows_to_add = []

for index, row in df.iterrows():
    if (index % 2) == 0:
        for i in range(len(row['names'].split(separator))):
            row_to_add =[]
            row_to_add.append(row['names'].split(separator)[i].split("View")[0])
            row_to_add.append(row['jobs'].split(separator)[i])
            row_to_add.append(row['location'].split(separator)[i])
            rows_to_add.append(row_to_add)
        for r in rows_to_add:
            new_df.append(r)
    else:
        for i in range(len(row['names'].split(separator))):
            rows_to_add[i].append(row['names'].split(separator)[i])
        rows_to_add = []

print(new_df[0])
print(new_df[1])

pd.DataFrame(new_df, columns = ["names", "jobs", "location", "urls"]).to_excel(os.path.join('..', '2. Exports', '1. Export scraper','csr.xlsx'), encoding = "utf-8", index = False, columns = ["names", "jobs", "location", "urls"])


['Manaëlle Perchet\n', 'Head of Impact 🌍I WeImpact Initiator I Leader RSE I Think Tank Advisor I Agile & Digital Sustainability I Lauréate HUB35 2022', 'Brussels Region, Belgium', 'https://www.linkedin.com/in/manaelleperchet']
['Philippe Meunier\n', 'Senior Sustainability Manager', 'Brussels Metropolitan Area', 'https://www.linkedin.com/in/philippe-meunier-00aa3728']
