### ---------- LinkedIn Scraper Notebook ----------

This notebook scrapes the job offers returned by a personalized search on LinkedIn.

The first part scrapes and saves the job offers; the second part analyzes them with NLP using the NLTK library.

## I. Scraper

In [3]:
from gui import Ui_MainWindow

from lib.driver import Driver
from lib.action import Action
from lib.task import Task
from threading import Thread
import math

import sys
import os
import json

In [4]:
from PyQt5.QtWidgets import (
    QApplication,
    QHBoxLayout,
    QVBoxLayout,
    QPushButton,
    QWidget,
    QScrollArea,
    QLabel,
    QComboBox,
    QLineEdit,
    QTabWidget,
    QStackedLayout,
    QGridLayout,
    QFormLayout)

from PyQt5 import QtCore
from PyQt5 import QtGui
from PyQt5 import QtWidgets
from PyQt5.QtGui import QPixmap

In [5]:
class Dataset():
    """Simple named-variable store shared by the scraper components.

    All values live in the ``variables`` dictionary. Three entries are always
    present after construction: ``'urls'`` (empty list), ``'page_content'``
    (empty list) and ``'urls_index'`` (a list holding ``-1``).
    """

    def __init__(self):
        self.variables = dict()
        self.variables['urls'] = []
        self.variables['page_content'] = []
        self.variables['urls_index'] = [-1]

    def add_inner_variables(self, list_of_variables):
        """Declare every name in ``list_of_variables`` with the value ``None``.

        A name that already exists is reset to ``None``.
        """
        for elem in list_of_variables:
            self.variables[elem] = None

    def get_variable_value(self, var):
        """Return the value stored under ``var``, or ``None`` if it is unknown.

        Only lookup failures are converted to ``None``: a missing key
        (``KeyError``) or an unhashable ``var`` (``TypeError``). Anything else,
        such as ``KeyboardInterrupt``, propagates instead of being swallowed.
        """
        try:
            return self.variables[var]
        except (KeyError, TypeError):
            return None

    def set_variable_value(self, var, value):
        """Store ``value`` under ``var``, creating or overwriting the entry."""
        self.variables[var] = value

In [6]:
class Window(QWidget):
    """Main application window: a left navigation panel next to a stack of pages.

    ``stackedLayout`` holds two pages: index 0 is the parameter page built by
    ``load_ui_parameter`` and index 1 is the scraper page built by
    ``load_ui_scraper``. The two buttons of the left panel switch between them.
    """

    def __init__(self):
        super(Window, self).__init__()
        self.resize(1000, 800)
        self.setWindowTitle('Pimp my Scrap (PMS)')
        self.current_task_index = 0

        # The style definitions are read from a JSON file whose path is relative
        # to the current working directory.
        path_stylesheet = os.path.join('..', '7. Config', 'stylesheets', 'style_window.json')
        with open(path_stylesheet) as f:
            self.stylesheet = json.load(f)

        # This will be the main layout for the window. If a panel has to be changed or added, add it here.
        self.main_two_parts = QHBoxLayout()

        # The left panel will include the 'Scrap' button and the 'Parameter' button
        panel_left_widget = QWidget(self)
        # The layout is deliberately created WITHOUT a parent. Passing the window
        # as parent would install this layout on the window itself, and Qt would
        # then reject both setLayout() calls below (a widget accepts only one
        # layout, and a layout only one parent).
        panel_left_layout = QVBoxLayout()
        # Creates and connects the buttons to change layouts
        button_scrap = QPushButton('Scrap')
        button_scrap.clicked.connect(lambda: self.switch_page(1))
        button_parameter = QPushButton('Parameter')
        button_parameter.clicked.connect(lambda: self.switch_page(0))
        panel_left_layout.addWidget(button_scrap, 1)
        panel_left_layout.addWidget(button_parameter, 1)

        # Adding a stacked layout to display the two pages
        self.stackedLayout = QStackedLayout()

        self.stackedLayout.addWidget(self.load_ui_parameter())
        self.stackedLayout.addWidget(self.load_ui_scraper())

        panel_left_widget.setLayout(panel_left_layout)
        self.setLayout(self.main_two_parts)
        # Stretch factors 1 and 4: the left panel gets the smaller share of the width.
        self.main_two_parts.addWidget(panel_left_widget, 1)
        self.main_two_parts.addLayout(self.stackedLayout, 4)

    def switch_page(self, nb):
        """Show page number ``nb`` of the main stacked layout."""
        self.stackedLayout.setCurrentIndex(nb)

    def change_action(self, position):
        """Move the task stack by ``position`` pages relative to the current one.

        The buttons of the parameter page call this with ``-1`` and ``1``.
        """
        self.stackedActionLayout.setCurrentIndex(self.stackedActionLayout.currentIndex() + position)

    def load_ui_scraper(self):
        """Build and return the scraper page (logo, search fields, 'Search' button).

        The search button is kept as ``self.button_search`` so that callers can
        connect its ``clicked`` signal.
        """
        widget = QWidget()
        main_layout = QVBoxLayout()

        # 1. Create a grid layout to put the logo in the top right corner
        top_widget = QWidget()
        logo_layout = QGridLayout()

        # Loads the image and scales it to a height of 50
        logo = QLabel()
        image_path = os.path.join('..', '6. Images', 'logo_headmind.png')
        pixmap = QPixmap(image_path)
        pixmap_good_size = pixmap.scaledToHeight(50)
        logo.setPixmap(pixmap_good_size)

        # Positioning the logo in the top right corner: the stretchable first
        # column and second row push the logo (row 0, column 1) into the corner.
        logo_layout.addWidget(logo, 0, 1)
        logo_layout.setColumnStretch(0, 1)
        logo_layout.setRowStretch(1, 1)
        top_widget.setLayout(logo_layout)

        # 2. Creating a QVBoxLayout to fill it with the search options
        center_widget = QWidget()
        center_layout = QVBoxLayout()

        center_layout.addWidget(QLabel('Search Parameters'), 1)
        # Placing the two text edit zones
        search_bar = QLineEdit()
        search_bar.setPlaceholderText('Junior data analyst')
        center_layout.addWidget(search_bar, 1)
        pages_bar = QLineEdit()
        pages_bar.setPlaceholderText('Nb of pages')
        center_layout.addWidget(pages_bar, 1)
        center_widget.setLayout(center_layout)

        # 3. Creating the Search button, placed in the middle cell of a 3x3 grid
        # whose corner cells hold empty labels.
        bottom_widget = QWidget()
        bottom_layout = QGridLayout()
        bottom_layout.addWidget(QLabel(), 0, 0)
        bottom_layout.addWidget(QLabel(), 2, 2)
        self.button_search = QPushButton('Search')
        self.button_search.setObjectName('button_search')
        bottom_layout.addWidget(self.button_search, 1, 1)

        bottom_widget.setLayout(bottom_layout)

        main_layout.addWidget(top_widget)
        main_layout.addWidget(center_widget)
        main_layout.addWidget(bottom_widget)
        widget.setLayout(main_layout)

        return widget

    def load_ui_parameter(self):
        """Build and return the parameter page.

        The page has a row with the 'Previous task' / 'Next task' buttons above
        an initially empty stacked layout, kept as ``self.stackedActionLayout``.
        The page widget itself is kept as ``self.widget_parameter``.
        """
        self.widget_parameter = QWidget()
        self.widget_parameter.setObjectName('widget_parameter')

        task_layout = QVBoxLayout()

        splited_layout = QHBoxLayout()
        button_widget = QWidget()
        button_previous = QPushButton('Previous task')
        button_previous.setStyleSheet(self.stylesheet['button_style'])
        button_previous.clicked.connect(lambda: self.change_action(-1))
        button_next = QPushButton('Next task')
        button_next.setStyleSheet(self.stylesheet['button_style'])
        button_next.clicked.connect(lambda: self.change_action(1))
        splited_layout.addWidget(button_previous)
        splited_layout.addWidget(button_next)

        button_widget.setLayout(splited_layout)

        # Adding the stacked layout to the window
        self.stackedActionLayout = QStackedLayout()
        self.stackedActionLayout.setObjectName('action_layout')

        bottom_part = QWidget()
        bottom_part.setLayout(self.stackedActionLayout)

        # Stretch factors 1 and 6: the button row stays small above the task pages.
        task_layout.addWidget(button_widget, 1)
        task_layout.addWidget(bottom_part, 6)

        self.widget_parameter.setLayout(task_layout)
        return self.widget_parameter

In [7]:
# This class is the core of this file. It loads the JSON tasks file of the selected website, creates all the
# windows related to it and is the key to access every object. Its methods should be called from the main cell;
# no other object needs to be connected to run the scraper.

# The current implemented methods are:
# execute: runs a specified subpart of the tasks.json file
# draw: displays the GUI

class Program():
    """Load a website's task configuration and run or display it.

    On construction the tasks are read from
    ``../7. Config/appdata/<name>.json`` and the credentials from
    ``../7. Config/login.json``. One ``Task`` is built per entry of the tasks
    file and stored in ``hierarchy``.

    Public methods:
        execute: run every task of the hierarchy, optionally repeating some.
        draw: display the GUI.
    """

    def __init__(self, name):
        self.website_to_display = name
        self.tasks_file = self._browse_file()
        self.logins = self._browse_logins()
        self.hierarchy  = self._generate_hierarchy()
        # Shallow copy: both dictionaries reference the same Task objects.
        self.hierarchy_backup = self.hierarchy.copy()
        self.dataset = Dataset()
        # Filled by execute(); defined here so the attribute always exists.
        self.drivers = []

    def _generate_hierarchy(self):
        """Build the ``{task name: Task}`` mapping from the loaded tasks file.

        Returns an empty dictionary when the tasks file could not be loaded
        (``_browse_file`` returned ``None``) instead of failing on iteration.
        """
        hierarchy = {}
        if not self.tasks_file:
            return hierarchy

        for task_name, task_json in self.tasks_file.items():
            # Creates the new task
            new_task = Task(task_name, task_json)

            # Fill the new task with the inner actions. Tasks whose name contains
            # "Driver" get no actions, and the "multi" entry is a flag, not an action.
            for action_name in new_task.json_elements:
                if "Driver" not in task_name and action_name != "multi":
                    action_json = new_task.json_elements.get(action_name)
                    new_task.add_action(
                                        Action(action_name = action_name,
                                               to_do       = action_json.get('to_do'),
                                               params      = action_json.get('params')
                                              )
                                       )

            hierarchy[task_name] = new_task
        return hierarchy

    def _browse_file(self):
        """Read the tasks of ``website_to_display`` from its JSON file.

        Returns the value stored under the website's name, or ``None`` (after
        printing the error) when the file cannot be read or parsed.
        """
        try:
            file_path = os.path.join('..', '7. Config', 'appdata', self.website_to_display + '.json')

            with open(file_path) as f:
                full_file = json.load(f)
                tasks     = full_file[self.website_to_display]
        except Exception as e:
            print(e)
            print('Error while reading ' + self.website_to_display + '.json')
            tasks = None

        return tasks

    def _browse_logins(self):
        """Read the ``"credentials"`` entry of ``login.json``.

        Returns ``None`` (after printing the error) when the file cannot be
        read or parsed.
        """
        try:
            file_path = os.path.join('..', '7. Config', 'login.json')

            with open(file_path) as f:
                full_file = json.load(f)
                logins     = full_file["credentials"]
        except Exception as e:
            print(e)
            print('Error while reading '  + 'login.json')
            logins = None
        return logins

    def _manage_drivers(self, run_background, t):
        """Create the ``Driver`` object(s) requested by the driver task ``t``.

        Without ``multi`` a single driver using the first stored login is
        appended to ``self.drivers``; with ``multi`` the list is rebuilt with
        one driver per stored login.
        """
        # Driver action: multi = False
        if not self.hierarchy[t].json_elements.get('multi'):
            first_login = next(iter(self.logins))
            self.drivers.append(Driver(first_login["email"],
                                       first_login["password"],
                                       self.dataset,
                                       run_background))
        # Driver action: multi = True
        else:
            # Create a list of Driver objects corresponding to the logins stored. They currently read the
            # logins corresponding to LinkedIn.
            self.drivers = [Driver(login["email"],
                                   login["password"],
                                   self.dataset,
                                   run_background)
                            for login in self.logins]

    def _launch_execution(self, t, driver, repeat = 1):
        """Execute task ``t`` with ``driver``; any exception is printed, not raised."""
        try:
            self.hierarchy[t].execute(driver, repeat)
        except Exception as e:
            print('Error in '+ self.hierarchy[t].name + ': ')
            print(e)

    def _threaded_execution(self, t, repeat = 1):
        """Execute task ``t`` on every driver in its own thread and wait for all of them."""
        threads = []
        for driver in self.drivers:
            thread = Thread(target = self.hierarchy[t].execute,
                            args = (driver, repeat))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

    # Execute the whole program as it is stored in the hierarchy
    def execute(self, repeat_some_tasks = None, run_background = False):
        """Run every task of the hierarchy in order.

        Args:
            repeat_some_tasks: optional ``{task name: repeat}`` mapping. For a
                task without ``multi``, an ``int`` is used as the repeat count
                and a ``str`` names a dataset variable whose length is the
                repeat count. For a ``multi`` task the value is ignored and the
                repeat count is the number of lines of the export file divided
                (rounded down) by the number of drivers.
            run_background: forwarded to every ``Driver`` that is created.
        """
        # None replaces the former mutable default ``{}``.
        if repeat_some_tasks is None:
            repeat_some_tasks = {}

        self.drivers = []
        for t in self.hierarchy:
            # If the task is called Driver, it's about creating one or several drivers for the entire Program
            if "Driver" in t:
                self._manage_drivers(run_background, t)

            is_multi = self.hierarchy[t].json_elements.get('multi')

            # If the user wants to repeat a task, the repeat_some_tasks dictionary will not be empty.
            # If the task is in the dictionary, we have to repeat the task several times.
            if t in repeat_some_tasks:
                # If there is no 'multi' or 'multi' = False in the task
                if not is_multi:
                    repeat = repeat_some_tasks[t]
                    # A string is the name of a dataset variable: repeat once per element of it.
                    if isinstance(repeat, str):
                        repeat = len(self.dataset.variables[repeat])
                    self._launch_execution(t, self.drivers[-1], repeat)
                else:
                    with open(os.path.join('..', '2. Exports', '1. Export scraper', 'elements_extracted.txt')) as file:
                        n = len(file.readlines())
                    # Each driver gets an equal share of the extracted lines (remainder dropped).
                    self._threaded_execution(t, n // len(self.drivers))
            else:
                if not is_multi:
                    self._launch_execution(t, self.drivers[-1])
                else:
                    self._threaded_execution(t)

    def draw(self):
        """Open the GUI: one page per non-driver task, 'Search' bound to ``execute``.

        This call blocks in the Qt event loop and ends with ``sys.exit``.
        """
        app = QApplication(sys.argv)
        window = Window()
        window.button_search.clicked.connect(lambda: self.execute())

        for task in self.hierarchy:
            if 'Driver' not in task:
                # Wrap the layout drawn by the task in a widget so that it can
                # be added as a page of the stacked layout.
                vertical_container = QVBoxLayout()
                task_elem = self.hierarchy[task].draw()
                vertical_container.addLayout(task_elem)
                task_page = QWidget()
                task_page.setLayout(vertical_container)

                window.stackedActionLayout.addWidget(task_page)

        window.show()
        sys.exit(app.exec())

In [8]:
if __name__ == "__main__":
    # Build the program from the 'linkedin_daily' configuration; this reads the
    # tasks and login JSON files but does not start any scraping by itself.
    p = Program('linkedin_daily')
# Usage examples, left commented out (uncomment exactly one to run it):
# - a string value names a dataset variable; for a task without 'multi' the
#   task is repeated once per element of that variable
#   p.execute({'Scrap multi': "urls"})
# - an integer value is used directly as the repeat count for such a task
#   p.execute({'Scrap multi': 6})
# - run every task once
#   p.execute()
# - open the GUI instead of running the tasks directly
#   p.draw()

Starting Load home page...
Success: Load home page
Starting Wait to load login...
Success: Wait to load login
Starting Fill log in...
Success: Fill log in
Starting Fill password...
Success: Fill password
Starting Force sleep...
Success: Force sleep
Starting Click Button login...
Success: Click Button login
Starting Fill search bar...
Success: Fill search bar
Starting Wait to load job button...
Success: Wait to load job button
Starting Force sleep 1...
Success: Force sleep 1
Starting Click on Jobs...
Success: Click on Jobs
Starting Force sleep...
Success: Force sleep
Starting Click on Filter...
Message: no such element: Unable to locate element: {"method":"xpath","selector":"//button[text()[contains(.,'Date Posted')]]"}
  (Session info: chrome=108.0.5359.125)
Stacktrace:
Backtrace:
	Ordinal0 [0x003EACD3+2075859]
	Ordinal0 [0x0037EE61+1633889]
	Ordinal0 [0x0027B7BD+571325]
	Ordinal0 [0x002AAC2F+764975]
	Ordinal0 [0x002AAE1B+765467]
	Ordinal0 [0x002DD0F2+970994]
	Ordinal0 [0x002C7364+8815

In [9]:
# Color palette
# Near black: #0A0F27
# Navy blue: #001952
# Dark blue: #043775
# Medium blue: #12769E
# Off-white: #FCF6F3