In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import csv

from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
import asyncio
import time

SCRAP USER FUNCTION

In [6]:
async def scrap_user(row, proxy):
    """
    Scrap the user profile
    
    Parameters
    ----------
    row: dict
        The row of the dataframe
    proxy: str
        The proxy to use
    """
    
    link = row['link']
    creation_date = row['creation_date']
    profil = row['profil']

    # define custom options for the Selenium driver
    options = Options()

    options.add_argument(f'--proxy-server={proxy}')
    options.add_argument("window-size=800,800")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")

    # create the ChromeDriver instance with custom options
    driver = webdriver.Chrome(options=options)

    wait = WebDriverWait(driver, 5)
    driver.get(link)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    data = {}

    price_element = soup.find('div', {'data-testid': 'profile-price'})
    if price_element:
        price = price_element.find('span', class_='block-list__price').text
        data['price'] = price.strip()
    
    # Récupérer l'expérience
    experience_element = soup.find('span', string='Expérience')
    if experience_element:
        experience = experience_element.find_next('span', class_='profile-indicators-content').text
        data['experience'] = experience.strip()
    
    # Récupérer le taux de réponse
    response_rate_element = soup.find('span', string='Taux de réponse')
    if response_rate_element:
        response_rate = response_rate_element.find_next('span', class_='profile-indicators-content').text
        data['response_rate'] = response_rate.strip()
    
    # Récupérer le temps de réponse
    response_time_element = soup.find('span', string='Temps de réponse')
    if response_time_element:
        response_time = response_time_element.find_next('span', class_='profile-indicators-content').text
        data['response_time'] = response_time.strip()
        
    # Récupérer le nom 
    name_element = soup.find('div', {'data-testid': 'profile-fullname'})
    if name_element:
        name = name_element.text
        data['name'] = name.strip()
        
    # Récupérer le métier
    headline_element = soup.find('div', {'data-testid': 'profile-headline'})
    if headline_element:
        headline = headline_element.text
        data['headline'] = headline.strip()
        
    # Récupérer le nombre de missions
    missions_element = soup.find('div', {'data-testid': 'profile-counter-missions'})
    if missions_element:
        missions = missions_element.find('strong').text
        data['missions'] = missions.strip()
        
    # Récupérer toutes les catégories
    categories_elements = soup.find_all('li', {'class': 'categories__list-item'})
    categories = [category.find('a').text for category in categories_elements]
    data['categories'] = categories 
    
    # Récupérer les compétences
    competences_element = soup.find_all('div', {'class': 'profile-expertises__content-list-item__label'})
    competences = [competence.find('a', class_='joy-link joy-link_teal').text.strip() for competence in competences_element]

    data['competences'] = competences
    
    # Récupérer le statut "Supermalter"
    supermalter_element = soup.find('span', class_='joy-badge-level__tag blue')
    if supermalter_element:
        supermalter = supermalter_element.get_text(strip=True)
        data['supermalter'] = supermalter
        
    # Récupérer la localisation
    location_element = soup.find('dl', {'class': 'profile__location-and-workplace-preferences__item'})
    if location_element:
        location_label = location_element.find('dt', {'data-testid': 'profile-location-address-label'})
        location_value = location_element.find('dd', {'data-testid': 'profile-location-preference-address'})

        if location_label and location_value:
            location = {location_label.text: location_value.text}
            data['location'] = location
            
    # Récupérer la préférence de télétravail
    teletravail_element = soup.find('dl', {'class': 'profile-page-mission-preferences__item'})
    if teletravail_element:
        teletravail_label = teletravail_element.find('dt')
        teletravail_value = teletravail_element.find('dd')

        if teletravail_label and teletravail_value:
            teletravail_preference = {teletravail_label.text: teletravail_value.text}
            data['teletravail_preference'] = teletravail_preference
            
    # Récupérer le nombre de recommandations
    recommendations_element = soup.find('span', {'data-testid': 'profile-counter-recommendations'})
    if recommendations_element:
        recommendations_count = int(recommendations_element.text.split()[0])
        data['recommendations'] = recommendations_count   
        

    # Récupérer le message de présentation
    presentation_element = soup.find('div', {'class': 'profile-description__content'})
    if presentation_element:
        presentation_message = presentation_element.get_text(strip=True)
        data['presentation'] = presentation_message
        
    # add link of the profile
    data['link'] = row['link']
    
    # add created date
    data['creation_date'] = row['creation_date']
    
    # add name to the data
    data['profil'] = row['profil']
          
    driver.quit() # close the browser
    # time.sleep(5) # wait for 5 seconds to avoid getting banned
    
    return data # return the data scrapped

In [7]:
async def scrap_all_users_proxy(df): # links is a list of links to scrap.
    """
    Scrap the profile of every link in the list of links.
    
    Parameters
    ----------
    links : list
        List of links to scrap.
        
    Returns
    -------
    all_data : dataFrame
        DataFrame containing all the data scrapped.
    """
    all_data = [] # list to store all the data scrapped
    
    links = df['link'].tolist()
    creation_date = df['creation_date'].tolist()
    profils = df['profil'].tolist()
    
    # backup_csv(df)
    
    for i in range(len(links)):
        row = {
            'link': links[i],
            'creation_date': creation_date[i],
            'profil': profils[i]
        }
        scrapped_user = await scrap_user(row, proxy_pool[0]) # scrap the user
        print("Scraped ",i," user: ",scrapped_user)
        
        all_data.append(scrapped_user)
        
    return all_data # return the list of data scrapped

RUN THE SCRIPT WITH THE LINKS

In [8]:
profile_links = pd.read_csv('../data/links.csv')
profile_links['profil'] = profile_links['profil'].apply(lambda x: x.replace('https://www.malt.fr/profile/', ''))

# add column link to the DataFrame
profile_links['link'] = profile_links['profil'].apply(lambda x: f'https://www.malt.fr/profile/{x}')

profile_links['scraped'] = False # add column scraped to the DataFrame

# get first 10 rows of the DataFrame
profile_links = profile_links.iloc[:10]

async def main():
    data = await scrap_all_users_proxy(profile_links)
    return data

data = await main()

Scraped  0  user:  {'price': '300\xa0€', 'experience': '15 ans et +', 'response_rate': '100%', 'response_time': '12h', 'name': 'Brice Tillet', 'headline': 'Composer / Sound Designer / Music Producer', 'categories': ['Sound Designer'], 'competences': [], 'location': {'Localisation': 'Paris, France'}, 'teletravail_preference': {'Peut travailler dans vos locaux à': 'Paris et 50km autour'}, 'recommendations': 6, 'presentation': "Bonjour,je suis musicien, compositeur et sound designer depuis 2005.J'aime composer à l'image, travailler les textures sonores, jouer avec le rythme et les silences.N'hésitez pas à me contacter !Brice", 'link': 'https://www.malt.fr/profile/bricetillet', 'creation_date': '2014-01-08', 'profil': 'bricetillet'}
Scraped  1  user:  {'price': '250\xa0€', 'experience': '-', 'response_rate': '100%', 'response_time': 'Quelques jours', 'name': 'Mickael M.', 'headline': 'Graphiste', 'categories': ['Graphiste'], 'competences': [], 'location': {'Localisation': '21000 Dijon, Fra

In [9]:
data

[{'price': '300\xa0€',
  'experience': '15 ans et +',
  'response_rate': '100%',
  'response_time': '12h',
  'name': 'Brice Tillet',
  'headline': 'Composer / Sound Designer / Music Producer',
  'categories': ['Sound Designer'],
  'competences': [],
  'location': {'Localisation': 'Paris, France'},
  'teletravail_preference': {'Peut travailler dans vos locaux à': 'Paris et 50km autour'},
  'recommendations': 6,
  'presentation': "Bonjour,je suis musicien, compositeur et sound designer depuis 2005.J'aime composer à l'image, travailler les textures sonores, jouer avec le rythme et les silences.N'hésitez pas à me contacter !Brice",
  'link': 'https://www.malt.fr/profile/bricetillet',
  'creation_date': '2014-01-08',
  'profil': 'bricetillet'},
 {'price': '250\xa0€',
  'experience': '-',
  'response_rate': '100%',
  'response_time': 'Quelques jours',
  'name': 'Mickael M.',
  'headline': 'Graphiste',
  'categories': ['Graphiste'],
  'competences': [],
  'location': {'Localisation': '21000 D

SAVE THE DATA TO CSV

In [10]:
def save_to_csv(data, csv_filename):
    # Assurez-vous que la liste de données n'est pas vide
    if not data:
        print("Aucune donnée à enregistrer.")
        return

    # Créez une liste de noms de colonnes pour le CSV en incluant les nouveaux éléments
    fieldnames = ['name', 'headline', 'price', 'experience', 'response_rate', 'response_time', 'missions', 'categories', 'competences', 'supermalter', 'location','presentation', 'recommendations', 'teletravail_preference']

    # Ouvrir le fichier CSV en mode écriture
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        # Écrire les en-têtes
        writer.writeheader()

        # Écrire les données
        for entry in data:
            writer.writerow(entry)

    print(f"Données enregistrées dans {csv_filename}")

In [11]:
csv_filename = 'malt_data.csv'
save_to_csv(data, csv_filename)

ValueError: dict contains fields not in fieldnames: 'profil', 'link', 'creation_date'