## Machine Learning for NLP : Projet 2

Important libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import requests
import time

##### Scrapping on Cloth Shops on Trustpilot

In [2]:
my_urls = ["https://fr.trustpilot.com/categories/clothing_store", "https://fr.trustpilot.com/categories/electronics_technology", "https://fr.trustpilot.com/categories/jewelry_store", "https://fr.trustpilot.com/categories/cosmetics_store"]

Getting all shops

In [3]:
def get_companies_info(my_url):
    comp_info = []

# opening url and grabbing the web page
    uClient = urlopen(my_url)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, 'html.parser')


    companies = page_soup.findAll("div", {'class' : "paper_paper__1PY90 paper_outline__lwsUX card_card__lQWDv card_noPadding__D8PcU styles_wrapper__2JOo2"})
    for element in companies:
        info_dict = {}
        info_dict['name'] = element.find('p').text
        info_dict['href'] = element.find('a').get('href')
        info_dict['type'] = [ele.text for ele in element.findAll('span', {'class' : "typography_body-s__aY15Q typography_appearance-default__AAY17"})]
        comp_info.append(info_dict)
    print(comp_info)

    return comp_info

##### Getting total pages for every shop

In [4]:
def get_pages(comp_info): 
    pages = {}
    types = {}
    for ele in comp_info:
        response = requests.get(f"https://fr.trustpilot.com/{ele['href']}")
        p_soup = soup(response.text, 'html.parser')
        
        pagination_div = p_soup.find("div", class_="styles_pagination__6VmQv")
        pagination_link = pagination_div.find("a", {"name": "pagination-button-last"})
        
        if pagination_link is not None:
            aria_label = pagination_link["aria-label"]
            pages[ele['href']] = int(aria_label[-2::])
            
        
        else:
            pages[ele['href']] = 1

        types[ele['href']] = ele['type']
        #pages["name"] = ele['name']
        #time.sleep(1)

    print(pages)
    return pages, types


##### Getting all the reviews, score and shop's name into a dataframe

In [7]:
def get_reviews(pages):

    reviews_data = []

    base_url = "https://fr.trustpilot.com"
    for endpoint in pages.keys():
        for page in range(1, pages[endpoint] + 1):
            url = f"{base_url}{endpoint}?page={page}"
            response = requests.get(url)

            if response.status_code != 200:
                print(f"Échec du chargement de la page {page}")
                continue

            my_soup = soup(response.content, 'html.parser')


            divs = my_soup.find_all('div', class_='styles_reviewCardInner__EwDq2')

            
            for div in divs:

                name = div.find('span', class_='typography_heading-xxs__QKBS8 typography_appearance-default__AAY17').text
                section = div.find('section', class_='styles_reviewContentwrapper__zH_9M')
                rating = section.find('div', class_='styles_reviewHeader__iU9Px')['data-service-review-rating']
                #review_title = section.find('h2', {'data-service-review-title-typography': True}).text
                text = section.find('p', {'data-service-review-text-typography': True}).text if section.find('p', {'data-service-review-text-typography': True}) else ''
                review_date = section.find('time')['datetime'] if section.find('time') else ''
                experience_date = section.find('p', class_="typography_body-m__xgxZ_ typography_appearance-default__AAY17").get_text(strip=True).split(':')[-1].strip()
                types = pages[endpoint]

                reviews_data.append({'shop': str(endpoint)[8::], 'types': types, 'name': name, 'score': int(rating), 'review': str(text), 'review_date': str(review_date), 'experience_date': str(experience_date)})
            time.sleep(1)  

    return reviews_data

In [6]:
shops = []
scores = []
reviews = []
review_dates = []  
experience_dates = []
names = []
types = []
whole_dict = {}

In [12]:
for url in my_urls:
    comp_info = get_companies_info(url)
    pages, type_list = get_pages(comp_info)
    reviews_data = get_reviews(pages)
    
    for i in range(len(reviews_data)):
        names.append(reviews_data[i]['name'])
        types.append(reviews_data[i]['types'])
        shops.append(reviews_data[i]['shop'])
        scores.append(reviews_data[i]['score'])
        reviews.append(reviews_data[i]['review'])
        review_dates.append(reviews_data[i]['review_date'])
        experience_dates.append(reviews_data[i]['experience_date'])

[{'name': 'Maison de MaMoulia', 'href': '/review/maisondemamoulia.fr', 'type': ['Magasin de vêtements', 'Magasin de vêtements pour enfants', 'Magasin de laine', 'Magasin de vêtements pour femmes', 'Magasin de vêtements pour bébés', 'Magasin de vêtements', 'Magasin de vêtements pour enfants', 'Magasin de laine', 'Magasin de vêtements pour femmes', 'Magasin de vêtements pour bébés']}, {'name': 'Boutique Route du Sud', 'href': '/review/boutiqueroutedusud.com', 'type': ['Magasin de vêtements', 'Magasin de vêtements']}, {'name': 'Clickandrock', 'href': '/review/clickandrock.fr', 'type': ['Magasin de vêtements', 'Magasin de chaussures', 'Magasin de vêtements', 'Magasin de chaussures']}, {'name': 'Fuzz Bayonne', 'href': '/review/fuzz-bayonne.com', 'type': ['Magasin de musique', 'Magasin de guitares', 'Magasin de disques', 'Magasin de vêtements', 'Magasin de musique', 'Magasin de guitares', 'Magasin de disques', 'Magasin de vêtements']}, {'name': 'Kamshoes', 'href': '/review/kamshoes.fr', 'typ

In [14]:
whole_dict = {}
whole_dict['name'] = names
whole_dict['shop'] = shops
whole_dict['score'] = scores
whole_dict['review'] = reviews
whole_dict['review_date'] = review_dates
whole_dict['experience_date'] = experience_dates

In [15]:
data = pd.DataFrame(whole_dict)
data.head()

Unnamed: 0,name,shop,score,review,review_date,experience_date
0,Claudine,maisondemamoulia.fr,5,J'ai commandé une paire de chaussettes en lain...,2024-01-11T12:26:55.000Z,18 décembre 2023
1,adeline barbe,maisondemamoulia.fr,5,La propriétaire du site est une personne très ...,2024-01-09T22:36:48.000Z,02 décembre 2023
2,Audrey L.,maisondemamoulia.fr,5,"Des produits de qualité, un envoi toujours rap...",2024-01-09T20:49:21.000Z,09 janvier 2024
3,Client,maisondemamoulia.fr,5,"Livraison en 2 jours chrono, commande expédiée...",2024-01-08T16:31:37.000Z,23 décembre 2023
4,Alex,maisondemamoulia.fr,5,"Parfait ! Livraison rapide, emballage soigné, ...",2024-01-11T17:33:14.000Z,15 décembre 2023


##### Dataframe statistics

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38293 entries, 0 to 38292
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             38293 non-null  object
 1   shop             38293 non-null  object
 2   score            38293 non-null  int64 
 3   review           38293 non-null  object
 4   review_date      38293 non-null  object
 5   experience_date  38293 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.8+ MB


In [17]:
data['score'].value_counts()

score
5    33701
4     2921
1      726
3      641
2      304
Name: count, dtype: int64

In [18]:
#Save into a csv for ulterior processing
data.to_csv('data.csv', index=False)