In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import dateparser
import re


def extract_reviews(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    data = []
    
    pagination = soup.find("div", class_="styles_pagination__6VmQv")
    last_page = pagination.find_all("a")[-2].text
    last_page = pd.to_numeric(last_page)
    
    for i in range(1, last_page + 1):
        page_url = url + "?page=" + str(i)
        page_response = requests.get(page_url)
        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        
        reviews = page_soup.find_all("div", class_="styles_cardWrapper__LcCPA styles_show__HUXRb styles_reviewCard__9HxJJ")
        
        for review in reviews:
            review_data = {}
            
            review_data["Company"] = soup.find("h1", class_="typography_default__hIMlQ typography_appearance-default__AAY17 title_title__i9V__").span.get_text(strip=True)
            review_data["Customer"] = review.find("span", class_="typography_heading-xxs__QKBS8 typography_appearance-default__AAY17").text
            review_data["Number_review"] = review.find("div", class_="styles_consumerExtraDetails__fxS4S").find("span").text
            review_data["Language"] = review.find("div", class_="typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_detailsIcon__Fo_ua").text
            review_data["Title"] = review.find("h2", class_="typography_heading-s__f7029 typography_appearance-default__AAY17").text
            review_data["Date_review"] = review.find("div", class_="styles_reviewHeader__iU9Px").time.text
            reply = review.find("div").find("p", class_="typography_body-m__xgxZ_ typography_appearance-default__AAY17 styles_message__shHhX")
            review_data['Reply'] = reply.text if reply else None
            review_data['Date_reply'] = review.find("div", class_="styles_content__Hl2Mi").time.text if reply else None
            review_data["Rating"] = review.find("section", class_="styles_reviewContentwrapper__zH_9M").div["data-service-review-rating"]
            type_element = review.find_next("div", class_="typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_detailsIcon__yqwWi")
            review_data["Status"] = type_element.find("span").text if type_element else None
            review_data["Experience Date"] = review.find("div", class_="styles_reviewContent__0Q2Tg").text
            
            data.append(review_data)
    
    df = pd.DataFrame(data)
    return df

younited_url = "https://www.trustpilot.com/review/www.younited-credit.com"
df_younited = extract_reviews(younited_url)
df_younited.to_csv("../data/raw/younited.csv")
cofidis_url = "https://www.trustpilot.com/review/www.cofidis.fr"
df_cofidis = extract_reviews(cofidis_url)
df_cofidis.to_csv("../data/raw/cofidis.csv")
floa_url = "https://www.trustpilot.com/review/www.floabank.fr"
df_floa = extract_reviews(floa_url)
df_floa.to_csv("../data/raw/floa.csv")
orange_url = "https://www.trustpilot.com/review/www.orangebank.fr"
df_orange = extract_reviews(orange_url)
df_orange.to_csv("../data/raw/orange.csv")
bourso_url = "https://www.trustpilot.com/review/boursorama-banque.com"
df_bourso = extract_reviews(bourso_url)
df_bourso.to_csv("../data/raw/bourso.csv")
anytime_url = "https://www.trustpilot.com/review/anyti.me"
df_anytime = extract_reviews(anytime_url)
df_anytime.to_csv("../data/raw/anytime.csv")
df_all = pd.concat([df_younited, df_cofidis, df_orange, df_floa, df_bourso, df_anytime], axis=0)

df_all = df_all.reset_index(drop=True)
df_all[["Number_review", "review"]]= df_all["Number_review"].str.split(" ", n = 1, expand = True)
df_all[["ExperienceDate", "Date_experience"]]= df_all["Experience Date"].str.split(":", n = 1, expand = True)
df_all[["Experience", "Date"]]= df_all["Experience Date"].str.split("Date", n = 1, expand = True)
new_df = df_all[["Company", "Customer", "Number_review", "Language", "Title", "Date_review", "Reply", "Date_reply", "Rating", "Status", "Experience", "Date_experience"]]
new_df.groupby(by="Status").describe()
new_df[["Status"]] = new_df[["Status"]].fillna(method="ffill")
new_df[["Number_review", "Rating"]] = new_df[["Number_review", "Rating"]].astype("int64")
# Convertir les colonnes "Date_review", "Date_reply" et "Date_experience" en dates
new_df["Date_reply"] = new_df["Date_reply"].fillna(new_df["Date_review"])
new_df['Date_review'] = new_df['Date_review'].apply(lambda x: dateparser.parse(x).strftime('%Y-%m-%d') if isinstance(x, str) and dateparser.parse(x) is not None else x)
new_df[new_df['Date_review'].str.contains("Updated")]
# Create a regular expression to match the date format
date_regex = re.compile(r'^\d{4}-\d{2}-\d{2}$')

# Identify the invalid dates
invalid_dates = []
for date in new_df['Date_review']:
    if not date_regex.match(date):
        invalid_dates.append(date)

# Print the number of invalid dates
print(len(invalid_dates))
min_date = min(new_df['Date_review'])
new_df['Date_review'] = new_df['Date_review'].apply(lambda x: min_date if not date_regex.match(x) else x)
#Convertir la colonne "Date_reply" en dates
new_df["Date_reply"] = new_df["Date_reply"].apply(lambda x: dateparser.parse(x) if isinstance(x, str) else x)
#Remplacer les valeurs manquantes de "Date_reply" par les valeurs de "Date_review"
new_df["Date_reply"] = new_df.apply(lambda row: row["Date_review"] if pd.isna(row["Date_reply"]) else row["Date_reply"], axis=1)
#Convertir la colonne "Date_reply" en dates
new_df["Date_experience"] = new_df["Date_experience"].apply(lambda x: dateparser.parse(x) if isinstance(x, str) else x)
#Remplacer les valeurs manquantes de "Date_reply" par les valeurs de "Date_review"
new_df["Date_experience"] = new_df.apply(lambda row: row["Date_experience"] if pd.isna(row["Date_experience"]) else row["Date_experience"], axis=1)
#Convertir la colonne "Date_reply" en dates
new_df["Date_review"] = new_df["Date_review"].apply(lambda x: dateparser.parse(x) if isinstance(x, str) else x)
#Remplacer les valeurs manquantes de "Date_reply" par les valeurs de "Date_review"
new_df["Date_review"] = new_df.apply(lambda row: row["Date_review"] if pd.isna(row["Date_review"]) else row["Date_review"], axis=1)

# Fonction de conversion personnalisée
def convert_review_date(date_str):
    try:
        parsed_date = dateparser.parse(date_str)
        if parsed_date:
            return parsed_date.strftime('%Y-%m-%d')
    except Exception:
        pass
    return date_str

# Appliquer la fonction de conversion à la colonne Date_review
new_df['Date_review'] = new_df['Date_review'].apply(convert_review_date)
#new_df["Response_time"] = new_df["Date_reply"] - new_df["Date_review"]
new_df['Date_reply'] = new_df['Date_reply'].apply(convert_review_date)

new_df["Response_time"] = new_df["Date_reply"] - new_df["Date_review"]
new_df[["Reply"]] = new_df[["Reply"]].fillna("No Reply")
new_df = new_df.dropna(how="any")

columns_to_convert = ['Date_review', 'Date_experience', 'Date_reply']

for col in columns_to_convert:
    new_df[col] = pd.to_datetime(new_df[col]).dt.strftime('%Y-%m-%d')

new_df["Response_time"] = new_df["Response_time"].astype(str)
new_df[["Response_time", "Day"]] = new_df["Response_time"].str.split(" ", n=1, expand=True)
new_df = new_df.drop("Day", axis=1)

#new_df["Date_reply"] = new_df["Date_reply"].fillna(new_df["Date_review"])
new_df.isnull().sum()
new_df["Response_time"] = new_df["Response_time"].astype('int64')

# Calculer le temps de réponse moyen pour chaque entreprise
average_response_time_by_company = new_df.groupby('Company')['Response_time'].mean()

# Afficher le résultat
average_response_time_by_company

new_df.to_csv("../data/processed/avis_clients.csv")





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["Date_reply"] = new_df["Date_reply"].fillna(new_df["Date_review"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Date_review'] = new_df['Date_review'].apply(lambda x: dateparser.parse(x).strftime('%Y-%m-%d') if isin

31


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["Date_reply"] = new_df["Date_reply"].apply(lambda x: dateparser.parse(x) if isinstance(x, str) else x)
  new_df["Date_reply"] = new_df.apply(lambda row: row["Date_review"] if pd.isna(row["Date_reply"]) else row["Date_reply"], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["Date_reply"] = new_df.apply(lambda row: row["Date_review"] if pd.isna(row["Date_reply"]) else row["Date_reply"], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [2]:
new_df

Unnamed: 0,Company,Customer,Number_review,Language,Title,Date_review,Reply,Date_reply,Rating,Status,Experience,Date_experience,Response_time
0,Younited Credit,Alexis,2,FR,Very good service,2023-10-23,"Bonjour,Nous sommes enchantés de vous lire et ...",2023-10-24,5,Verified,Very good service,2023-10-18,1
1,Younited Credit,Exo Saru,3,FR,Super simple,2023-08-24,"Bonjour,Nous vous remercions d’avoir pris le t...",2023-08-25,5,Verified,"Super simpleSuper simple, très rapide, interfa...",2023-08-17,1
2,Younited Credit,Getifra,3,FR,Younited Credit a démontré son…,2023-08-05,"Bonjour,Nous vous remercions d’avoir pris le t...",2023-08-08,5,Verified,Younited Credit a démontré son…Younited Credit...,2023-07-27,3
3,Younited Credit,Duval,9,FR,Rapide efficace simple un pur bonheur…,2022-11-28,"Bonjour,Merci d’avoir pris le temps de nous fa...",2022-11-30,5,Verified,Rapide efficace simple un pur bonheur…Rapide e...,2022-11-28,2
4,Younited Credit,Soso,2,FR,Younited une procédure simple,2023-02-25,"Bonjour,Nous vous remercions pour ce commentai...",2023-02-27,5,Verified,Younited une procédure simpleYounited permet u...,2023-02-19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2548,Anytime,RAPHAEL N,1,FR,ANYTIME IS NOT A BANK BUT THEY HAVE DONE A GRE...,2015-09-05,No Reply,2015-09-05,5,Invited,ANYTIME IS NOT A BANK BUT THEY HAVE DONE A GRE...,2015-09-05,0
2549,Anytime,FREDERIC LEGOUGE,8,FR,CARTE ANYTIME,2015-07-14,No Reply,2015-07-14,5,Invited,CARTE ANYTIMESuper ..............................,2015-07-14,0
2550,Anytime,Mokhtar,2,FR,Anytime,2015-07-13,No Reply,2015-07-13,5,Invited,AnytimeSuper excellent,2015-07-13,0
2551,Anytime,JEAN-LOUIS CATALDO,2,FR,Très intéressant,2015-07-13,No Reply,2015-07-13,4,Invited,Très intéressantUn système très bien étudié,2015-07-13,0
