In [80]:
import os, sys, re
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
import numpy as np

current_directory = os.getcwd()
root_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
sys.path.append(root_directory)

* **Concate all data**

In [81]:
import os
import pandas as pd

# Directory containing the CSV files
#directory = r'c:\Users\mohammed\OneDrive\Documents\QFM -S2\BankReviewIntelligence\ScrapperService\production_standalone\row_data\parcket\2024-06-08'
directory = "../../ScrapperService/production_standalone/row_data/parcket/2024-06-08"
# List to hold the data from each CSV file
data_frames = []

# Iterate over all files in the directory
for country in os.listdir(directory):
    if country != ".DS_Store":
        for city in os.listdir(directory+"/"+country):
            if city.endswith(".parquet"):
                file_path = os.path.join(directory+"/"+country, city)
                df = pd.read_parquet(file_path)
                data_frames.append(df)

# Concatenate all DataFrames
all_data = pd.concat(data_frames, ignore_index=True)
all_data['Reviewer_Publish_Date'] = all_data['Reviewer_Publish_Date'].str.replace('\xa0', ' ')

# Save the concatenated DataFrame to a new CSV file
all_data.to_parquet('concatenated_data.parquet', index=False)
print(all_data.shape)
#all_data.head()

(1710, 14)


* **Preprocessing**

In [82]:
from parquet_preprocessing import preprocess_dataframe
all_data = pd.read_parquet('concatenated_data.parquet')

In [83]:
data_set= preprocess_dataframe(all_data)

In [84]:
data_set

Unnamed: 0,Country,Town,Bank_Name,Bank_Phone_number,Bank_Address,Bank_Website,Reviewer_Nane,Reviewer_Sart,Reviewer_Text,Reviewer_Publish_Date,Reviewer_Like_Reaction,Reviewer_Profil_Link,Reviewer_Owner_Reply,Reviewer_Owner_Reply_Date,Topic,Sentiment,Sub_Topic
0,Bénin,Abomey,Agence Boa Abomey,+229 21 36 51 48,"52M7+5Pg, Abomey, Bénin",not available,Daniel Alfaro,5,"Utilisez le guichet automatique, cela fonction...",2023-06-29,1,https://www.google.com/maps/contrib/1048093050...,,0000-00-00,,,
1,Bénin,Abomey,Uba (United Bank Of Africa) Abomey-Calavi Branch,Not available,"F954+Vhf, Abomey Calavi, Bénin",not available,Moudhil Moutairou,2,NAN,2019-06-30,0,https://www.google.com/maps/contrib/1178074439...,,0000-00-00,,,
2,Bénin,Abomey,Banque Atlantique Abomey-Calavi,+33 7 56 88 93 03,"C9V3+Wmf, Abomey Calavi, Bénin",not available,Benedite Sentibanez,3,NAN,2019-06-30,0,https://www.google.com/maps/contrib/1157606660...,,0000-00-00,,,
3,Bénin,Abomey,Axa Banque Agence Abomey-Calavi,Not available,"F923+6Xq, Cotonou, Bénin",not available,Kyaam Boyz,5,NAN,2022-06-29,0,https://www.google.com/maps/contrib/1069854224...,,0000-00-00,,,
4,Bénin,Abomey,Boa,Not available,"F964+Ffq, Rnie2, Abomey Calavi, Bénin",not available,Julien Lokossou,3,NAN,2021-06-29,0,https://www.google.com/maps/contrib/1134636252...,,0000-00-00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1705,Senegal,Touba,Banque Atlantique,Not available,"V45Q+258, Touba, Sénégal",not available,Mamadou Wane,5,NAN,2021-06-29,0,https://www.google.com/maps/contrib/1127744667...,,0000-00-00,,,
1706,Senegal,Touba,Bsic,Not available,"V428+2W9, N3, Touba, Sénégal",not available,Khapsou Sour,5,NAN,2021-06-29,0,https://www.google.com/maps/contrib/1126701107...,,0000-00-00,,,
1707,Senegal,Touba,T.Q.G.,+221 77 717 01 27,"V45C+5Rp, Touba, Sénégal",not available,Fadal Ndiaye,5,NAN,2020-06-29,0,https://www.google.com/maps/contrib/1008380028...,,0000-00-00,,,
1708,Senegal,Touba,Microcred,Not available,"R4R7+C83, N3, Touba, Sénégal",not available,Lamine Gueye,2,NAN,2016-06-30,0,https://www.google.com/maps/contrib/1173845293...,,0000-00-00,,,


* **Topic generation**

In [94]:
def fill_na_based_on_stars(row):
    if pd.isna(row['Reviewer_Text']):
        if row['Reviewer_Sart'] == 5:
            return 'Très bon'
        elif row['Reviewer_Sart'] == 4:
            return 'bon'
        elif row['Reviewer_Sart'] == 3:
            return 'neutre'
        elif row['Reviewer_Sart'] == 2:
            return 'mauvais'
        elif row['Reviewer_Sart'] == 1:
            return 'Très mauvais'
    else:
        return row['Reviewer_Text']
    
def map_stars_to_sentiment(stars):
    if stars >= 4:
        return 'positive'
    elif stars == 3:
        return 'neutral'
    else:
        return 'negative'

data_set['Sentiment'] = data_set['Reviewer_Sart'].apply(map_stars_to_sentiment)

data_set['Reviewer_Text'] = data_set['Reviewer_Text'].astype(str).replace('NAN', np.nan)
data_set['Reviewer_Text'] = data_set.apply(fill_na_based_on_stars, axis=1)


data_set.head()

Unnamed: 0,Country,Town,Bank_Name,Bank_Phone_number,Bank_Address,Bank_Website,Reviewer_Nane,Reviewer_Sart,Reviewer_Text,Reviewer_Publish_Date,Reviewer_Like_Reaction,Reviewer_Profil_Link,Reviewer_Owner_Reply,Reviewer_Owner_Reply_Date,Topic,Sentiment,Sub_Topic,Preprocessed_review
0,Bénin,Abomey,Agence Boa Abomey,+229 21 36 51 48,"52M7+5Pg, Abomey, Bénin",not available,Daniel Alfaro,5,"Utilisez le guichet automatique, cela fonction...",2023-06-29,1,https://www.google.com/maps/contrib/1048093050...,,0000-00-00,cite web et application,positive,application mobile performante,utilisez guichet automatique cela fonctionne p...
1,Bénin,Abomey,Uba (United Bank Of Africa) Abomey-Calavi Branch,Not available,"F954+Vhf, Abomey Calavi, Bénin",not available,Moudhil Moutairou,2,mauvais,2019-06-30,0,https://www.google.com/maps/contrib/1178074439...,,0000-00-00,cite web et application,negative,application ne marche pas,
2,Bénin,Abomey,Banque Atlantique Abomey-Calavi,+33 7 56 88 93 03,"C9V3+Wmf, Abomey Calavi, Bénin",not available,Benedite Sentibanez,3,neutre,2019-06-30,0,https://www.google.com/maps/contrib/1157606660...,,0000-00-00,cite web et application,neutral,navigation simple,
3,Bénin,Abomey,Axa Banque Agence Abomey-Calavi,Not available,"F923+6Xq, Cotonou, Bénin",not available,Kyaam Boyz,5,Très bon,2022-06-29,0,https://www.google.com/maps/contrib/1069854224...,,0000-00-00,cite web et application,positive,application mobile performante,
4,Bénin,Abomey,Boa,Not available,"F964+Ffq, Rnie2, Abomey Calavi, Bénin",not available,Julien Lokossou,3,neutre,2021-06-29,0,https://www.google.com/maps/contrib/1134636252...,,0000-00-00,cite web et application,neutral,navigation simple,


In [95]:
from nltk.stem.isri import ISRIStemmer
Text = data_set['Reviewer_Text']
bank = data_set['Bank_Name']
Sentiment  = data_set['Sentiment']

preprossed_text = []
stop_words = set(stopwords.words('french'))
stop_words.add('Je')
stop_words.add('vais')
stop_words.add('it')
st = ISRIStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split()
    words = [st.stem(word) for word in text if word not in stop_words]
    return ' '.join(words)

for text in Text:
    text = preprocess_text(text)
    preprossed_text.append(text)

data_set['Preprocessed_review'] = preprossed_text


In [96]:
topics = {
    "positive": {
        
        "service_client": ["excellent service client", "réponses rapides aux demandes", "personnel serviable"],
        "produits_financiers": ["produits de haute qualité", "conditions avantageuses", "innovations régulières"],
        "frai bancaires": ["frais bancaires compétitifs", "offres de crédit attractives", "conditions de prêt favorables"],
        "cite web et application": ["cite web et application faciles à utiliser", "navigation intuitive", "services en ligne efficaces", "application mobile performante",  "application mobile innovante"],
        "gestion_des_comptes": ["gestion transparente des comptes", "outils efficaces de suivi financier", "faibles frais bancaires"],
        "sécurité": ["sécurité des transactions garantie", "protection renforcée des données", "gestion proactive des risques"],
        "localisation_accessibilité": ["emplacements pratiques", "accessibilité optimale aux agences", "guichets automatiques disponibles"],
        "services_additionnels": ["offres spéciales attrayantes", "programmes de fidélité avantageux", "services d'investissement performants"]
    },
    "neutral": {
        
        "service client": ["service client standard", "réponses acceptables", "temps d'attente modéré"],
        "produits financiers": ["produits standards", "conditions habituelles", "aucune innovation notable"],
        "frais bancaires": ["frais bancaires moyens", "offres de crédit classiques", "conditions de prêt standard"],
        "cite web et application": ["interface correcte", "navigation simple", "services en ligne fonctionnels"],
        "gestion des comptes": ["gestion des comptes moyenne", "informations claires mais basiques", "frais bancaires standard"],
        "sécurité": ["mesures de sécurité standards", "réponses adéquates aux risques", "sécurité des données respectée"],
        "localisation accessibilité": ["localisation acceptable des agences", "accessibilité correcte", "services de guichets automatiques suffisants"],
        "services additionnels": ["offres de crédit ordinaires", "programmes de fidélité standards", "services d'investissement réguliers"]
    },
    "negative": {
        
        "service client": ["service client médiocre", "réponses lentes aux demandes", "manque d'assistance"],
        "produits financiers": ["produits de qualité inférieure", "conditions défavorables", "manque d'adaptabilité"],
        "frais bancaires": ["frais bancaires élevés", "offres de crédit peu avantageuses", "conditions de prêt restrictives"],
        "cite web et application ": ["cite web ne marche pas", "application ne marche pas", "cite web et application ne marche pas"],
        "gestion des comptes": ["difficulté dans la gestion des comptes", "informations confuses ou trompeuses", "frais bancaires élevés"],
        "sécurité": ["préoccupations sérieuses de sécurité", "incidents de fraude fréquents", "protection des données insuffisante"],
        "manque de guichets et agence": ["mauvaise localisation des agences", "accessibilité limitée", "manque de guichets automatiques"],
        "services additionnels": ["offres de crédit peu avantageuses", "programmes de fidélité décevants", "services d'investissement sous-performants"]
    }
}


In [97]:
from sklearn.preprocessing import Normalizer
    
vectrizer = TfidfVectorizer()

X = vectrizer.fit_transform(data_set['Preprocessed_review'])
X.shape

model = TruncatedSVD(n_components=300, n_iter=7, random_state=42)

X = model.fit_transform(X)
normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)


# Deviding data into posetive, negative and neutral
positive_data = data_set[data_set['Sentiment'] == 'positive']
neutral_data = data_set[data_set['Sentiment'] == 'neutral']
negative_data = data_set[data_set['Sentiment'] == 'negative']


def get_topic_similarity(topic, review):
    review_vector = vectrizer.transform([review])
    review_vector = model.transform(review_vector)
    review_vector = normalizer.transform(review_vector)
    topic_vector = vectrizer.transform([topic])
    topic_vector = model.transform(topic_vector)
    topic_vector = normalizer.transform(topic_vector)
    similarity = cosine_similarity(review_vector, topic_vector)
    return similarity[0][0]

def get_review_topics(review, sentiment):
    main_topic_similarities = {}
    if review == "bon" or review == "très bon":
        return "Expérence", "Expérence positive"
    elif review == "neutre":
        return "Expérence", "Expérence standart"
    elif review == "mauvais" or review == "très mauvais":
        return "Expérence", "Expérence négative"
    
    for main_topic in topics[sentiment].keys():
        similarity = get_topic_similarity(main_topic, review)
        main_topic_similarities[main_topic] = similarity
    
    
    # Choisir le sujet principal avec la plus grande similarité
    chosen_main_topic = max(main_topic_similarities, key=main_topic_similarities.get)
    
    # Calculer la similarité avec les sous-sujets du sujet principal choisi
    sub_topic_similarities = {}
    for sub_topic in topics[sentiment][chosen_main_topic]:
        similarity = get_topic_similarity(sub_topic, review)
        sub_topic_similarities[sub_topic] = similarity
    
    # Choisir le sous-sujet avec la plus grande similarité
    chosen_sub_topic = max(sub_topic_similarities, key=sub_topic_similarities.get)
    
    return chosen_main_topic, chosen_sub_topic

review = "je ne trouve pas d'agence bancaire dans ma région"
sentiment = 'negative'
main_topic, sub_topic = get_review_topics(review, sentiment)
print(f"Review: {review}")
print(f"Main Topic: {main_topic}")
print(f"Sub Topic: {sub_topic}")  

Review: je ne trouve pas d'agence bancaire dans ma région
Main Topic: manque de guichets et agence
Sub Topic: accessibilité limitée


In [103]:
def get_topics_from_reviews(reviews, sentiment):
    main_topics = []
    sub_topics = []
    for review in reviews:
        
        main_topic, sub_topic = get_review_topics(review, sentiment)
        main_topics.append(main_topic)
        sub_topics.append(sub_topic)
    return main_topics, sub_topics

positive_reviews = positive_data['Preprocessed_review']
neutral_reviews = neutral_data['Preprocessed_review']
negative_reviews = negative_data['Preprocessed_review']

positive_main_topics, positive_sub_topics = get_topics_from_reviews(positive_reviews, 'positive')
neutral_main_topics, neutral_sub_topics = get_topics_from_reviews(neutral_reviews, 'neutral')
negative_main_topics, negative_sub_topics = get_topics_from_reviews(negative_reviews, 'negative')
print(positive_main_topics)
#adding the topics to the data_set

def add_topics_to_data_set(main_topics, sub_topics, sentiment):
    j = 0
    for i in range(len(data_set)):
        if data_set['Sentiment'][i] == sentiment :
            data_set['Topic'][i] = main_topics[j]
            data_set['Sub_Topic'][i] = sub_topics[j]
            j += 1

add_topics_to_data_set(positive_main_topics, positive_sub_topics, 'positive')
add_topics_to_data_set(neutral_main_topics, neutral_sub_topics, 'neutral')
add_topics_to_data_set(negative_main_topics, negative_sub_topics, 'negative')

data_set.head()

['cite web et application', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'cite web et application', 'Expérence', 'Expérence', 'Expérence', 'cite web et application', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'sécurité', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'Expérence', 'frai bancaires', 'frai bancaires', 'cite web et

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_set['Topic'][i] = main_topics[j]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_set['Sub_Topic'][i] = sub_topics[j]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_set['Topic'][i] = main_topics[j]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_set['Sub_Topic'][i] = sub_topics[j]
A value is

Unnamed: 0,Country,Town,Bank_Name,Bank_Phone_number,Bank_Address,Bank_Website,Reviewer_Nane,Reviewer_Sart,Reviewer_Text,Reviewer_Publish_Date,Reviewer_Like_Reaction,Reviewer_Profil_Link,Reviewer_Owner_Reply,Reviewer_Owner_Reply_Date,Topic,Sentiment,Sub_Topic,Preprocessed_review
0,Bénin,Abomey,Agence Boa Abomey,+229 21 36 51 48,"52M7+5Pg, Abomey, Bénin",not available,Daniel Alfaro,5,"Utilisez le guichet automatique, cela fonction...",2023-06-29,1,https://www.google.com/maps/contrib/1048093050...,,0000-00-00,cite web et application,positive,application mobile performante,utilisez guichet automatique cela fonctionne p...
1,Bénin,Abomey,Uba (United Bank Of Africa) Abomey-Calavi Branch,Not available,"F954+Vhf, Abomey Calavi, Bénin",not available,Moudhil Moutairou,2,mauvais,2019-06-30,0,https://www.google.com/maps/contrib/1178074439...,,0000-00-00,Expérence,negative,Expérence négative,mauvais
2,Bénin,Abomey,Banque Atlantique Abomey-Calavi,+33 7 56 88 93 03,"C9V3+Wmf, Abomey Calavi, Bénin",not available,Benedite Sentibanez,3,neutre,2019-06-30,0,https://www.google.com/maps/contrib/1157606660...,,0000-00-00,Expérence,neutral,Expérence standart,neutre
3,Bénin,Abomey,Axa Banque Agence Abomey-Calavi,Not available,"F923+6Xq, Cotonou, Bénin",not available,Kyaam Boyz,5,Très bon,2022-06-29,0,https://www.google.com/maps/contrib/1069854224...,,0000-00-00,Expérence,positive,Expérence positive,très bon
4,Bénin,Abomey,Boa,Not available,"F964+Ffq, Rnie2, Abomey Calavi, Bénin",not available,Julien Lokossou,3,neutre,2021-06-29,0,https://www.google.com/maps/contrib/1134636252...,,0000-00-00,Expérence,neutral,Expérence standart,neutre


In [105]:
data_set.to_csv('Macro_table_LSA.csv', index=False)