In [84]:
import os
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [131]:
import os
import pandas as pd

# Directory containing the CSV files
directory = r'c:\Users\mohammed\OneDrive\Documents\QFM -S2\BankReviewIntelligence\ScrapperService\production_standalone\row_data\parcket\2024-06-08\Senegal'

# List to hold the data from each CSV file
data_frames = []

# Iterate over all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".parquet"):
        file_path = os.path.join(directory, filename)
        df = pd.read_parquet(file_path)
        df['Ville'] = filename.replace('.parquet', '')
        data_frames.append(df)

# Concatenate all DataFrames
all_data = pd.concat(data_frames, ignore_index=True)

def rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

all_data = rename_duplicate_columns(all_data)

# Save the concatenated DataFrame to a new CSV file
all_data.to_parquet('concatenated_data.parquet', index=False)
print(all_data.shape)
all_data.head()


(277, 15)


Unnamed: 0,Country,Town,Bank_Name,Bank_Phone_number,Bank_Address,Bank_Website,Reviewer_Nane,Reviewer_Sart,Reviewer_Text,Reviewer_Publish_Date,Reviewer_Like_Reaction,Reviewer_Profil_Link,Reviewer_Owner_Reply,Reviewer_Owner_Reply_Date,Ville
0,Senegal,Bignona,CBAO Bignona,Not available,"QQXF+48G, Bignona, Sénégal",Not available,Boubacar Diallo,2,NAN,il y a 5 ans,0,https://www.google.com/maps/contrib/1175237482...,,NAN,Bignona
1,Senegal,Bignona,Credit Mutuel du Sénégal Bignona,+221 77 531 18 23,"RQ3C+PRP, N5, Bignona, Sénégal",Not available,Gnansou Camara,5,NAN,il y a 8 mois,0,https://www.google.com/maps/contrib/1025446772...,,NAN,Bignona
2,Senegal,Bignona,Microcred Bignona,Not available,"QQWF+W78, Bignona, Sénégal",Not available,Malang bassene Bodian,5,NAN,il y a 5 ans,0,https://www.google.com/maps/contrib/1018032682...,,NAN,Bignona
3,Senegal,Dakar,Banque de Dakar,+221 33 849 86 00,"13000, Sénégal",bdk.sn,Imhotep BAKHOUM,4,NAN,il y a 7 ans,0,https://www.google.com/maps/contrib/1162056137...,,NAN,Dakar
4,Senegal,Dakar,LA BANQUE AGRICOLE (LBA),+221 33 839 36 36,"Place de l'indépendance, 31-33 Rue Amadou Assa...",labanqueagricole.sn,Matar Diop,3,NAN,il y a 9 ans,0,https://www.google.com/maps/contrib/1006801030...,,NAN,Dakar


In [132]:
reviews = "concatenated_data.parquet"
data_set = pd.read_parquet(reviews)
data_set.rename(columns={'Reviewer_Nane': 'Name'}, inplace=True)
data_set.rename(columns={'Reviewer_Sart': 'Stars'}, inplace=True)
data_set.rename(columns={'Reviewer_Text': 'Review'}, inplace=True)
data_set.rename(columns={'Reviewer_Publish_Date': 'Date'}, inplace=True)
data_set.rename(columns={'ReviewerLike_Reaction': 'Stars'}, inplace=True)
data_set.rename(columns={'Reviewer_Like_Reaction': 'Likes'}, inplace=True)
data_set.drop(columns=['Reviewer_Profil_Link'], inplace=True)
data_set.drop(columns=['Reviewer_Owner_Reply'], inplace=True)
data_set.drop(columns=['Reviewer_Owner_Reply_Date'], inplace=True)



data_set.head()

Unnamed: 0,Country,Town,Bank_Name,Bank_Phone_number,Bank_Address,Bank_Website,Name,Stars,Review,Date,Likes,Ville
0,Senegal,Bignona,CBAO Bignona,Not available,"QQXF+48G, Bignona, Sénégal",Not available,Boubacar Diallo,2,NAN,il y a 5 ans,0,Bignona
1,Senegal,Bignona,Credit Mutuel du Sénégal Bignona,+221 77 531 18 23,"RQ3C+PRP, N5, Bignona, Sénégal",Not available,Gnansou Camara,5,NAN,il y a 8 mois,0,Bignona
2,Senegal,Bignona,Microcred Bignona,Not available,"QQWF+W78, Bignona, Sénégal",Not available,Malang bassene Bodian,5,NAN,il y a 5 ans,0,Bignona
3,Senegal,Dakar,Banque de Dakar,+221 33 849 86 00,"13000, Sénégal",bdk.sn,Imhotep BAKHOUM,4,NAN,il y a 7 ans,0,Dakar
4,Senegal,Dakar,LA BANQUE AGRICOLE (LBA),+221 33 839 36 36,"Place de l'indépendance, 31-33 Rue Amadou Assa...",labanqueagricole.sn,Matar Diop,3,NAN,il y a 9 ans,0,Dakar


In [134]:
def fill_na_based_on_stars(row):
    if pd.isna(row['Review']):
        if row['Stars'] == 5:
            return 'Très bon'
        elif row['Stars'] == 4:
            return 'bon'
        elif row['Stars'] == 3:
            return 'neutre'
        elif row['Stars'] == 2:
            return 'mauvais'
        elif row['Stars'] == 1:
            return 'Très mauvais'
    else:
        return row['Review']
    
def map_stars_to_sentiment(stars):
    if stars >= 4:
        return 'positive'
    elif stars == 3:
        return 'neutral'
    else:
        return 'negative'

data_set['sentiment_category'] = data_set['Stars'].apply(map_stars_to_sentiment)

data_set['Review'] = data_set['Review'].astype(str).replace('NAN', np.nan)
data_set['Review'] = data_set.apply(fill_na_based_on_stars, axis=1)


In [135]:
data_set.head()

Unnamed: 0,Country,Town,Bank_Name,Bank_Phone_number,Bank_Address,Bank_Website,Name,Stars,Review,Date,Likes,Ville,sentiment_category
0,Senegal,Bignona,CBAO Bignona,Not available,"QQXF+48G, Bignona, Sénégal",Not available,Boubacar Diallo,2,mauvais,il y a 5 ans,0,Bignona,negative
1,Senegal,Bignona,Credit Mutuel du Sénégal Bignona,+221 77 531 18 23,"RQ3C+PRP, N5, Bignona, Sénégal",Not available,Gnansou Camara,5,Très bon,il y a 8 mois,0,Bignona,positive
2,Senegal,Bignona,Microcred Bignona,Not available,"QQWF+W78, Bignona, Sénégal",Not available,Malang bassene Bodian,5,Très bon,il y a 5 ans,0,Bignona,positive
3,Senegal,Dakar,Banque de Dakar,+221 33 849 86 00,"13000, Sénégal",bdk.sn,Imhotep BAKHOUM,4,bon,il y a 7 ans,0,Dakar,positive
4,Senegal,Dakar,LA BANQUE AGRICOLE (LBA),+221 33 839 36 36,"Place de l'indépendance, 31-33 Rue Amadou Assa...",labanqueagricole.sn,Matar Diop,3,neutre,il y a 9 ans,0,Dakar,neutral


In [136]:
Text = data_set['Review']
bank = data_set['Bank_Name']
Sentiment  = data_set['sentiment_category']

preprossed_text = []
stop_words = set(stopwords.words('french'))
stop_words.add('Je')
stop_words.add('vais')
stop_words.add('it')
st = ISRIStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split()
    words = [st.stem(word) for word in text if word not in stop_words]
    return ' '.join(words)

for text in Text:
    text = preprocess_text(text)
    preprossed_text.append(text)


data = pd.DataFrame({'Bank':bank,'preprocessed_review':preprossed_text , 'Sentiment':Sentiment})
data.head()

Unnamed: 0,Bank,preprocessed_review,Sentiment
0,CBAO Bignona,mauvais,negative
1,Credit Mutuel du Sénégal Bignona,très bon,positive
2,Microcred Bignona,très bon,positive
3,Banque de Dakar,bon,positive
4,LA BANQUE AGRICOLE (LBA),neutre,neutral


In [85]:

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [92]:
def classify_sentiment(review):
    result = classifier(review)[0]
    return result['label']


data['sentiment'] = data['preprocessed_review'].apply(classify_sentiment)
data.head()

Unnamed: 0,Bank,preprocessed_review,sentiment
0,Banque Populaire,bon,4 stars
1,Crédit du Maroc - Agence Founty,neutre,3 stars
2,Banque Populaire Siège Centre Sud,mauvais,1 star
3,Société Générale Bank,retirer largent distributeur retiennent visa L...,1 star
4,Crédit du Maroc,bon,4 stars


In [142]:
topics = {
    "positive": {
        "bon": ["généralement bon"],
        "service_client": ["excellent service client", "réponses rapides aux demandes", "personnel serviable"],
        "produits_financiers": ["produits de haute qualité", "conditions avantageuses", "innovations régulières"],
        "expérience_utilisateur": ["interface intuitive", "navigation fluide", "facilité d'utilisation des services en ligne"],
        "gestion_des_comptes": ["gestion transparente des comptes", "outils efficaces de suivi financier", "faibles frais bancaires"],
        "sécurité": ["sécurité des transactions garantie", "protection renforcée des données", "gestion proactive des risques"],
        "localisation_accessibilité": ["emplacements pratiques", "accessibilité optimale aux agences", "guichets automatiques disponibles"],
        "services_additionnels": ["offres spéciales attrayantes", "programmes de fidélité avantageux", "services d'investissement performants"]
    },
    "neutral": {
        "neutre": ["généralement neutre"],
        "service_client": ["service client standard", "réponses acceptables", "temps d'attente modéré"],
        "produits_financiers": ["produits standards", "conditions habituelles", "aucune innovation notable"],
        "expérience_utilisateur": ["interface correcte", "navigation fonctionnelle", "services en ligne satisfaisants"],
        "gestion_des_comptes": ["gestion des comptes moyenne", "informations claires mais basiques", "frais bancaires standard"],
        "sécurité": ["mesures de sécurité standards", "réponses adéquates aux risques", "sécurité des données respectée"],
        "localisation_accessibilité": ["localisation acceptable des agences", "accessibilité correcte", "services de guichets automatiques suffisants"],
        "services_additionnels": ["offres de crédit ordinaires", "programmes de fidélité standards", "services d'investissement réguliers"]
    },
    "negative": {
        "mauvais": ["généralement mauvais"],
        "service_client": ["service client médiocre", "réponses lentes aux demandes", "manque d'assistance"],
        "produits_financiers": ["produits de qualité inférieure", "conditions défavorables", "manque d'adaptabilité"],
        "expérience_utilisateur": ["interface confuse", "navigation difficile", "problèmes fréquents avec les services en ligne"],
        "gestion_des_comptes": ["difficulté dans la gestion des comptes", "informations confuses ou trompeuses", "frais bancaires élevés"],
        "sécurité": ["préoccupations sérieuses de sécurité", "incidents de fraude fréquents", "protection des données insuffisante"],
        "localisation_accessibilité": ["mauvaise localisation des agences", "accessibilité limitée", "manque de guichets automatiques"],
        "services_additionnels": ["offres de crédit peu avantageuses", "programmes de fidélité décevants", "services d'investissement sous-performants"]
    }
}

# Exemples de sous-sujets spécifiques pour chaque sujet principal et sentiment
for sentiment, sentiment_topics in topics.items():
    print(f"Sentiment : {sentiment}")
    for main_topic, subtopics_list in sentiment_topics.items():
        print(f"  Sujet principal : {main_topic}")
        print("  Sous-sujets :")
        for subtopic in subtopics_list:
            print(f"    - {subtopic}")
    print()



Sentiment : positive
  Sujet principal : bon
  Sous-sujets :
    - généralement bon
  Sujet principal : service_client
  Sous-sujets :
    - excellent service client
    - réponses rapides aux demandes
    - personnel serviable
  Sujet principal : produits_financiers
  Sous-sujets :
    - produits de haute qualité
    - conditions avantageuses
    - innovations régulières
  Sujet principal : expérience_utilisateur
  Sous-sujets :
    - interface intuitive
    - navigation fluide
    - facilité d'utilisation des services en ligne
  Sujet principal : gestion_des_comptes
  Sous-sujets :
    - gestion transparente des comptes
    - outils efficaces de suivi financier
    - faibles frais bancaires
  Sujet principal : sécurité
  Sous-sujets :
    - sécurité des transactions garantie
    - protection renforcée des données
    - gestion proactive des risques
  Sujet principal : localisation_accessibilité
  Sous-sujets :
    - emplacements pratiques
    - accessibilité optimale aux agences
    

In [143]:

    
vectrizer = TfidfVectorizer()

X = vectrizer.fit_transform(data['preprocessed_review'])
X.shape

svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X = svd.fit_transform(X)
X.shape

# Deviding data into posetive, negative and neutral
positive_data = data[data['Sentiment'] == 'positive']
neutral_data = data[data['Sentiment'] == 'neutral']
negative_data = data[data['Sentiment'] == 'negative']


def get_topic_similarity(topic, review):
    review_vector = vectrizer.transform([review])
    review_vector = svd.transform(review_vector)
    topic_vector = vectrizer.transform([topic])
    topic_vector = svd.transform(topic_vector)
    similarity = cosine_similarity(review_vector, topic_vector)
    return similarity[0][0]

def get_review_topics(review, sentiment):
    main_topic_similarities = {}
    for main_topic in topics[sentiment].keys():
        similarity = get_topic_similarity(main_topic, review)
        main_topic_similarities[main_topic] = similarity
    
    # Choisir le sujet principal avec la plus grande similarité
    chosen_main_topic = max(main_topic_similarities, key=main_topic_similarities.get)
    
    # Calculer la similarité avec les sous-sujets du sujet principal choisi
    sub_topic_similarities = {}
    for sub_topic in topics[sentiment][chosen_main_topic]:
        similarity = get_topic_similarity(sub_topic, review)
        sub_topic_similarities[sub_topic] = similarity
    
    # Choisir le sous-sujet avec la plus grande similarité
    chosen_sub_topic = max(sub_topic_similarities, key=sub_topic_similarities.get)
    
    return chosen_main_topic, chosen_sub_topic




# Exemple d'utilisation
review = "the service was very bad and the staff was very rude"
sentiment = 'negative'
main_topic, sub_topic = get_review_topics(review, sentiment)
print(f"Review: {review}")
print(f"Main Topic: {main_topic}")
print(f"Sub Topic: {sub_topic}")




    

Review: the service was very bad and the staff was very rude
Main Topic: service_client
Sub Topic: service client médiocre


In [144]:
def get_topics_from_reviews(reviews, sentiment):
    main_topics = []
    sub_topics = []
    for review in reviews:
        main_topic, sub_topic = get_review_topics(review, sentiment)
        main_topics.append(main_topic)
        sub_topics.append(sub_topic)
    return main_topics, sub_topics

positive_reviews = positive_data['preprocessed_review']
neutral_reviews = neutral_data['preprocessed_review']
negative_reviews = negative_data['preprocessed_review']

positive_main_topics, positive_sub_topics = get_topics_from_reviews(positive_reviews, 'positive')
neutral_main_topics, neutral_sub_topics = get_topics_from_reviews(neutral_reviews, 'neutral')
negative_main_topics, negative_sub_topics = get_topics_from_reviews(negative_reviews, 'negative')

positive_data['main_topic'] = positive_main_topics
positive_data['sub_topic'] = positive_sub_topics

neutral_data['main_topic'] = neutral_main_topics
neutral_data['sub_topic'] = neutral_sub_topics

negative_data['main_topic'] = negative_main_topics
negative_data['sub_topic'] = negative_sub_topics


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_data['main_topic'] = positive_main_topics
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_data['sub_topic'] = positive_sub_topics
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neutral_data['main_topic'] = neutral_main_topics
A value is trying to be set on a copy of a slice from a 

In [145]:
Final_result = pd.concat([positive_data, neutral_data, negative_data], ignore_index=True)
Final_result.to_csv('Final_result.csv', index=False)
Final_result.head()


Unnamed: 0,Bank,preprocessed_review,Sentiment,main_topic,sub_topic
0,Credit Mutuel du Sénégal Bignona,très bon,positive,bon,généralement bon
1,Microcred Bignona,très bon,positive,bon,généralement bon
2,Banque de Dakar,bon,positive,bon,généralement bon
3,BRIDGE BANK SÉNÉGAL Agence Horizon,bon,positive,bon,généralement bon
4,BANQUE ATLANTIQUE Agence Sacre coeur,bon,positive,bon,généralement bon


In [None]:
import torch
import torch.nn as nn

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert = model
        self.dropout = nn.Dropout(0.1)
        self.relu =  nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 3)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input_ids, attention_mask):
        _, output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = self.fc1(output)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc4(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc5(x)
        return self.softmax(x)