In [1]:
import os, sys, re
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta

current_directory = os.getcwd()
root_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
sys.path.append(root_directory)

* **Concate all data**

In [2]:
import os
import pandas as pd

# Directory containing the CSV files
#directory = r'c:\Users\mohammed\OneDrive\Documents\QFM -S2\BankReviewIntelligence\ScrapperService\production_standalone\row_data\parcket\2024-06-08'
directory = "../../ScrapperService/production_standalone/row_data/parcket/2024-06-08"
# List to hold the data from each CSV file
data_frames = []

# Iterate over all files in the directory
for country in os.listdir(directory):
    if country != ".DS_Store":
        for city in os.listdir(directory+"/"+country):
            if city.endswith(".parquet"):
                file_path = os.path.join(directory+"/"+country, city)
                df = pd.read_parquet(file_path)
                data_frames.append(df)

# Concatenate all DataFrames
all_data = pd.concat(data_frames, ignore_index=True)
all_data['Reviewer_Publish_Date'] = all_data['Reviewer_Publish_Date'].str.replace('\xa0', ' ')

# Save the concatenated DataFrame to a new CSV file
all_data.to_parquet('concatenated_data.parquet', index=False)
print(all_data.shape)
#all_data.head()

(1710, 14)


* **Preprocessing**

In [3]:
from parquet_preprocessing import preprocess_dataframe
all_data = pd.read_parquet('concatenated_data.parquet')

In [4]:
df = preprocess_dataframe(all_data)

In [5]:
df

Unnamed: 0,Country,Town,Bank_Name,Bank_Phone_number,Bank_Address,Bank_Website,Reviewer_Nane,Reviewer_Sart,Reviewer_Text,Reviewer_Publish_Date,Reviewer_Like_Reaction,Reviewer_Profil_Link,Reviewer_Owner_Reply,Reviewer_Owner_Reply_Date,Topic,Sentiment,Sub_Topic
0,Kenya,Nyeri,Standard Chartered Bank Nyeri Branch,+254 20 3293900,"Kenyatta Rd, Nyeri, Kenya",standardchartered.com,V M,5,NAN,2018-06-29,0,https://www.google.com/maps/contrib/1169461831...,,0000-00-00,,,
1,Kenya,Kitui,Co-Op Bank Atm - Kitui Branch,+254 44 4422370,"Emco House, Kilungya Makuti Street, Kitui, Kenya",co-opbank.co.ke,Emman Emmah,5,NAN,2023-10-31,0,https://www.google.com/maps/contrib/1125754277...,,0000-00-00,,,
2,Kenya,Voi,Kcb Voi,Not available,"Voi, Kenya",not available,Isaac Akuru Miruka,5,NAN,2021-06-28,0,https://www.google.com/maps/contrib/1090182152...,,0000-00-00,,,
3,Kenya,Kitale,Standard Chartered Bank - Kitale Branch,+254 20 3293900,"Ground Floor- Teryet Business Centre, Kenyatta...",standardchartered.com,Tony Tonny,1,NAN,2020-06-28,0,https://www.google.com/maps/contrib/1018542844...,,0000-00-00,,,
4,Kenya,Homa Bay,"Barclays Bank, Homa Bay",Not available,"Fff4+7W4, Homa Bay, Kenya",not available,Declan Ottaro,3,Salle bancaire,2018-06-29,0,https://www.google.com/maps/contrib/1023346861...,,0000-00-00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1705,Côte D'Ivoire,Divo,Sgci,+225 51225122,"Rjrr+4W8, Divo, Côte D’Ivoire",not available,Marie Germaine,5,NAN,2019-06-29,0,https://www.google.com/maps/contrib/1054373247...,,0000-00-00,,,
1706,Côte D'Ivoire,Divo,Coopec Divo,Not available,"Rjmm+P5W, Divo, Côte D’Ivoire",not available,Comoe Djibril,2,NAN,2019-06-29,0,https://www.google.com/maps/contrib/1132578295...,,0000-00-00,,,
1707,Côte D'Ivoire,Divo,Coopec Divo,Not available,"Rjmm+P5W, Divo, Côte D’Ivoire",not available,Comoe Djibril,2,NAN,2019-06-29,0,https://www.google.com/maps/contrib/1132578295...,,0000-00-00,,,
1708,Côte D'Ivoire,Divo,Hamed & El Djo Transfert Divo,+225 05 56 33 4482,"Bp 385, Divo, Côte D’Ivoire",not available,Moctar Kone (Kmoctar),3,NAN,2023-06-28,0,https://www.google.com/maps/contrib/1036178797...,,0000-00-00,,,


* **Topic generation**

In [6]:
from llm.topificator import TopicExtractor
from tqdm import tqdm
extractor = TopicExtractor(model="llama3", patience=5)

In [7]:
df_macro = pd.DataFrame(columns=list(df.columns))
output_file = 'macro_llamma.csv'

In [8]:
save_interval = 1
row_accumulated = 0

In [9]:
def generate_static_topics_and_sentiments(stars):
    if stars == 1:
        return ("Expérience", "Negative", [])
    elif stars == 2:
        return ("Expérience", "Negative", [])
    elif stars == 3:
        return ("Expérience", "Neutral", [])
    elif stars == 4:
        return ("Expérience", "Positive", [])
    elif stars == 5:
        return ("Expérience", "Positive", [])
    else:
        return ("Experience", "Neutral", [])

In [10]:
# Itérer sur chaque ligne du DataFrame
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    review = row['Reviewer_Text']
    
    try:
        topics = extractor.extract(review, type='SINGLE_SOURCE')
        topics_array = topics['topics']
    
        if len(topics_array) == 0:
            static_topic, static_sentiment, static_sub_topic = generate_static_topics_and_sentiments(row['Reviewer_Sart'])
            new_row = row.copy()
            new_row['Topic'] = static_topic
            new_row['Sentiment'] = static_sentiment
            new_row['Sub_Topic'] = static_sub_topic
            df_macro = pd.concat([df_macro, pd.DataFrame([new_row])], ignore_index=True)
            row_accumulated += 1
        else:
            for tuple_ in topics_array:
                new_row = row.copy()
                new_row['Topic'] = tuple_[0]
                new_row['Sentiment'] = tuple_[1]
                new_row['Sub_Topic'] = tuple_[2]
                df_macro = pd.concat([df_macro, pd.DataFrame([new_row])], ignore_index=True)
                row_accumulated += 1
            
        # Sauvegarder à chaque intervalle défini
        if row_accumulated >= save_interval:
            df_macro.to_csv(output_file, index=False)
            row_accumulated = 0
    except Exception as e:
        print(f"Erreur rencontrée à l'index {index}: {e}")
        pass

  1%|█▍                                                                                                       | 24/1710 [03:39<3:51:06,  8.22s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response
Attempt 2 failed with error: Failed to parse response: No JSON object could be identified in the response
Attempt 3 failed with error: Failed to parse response: No JSON object could be identified in the response
Attempt 4 failed with error: Failed to parse response: No JSON object could be identified in the response


  2%|██▏                                                                                                      | 35/1710 [05:01<3:38:55,  7.84s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 10%|██████████                                                                                              | 166/1710 [14:32<2:14:48,  5.24s/it]

Erreur rencontrée à l'index 165: list index out of range


 10%|██████████▉                                                                                             | 179/1710 [15:27<2:03:21,  4.83s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 12%|████████████▌                                                                                           | 206/1710 [20:12<3:33:01,  8.50s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 18%|██████████████████▋                                                                                     | 308/1710 [31:53<2:10:11,  5.57s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 19%|███████████████████▉                                                                                    | 327/1710 [36:41<5:23:45, 14.05s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 20%|█████████████████████▏                                                                                  | 349/1710 [41:20<4:32:17, 12.00s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 23%|███████████████████████▌                                                                                | 388/1710 [46:22<2:00:21,  5.46s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 23%|███████████████████████▊                                                                                | 391/1710 [48:09<4:04:14, 11.11s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 25%|██████████████████████████▏                                                                             | 430/1710 [50:41<1:09:33,  3.26s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 28%|████████████████████████████▊                                                                           | 474/1710 [54:33<2:35:34,  7.55s/it]

Attempt 1 failed with error: Failed to parse response: invalid syntax. Perhaps you forgot a comma? (<string>, line 2)


 30%|██████████████████████████████▍                                                                       | 511/1710 [1:04:14<2:08:42,  6.44s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 31%|███████████████████████████████▏                                                                      | 522/1710 [1:08:05<5:32:44, 16.81s/it]

Attempt 1 failed with error: Failed to parse response: invalid syntax. Perhaps you forgot a comma? (<string>, line 2)


 31%|███████████████████████████████▎                                                                      | 524/1710 [1:09:48<8:37:36, 26.19s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 31%|███████████████████████████████▉                                                                      | 536/1710 [1:11:28<3:54:20, 11.98s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 32%|████████████████████████████████▏                                                                     | 540/1710 [1:12:18<3:56:21, 12.12s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 51%|████████████████████████████████████████████████████▎                                                 | 878/1710 [1:35:27<1:52:01,  8.08s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 57%|██████████████████████████████████████████████████████████▎                                           | 978/1710 [1:47:08<1:48:43,  8.91s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 65%|██████████████████████████████████████████████████████████████████▍                                    | 1103/1710 [2:00:08<47:39,  4.71s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 76%|████████████████████████████████████████████████████████████████████████████▉                        | 1302/1710 [2:22:47<2:04:37, 18.33s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response
Attempt 2 failed with error: Failed to parse response: No JSON object could be identified in the response
Attempt 3 failed with error: Failed to parse response: No JSON object could be identified in the response


 83%|█████████████████████████████████████████████████████████████████████████████████████▉                 | 1426/1710 [2:41:36<25:07,  5.31s/it]

Attempt 1 failed with error: Failed to parse response: name 'topics' is not defined


 84%|██████████████████████████████████████████████████████████████████████████████████████▌                | 1437/1710 [2:44:46<43:25,  9.54s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 85%|███████████████████████████████████████████████████████████████████████████████████████                | 1446/1710 [2:46:34<49:42, 11.30s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 85%|███████████████████████████████████████████████████████████████████████████████████████▎               | 1449/1710 [2:47:19<53:00, 12.19s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response
Attempt 2 failed with error: Failed to parse response: No JSON object could be identified in the response


 86%|████████████████████████████████████████████████████████████████████████████████████████▋              | 1472/1710 [2:50:46<17:15,  4.35s/it]

Attempt 1 failed with error: Failed to parse response: invalid syntax (<string>, line 9)
Attempt 2 failed with error: Failed to parse response: No JSON object could be identified in the response


 87%|███████████████████████████████████████████████████████████████████████████████████████▋             | 1484/1710 [2:56:35<1:12:31, 19.26s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 91%|█████████████████████████████████████████████████████████████████████████████████████████████▋         | 1556/1710 [3:11:37<35:04, 13.67s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍| 1701/1710 [3:42:06<02:16, 15.20s/it]

Attempt 1 failed with error: Failed to parse response: No JSON object could be identified in the response


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1710/1710 [3:44:50<00:00,  7.89s/it]


In [11]:
# Sauvegarder une dernière fois après la fin de la boucle
df_macro.to_csv(output_file, index=False)

# Afficher un message une fois le processus terminé
print(f"DataFrame mis à jour et sauvegardé dans {output_file}")

DataFrame mis à jour et sauvegardé dans macro_llamma.csv
