# ======================================
# NLP Model Training for SATIM FAQ Bot
# ======================================


In [9]:

# Import dependencies
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import json
import pickle
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import re
import warnings

warnings.filterwarnings('ignore')


# Download required NLTK data
nltk.download('punkt', force=True)
nltk.download('stopwords', quiet=True)

print("Setup complete!")


[nltk_data] Downloading package punkt to C:\Users\Morsi Store
[nltk_data]     DZ\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Setup complete!



# ======================================
# 1. Load Scraped Data
# ======================================


In [10]:

df = pd.read_csv('../data/processed/satim_faqs_cleaned.csv')

print(f"Loaded {len(df)} FAQs")
print(f"Categories: {list(df['category'].unique())}")
print("\nDataset Overview:")
print(df.info())
print("\nFirst few rows:")
print(df.head())


Loaded 4 FAQs
Categories: ['Contact', 'Informations', 'Paiements', 'Support']

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   question         4 non-null      object
 1   answer           4 non-null      object
 2   category         4 non-null      object
 3   source_url       4 non-null      object
 4   question_length  4 non-null      int64 
 5   answer_length    4 non-null      int64 
dtypes: int64(2), object(4)
memory usage: 324.0+ bytes
None

First few rows:
                                            question  \
0  Comment puis-je contacter le service client SA...   
1      Quels sont les horaires d'ouverture de SATIM?   
2   Comment puis-je effectuer un paiement via SATIM?   
3            Que faire en cas de problème technique?   

                                              answer      category  \
0  Vous pouvez 


# ======================================
# 2. Text Preprocessing
# ======================================


In [None]:
import spacy
import pandas as pd
import re

# Load French model
nlp = spacy.load("fr_core_news_md")

# Optional: Your stopwords if you have a custom list
french_stopwords = nlp.Defaults.stop_words

def preprocess_french_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    # Lowercase and basic cleaning
    text = text.lower()
    text = re.sub(r'[^\w\s\-àâäçéèêëïîôöùûüÿ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # spaCy processing
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if token.text not in french_stopwords and not token.is_punct and len(token.text) > 2
    ]

    return ' '.join(tokens)


In [14]:
sample_text = "Comment puis-je contacter le service client de SATIM pour résoudre mon problème?"
processed_text = preprocess_french_text(sample_text)
print(f"Original: {sample_text}")
print(f"Processed: {processed_text}")


Original: Comment puis-je contacter le service client de SATIM pour résoudre mon problème?
Processed: je contacter service client satim résoudre problème



# ======================================
# 3. Prepare Training Data
# ======================================


In [15]:
print("Preprocessing text data...")
df['processed_question'] = df['question'].apply(preprocess_french_text)
df['processed_answer'] = df['answer'].apply(preprocess_french_text)
df = df[(df['processed_question'].str.len() > 0) & (df['processed_answer'].str.len() > 0)]
print(f"After preprocessing: {len(df)} FAQs")


Preprocessing text data...
After preprocessing: 4 FAQs


In [16]:
for i in range(3):
    print(f"\n--- Example {i+1} ---")
    print(f"Original Q: {df.iloc[i]['question']}")
    print(f"Processed Q: {df.iloc[i]['processed_question']}")



--- Example 1 ---
Original Q: Comment puis-je contacter le service client SATIM?
Processed Q: je contacter service client satim

--- Example 2 ---
Original Q: Quels sont les horaires d'ouverture de SATIM?
Processed Q: horaire ouverture satim

--- Example 3 ---
Original Q: Comment puis-je effectuer un paiement via SATIM?
Processed Q: je effectuer paiement satim




# ======================================
# 4. TF-IDF Vectorizer
# ======================================



In [17]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    sublinear_tf=True
)
question_vectors = vectorizer.fit_transform(df['processed_question'])

print(f"TF-IDF matrix shape: {question_vectors.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"Sample features: {vectorizer.get_feature_names_out()[:20]}")


TF-IDF matrix shape: (4, 2)
Vocabulary size: 2
Sample features: ['je' 'satim']



# ======================================
# 5. FAQ Similarity Model
# ======================================



In [18]:
class FAQSimilarityModel:
    def __init__(self, vectorizer, question_vectors, faq_data):
        self.vectorizer = vectorizer
        self.question_vectors = question_vectors
        self.faq_data = faq_data.reset_index(drop=True)

    def find_best_match(self, query, top_k=3, min_similarity=0.1):
        processed_query = preprocess_french_text(query)
        if not processed_query:
            return []
        query_vector = self.vectorizer.transform([processed_query])
        similarities = cosine_similarity(query_vector, self.question_vectors).flatten()
        top_indices = similarities.argsort()[-top_k:][::-1]
        results = []
        for idx in top_indices:
            similarity = similarities[idx]
            if similarity >= min_similarity:
                results.append({
                    'question': self.faq_data.iloc[idx]['question'],
                    'answer': self.faq_data.iloc[idx]['answer'],
                    'category': self.faq_data.iloc[idx]['category'],
                    'similarity': float(similarity),
                    'confidence': self.calculate_confidence(similarity)
                })
        return results

    def calculate_confidence(self, similarity):
        if similarity >= 0.8:
            return 'high'
        elif similarity >= 0.5:
            return 'medium'
        elif similarity >= 0.2:
            return 'low'
        else:
            return 'very_low'


In [19]:

faq_model = FAQSimilarityModel(vectorizer, question_vectors, df)
print("✓ FAQ Similarity Model created successfully")


✓ FAQ Similarity Model created successfully



# ======================================
# 6. Test Model
# ======================================


In [21]:

test_queries = [
    "Comment contacter SATIM?",
    "Quels sont vos horaires d'ouverture?",
    "Comment faire un paiement?",
    "J'ai un problème technique",
    "Où êtes-vous situés?",
    "Comment créer un compte?",
    "Problème avec ma carte",
    "Tarifs et frais"
]


In [22]:

print("\n" + "=" * 80)
for i, query in enumerate(test_queries, 1):
    print(f"\n🔍 Test Query {i}: '{query}'")
    print("-" * 50)
    results = faq_model.find_best_match(query, top_k=2)
    if results:
        for j, result in enumerate(results, 1):
            print(f"\n  Match {j} (Similarity: {result['similarity']:.3f}, Confidence: {result['confidence']})")
            print(f"  Q: {result['question']}")
            print(f"  A: {result['answer'][:150]}...")
            print(f"  Category: {result['category']}")
    else:
        print("  ❌ No suitable matches found")
print("\n" + "=" * 80)




🔍 Test Query 1: 'Comment contacter SATIM?'
--------------------------------------------------

  Match 1 (Similarity: 1.000, Confidence: high)
  Q: Quels sont les horaires d'ouverture de SATIM?
  A: SATIM est généralement ouvert du dimanche au jeudi de 8h00 à 17h00. Les horaires peuvent varier selon les services....
  Category: Informations

  Match 2 (Similarity: 0.629, Confidence: medium)
  Q: Comment puis-je effectuer un paiement via SATIM?
  A: SATIM propose plusieurs méthodes de paiement électronique. Contactez-nous pour connaître les options disponibles selon votre situation....
  Category: Paiements

🔍 Test Query 2: 'Quels sont vos horaires d'ouverture?'
--------------------------------------------------
  ❌ No suitable matches found

🔍 Test Query 3: 'Comment faire un paiement?'
--------------------------------------------------
  ❌ No suitable matches found

🔍 Test Query 4: 'J'ai un problème technique'
--------------------------------------------------
  ❌ No suitable matches

In [26]:
list_of_questions = df['question'].tolist()  # or use df['processed_question'] if you want cleaned version
import torch


In [29]:
from sentence_transformers import SentenceTransformer, util
import torch  # ✅ This is the missing import

# Load multilingual model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Define list of questions
list_of_questions = df['question'].tolist()  # or use df['processed_question'].tolist() for cleaned

# Encode all FAQ questions
corpus_embeddings = model.encode(list_of_questions, convert_to_tensor=True)

# Encode the user query
query = "Comment faire un paiement?"
query_embedding = model.encode(query, convert_to_tensor=True)

# Compute cosine similarity
cosine_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)

# Top-k results
top_k = 3
top_results = torch.topk(cosine_scores, k=top_k)

# Print matches
print("\nTop Matches:\n" + "-" * 40)
for score, idx in zip(top_results[0][0], top_results[1][0]):
    index = idx.item()  # convert tensor to int
    print(f"Score: {score.item():.3f}")
    print(f"Q: {df.iloc[index]['question']}")
    print(f"A: {df.iloc[index]['answer'][:150]}...")
    print(f"Category: {df.iloc[index]['category']}")
    print("-" * 40)



Top Matches:
----------------------------------------
Score: 0.854
Q: Comment puis-je effectuer un paiement via SATIM?
A: SATIM propose plusieurs méthodes de paiement électronique. Contactez-nous pour connaître les options disponibles selon votre situation....
Category: Paiements
----------------------------------------
Score: 0.209
Q: Comment puis-je contacter le service client SATIM?
A: Vous pouvez contacter le service client SATIM par téléphone, email ou en visitant nos bureaux. Consultez notre page contact pour plus d'informations....
Category: Contact
----------------------------------------
Score: 0.169
Q: Que faire en cas de problème technique?
A: En cas de problème technique, contactez immédiatement notre support technique. Nous vous aiderons à résoudre le problème rapidement....
Category: Support
----------------------------------------


In [31]:
queries = ["Comment faire un paiement?", "Paiement avec SATIM", "Méthodes de paiement disponibles"]
# Encode all, average embeddings


In [32]:
def semantic_search(query, df, model, top_k=3, threshold=0.5):
    questions = df['processed_question'].tolist()
    corpus_embeddings = model.encode(questions, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)
    top_results = torch.topk(cosine_scores, k=top_k)
    
    print("\nTop Matches:\n" + "-" * 40)
    for score, idx in zip(top_results[0][0], top_results[1][0]):
        score_val = score.item()
        if score_val < threshold:
            continue
        index = idx.item()
        print(f"Score: {score_val:.3f}")
        print(f"Q: {df.iloc[index]['question']}")
        print(f"A: {df.iloc[index]['answer'][:150]}...")
        print(f"Category: {df.iloc[index]['category']}")
        print("-" * 40)


In [33]:
# Run the function with a test query
test_query = "Comment faire un paiement?"
results = semantic_search(test_query, df, model)
results


Top Matches:
----------------------------------------
Score: 0.736
Q: Comment puis-je effectuer un paiement via SATIM?
A: SATIM propose plusieurs méthodes de paiement électronique. Contactez-nous pour connaître les options disponibles selon votre situation....
Category: Paiements
----------------------------------------
