# üéØ Objectif principal :
üëâ Personnaliser l‚Äôexp√©rience d‚Äôapprentissage en recommandant les cours les plus pertinents pour chaque utilisateur, selon ses go√ªts, ses √©valuations pass√©es et les contenus disponibles.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest
from textblob import TextBlob
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


# --------------------------
# Simuler les donn√©es
# --------------------------

In [2]:
import pandas as pd
rating = pd.read_csv("sephora_skincare_reviews.csv")
ratings=rating.head(20)
ratings.reset_index(drop=True, inplace=True)
print(ratings.head(10))

   review_id     username                 product_name     category  rating  \
0       1802  user0534359   Niacinamide Spot Corrector    Treatment       4   
1       1191   user075161         Barrier Repair Cream  Moisturizer       3   
2       1818  user0708191  Vitamin C Brightening Serum        Serum       5   
3        252  user1041788        Hyaluronic Acid Serum        Serum       4   
4       2506   user060517  Vitamin C Brightening Serum        Serum       4   
5       1118  user1075994   Niacinamide Spot Corrector    Treatment       3   
6       1412  user0134519          Retinol Night Serum        Serum       5   
7       2114  user0085907   Niacinamide Spot Corrector    Treatment       5   
8        409  user0846131           Soothing Eye Cream     Eye care       5   
9       2580  user0307941          Pore Refining Toner        Toner       2   

                                            feedback        date    skin_type  \
0  Noticed peeling when I used it with other acti

# --------------------------
# Analyse des sentiments
# --------------------------

In [3]:
def analyze_sentiment(feedback):
    return TextBlob(feedback).sentiment.polarity

ratings['sentiment'] = ratings['feedback'].apply(analyze_sentiment)
print("\nüß† R√©sultat de l'analyse des sentiments :")
print(ratings[['feedback', 'sentiment']])


üß† R√©sultat de l'analyse des sentiments :
                                             feedback  sentiment
0   Noticed peeling when I used it with other acti...  -0.129167
1   TThe pump drspensses too much prduct each time...  -0.150000
2   Did ont suit my oily-ocmbination sin; mdaee it...   0.000000
3   It clogged my pors adn causeed breakouts near ...   0.100000
4   Packoging is convenient atd hygienic, pump wor...   0.000000
5   It clgoged my pores and cused breakouts near m...   0.100000
6   Packaginng is convenient and hygienci, rump wo...   0.100000
7   Lovely formula, but the bottle leaked in trans...   0.305556
8   The pump dispenses too much proudc each time. ...  -0.150000
9   I saw miprove hydration bt not much improvemen...  -0.088889
10  Lovely formula, but the bottle leaked in trans...   0.500000
11  Thhe pump dipenses too much product each timme...   0.600000
12  Didn't finish the jar: the smell was unpleasan...  -0.200000
13  Subtle brightening effecct but needed hi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['sentiment'] = ratings['feedback'].apply(analyze_sentiment)


# --------------------------
# Analyse des fautes d'orthographe
# --------------------------

In [None]:
def count_misspellings(text):
    blob = TextBlob(text)
    corrected = blob.correct()
    mistakes = sum(1 for w1, w2 in zip(blob.words, corrected.words) if w1.lower() != w2.lower())
    return mistakes / max(1, len(blob.words))  

ratings['misspell_rate'] = ratings['feedback'].apply(count_misspellings)

print("\nüî§ Taux de fautes d'orthographe par review :")
print(ratings[['feedback', 'misspell_rate']])



üî§ Taux de fautes d'orthographe par review :
                                             feedback  misspell_rate
0   Noticed peeling when I used it with other acti...       0.058824
1   TThe pump drspensses too much prduct each time...       0.166667
2   Did ont suit my oily-ocmbination sin; mdaee it...       0.285714
3   It clogged my pors adn causeed breakouts near ...       0.350000
4   Packoging is convenient atd hygienic, pump wor...       0.142857
5   It clgoged my pores and cused breakouts near m...       0.312500
6   Packaginng is convenient and hygienci, rump wo...       0.250000
7   Lovely formula, but the bottle leaked in trans...       0.225806
8   The pump dispenses too much proudc each time. ...       0.166667
9   I saw miprove hydration bt not much improvemen...       0.290323
10  Lovely formula, but the bottle leaked in trans...       0.142857
11  Thhe pump dipenses too much product each timme...       0.500000
12  Didn't finish the jar: the smell was unpleasan...  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['misspell_rate'] = ratings['feedback'].apply(count_misspellings)


# --------------------------
# Embedding TF-IDF
# --------------------------

In [None]:

tfidf_vec = TfidfVectorizer()
tfidf_features = tfidf_vec.fit_transform(ratings['feedback']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=tfidf_vec.get_feature_names_out())

print("\nüî† √âtape 4 : R√©sultats TF-IDF pour chaque avis :")
print(tfidf_df.round(3))  


üî† √âtape 4 : R√©sultats TF-IDF pour chaque avis :
      30s   aadn  aalso  active    adn     ae  after  again    all   also  \
0   0.000  0.000  0.000   0.285  0.000  0.000  0.000  0.000  0.000  0.000   
1   0.000  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000   
2   0.000  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000   
3   0.000  0.000  0.000   0.000  0.258  0.000  0.000  0.000  0.000  0.000   
4   0.000  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000   
5   0.000  0.000  0.292   0.000  0.000  0.000  0.000  0.000  0.000  0.000   
6   0.000  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.237  0.209   
7   0.000  0.211  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000   
8   0.000  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000   
9   0.000  0.000  0.000   0.000  0.000  0.000  0.180  0.000  0.000  0.000   
10  0.000  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000   
11  0.000  0.000  0.00

# --------------------------
# Embedding BERT
# --------------------------

In [None]:

bert_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_features = bert_model.encode(ratings['feedback'])

bert_df = pd.DataFrame(bert_features)

print("\nüß¨ √âtape 5 : Vecteurs BERT pour chaque avis (affichage des 5 premi√®res dimensions) :")
print(bert_df.iloc[:, :5].round(4))  


üß¨ √âtape 5 : Vecteurs BERT pour chaque avis (affichage des 5 premi√®res dimensions) :
         0       1       2       3       4
0  -0.0682 -0.0653  0.0183  0.1019  0.0423
1  -0.0241 -0.1083  0.0778  0.0042 -0.0359
2  -0.0991  0.0554  0.0611  0.0466  0.0413
3  -0.0599  0.0106  0.0471  0.0042  0.0291
4  -0.0526  0.0482 -0.0193  0.0115  0.0203
5  -0.0606  0.0159  0.0129  0.0773  0.0191
6   0.0546  0.0185  0.0079  0.0495  0.0073
7  -0.0924 -0.0189 -0.0138  0.0153  0.0796
8  -0.0157 -0.0477  0.0159  0.0081  0.0087
9  -0.0377 -0.0130  0.0896  0.0735  0.0367
10 -0.0697  0.0235 -0.0104  0.0593  0.0576
11 -0.0832  0.0104  0.0204 -0.0600 -0.0412
12 -0.0217  0.0435  0.0396  0.0109  0.0719
13 -0.1343 -0.0147  0.1171  0.0654  0.0219
14 -0.0585 -0.0276  0.1044 -0.0303  0.0185
15 -0.1177  0.0202 -0.0058  0.0352  0.0120
16  0.0007  0.0889  0.0371  0.0100  0.0176
17 -0.0918 -0.0185  0.0698  0.0359  0.0112
18 -0.1180  0.0033  0.0505  0.0058 -0.0205
19 -0.0048 -0.0373  0.0296  0.0587 -0.0265


# --------------------------
# Fusion des features
# --------------------------

In [7]:
all_features = np.hstack((
    tfidf_features,
    bert_features,
    ratings[['rating', 'sentiment', 'misspell_rate']].values
))

print("\nüß© √âtape 6 : Fusion des features pour d√©tection d'anomalies.")


üß© √âtape 6 : Fusion des features pour d√©tection d'anomalies.


# --------------------------
# D√©tection d'anomalies
# --------------------------

In [None]:
iso = IsolationForest(contamination=0.05, random_state=42)
ratings['anomaly'] = iso.fit_predict(all_features)

ratings['anomaly_score'] = iso.decision_function(all_features)

print("\nüö® Anomalies d√©tect√©es (avec TF-IDF + BERT + fautes d'orthographe) :")
print(ratings[['user_id', 'item_id', 'rating', 'feedback', 'anomaly', 'anomaly_score']])

ratings = ratings[(ratings['misspell_rate'] < 0.4) & (ratings['sentiment'] > -0.5)]

ratings = ratings.sort_values('anomaly_score')

print("\n‚úÖ Donn√©es apr√®s suppression des anomalies :")
print(ratings)



üö® Anomalies d√©tect√©es (avec TF-IDF + BERT + fautes d'orthographe) :
    user_id  item_id  rating  \
0         1     1001       4   
1         2     1002       3   
2         3     1003       5   
3         4     1004       4   
4         5     1005       4   
5         6     1006       3   
6         7     1007       5   
7         8     1008       5   
8         9     1009       5   
9        10     1010       2   
10       11     1011       5   
11       12     1012       3   
12       13     1013       4   
13       14     1014       5   
14       15     1015       5   
15       16     1016       3   
16       17     1017       3   
17       18     1018       5   
18       19     1019       3   
19       20     1020       5   

                                             feedback  anomaly  anomaly_score  
0   Noticed peeling when I used it with other acti...        1       0.017356  
1   TThe pump drspensses too much prduct each time...        1       0.031333  
2   Did ont s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['anomaly'] = iso.fit_predict(all_features)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['anomaly_score'] = iso.decision_function(all_features)


#  Correction automatique des fautes d‚Äôorthographe

In [None]:
def correct_spelling(text):
    try:
        return str(TextBlob(text).correct())
    except:
        return text

ratings['feedback_corrected'] = ratings['feedback'].apply(correct_spelling)
ratings[['feedback', 'feedback_corrected']].head()

Unnamed: 0,feedback,feedback_corrected
12,Didn't finish the jar: the smell was unpleasan...,Didn't finish the jar: the smell was unpleasan...
13,Subtle brightening effecct but needed higher c...,Subtle brightening effect but needed higher co...
15,Works well under makeup ind doesn't pill. I've...,Works well under make ind doesn't pill. I've b...
6,"Packaginng is convenient and hygienci, rump wo...","Packaginng is convenient and hygienic, rum wor..."
7,"Lovely formula, but the bottle leaked in trans...","Lovely formula, but the bottle leaned in trans..."


# --------------------------
# Filtrage bas√© sur le contenu
# --------------------------

In [10]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(ratings['product_name'])
content_sim = cosine_similarity(tfidf_matrix)
print("\nüìö Similarit√© entre les items (filtrage bas√© sur le contenu) :")
print(pd.DataFrame(content_sim, index=ratings['category'], columns=ratings['category']))


üìö Similarit√© entre les items (filtrage bas√© sur le contenu) :
category        Serum     Serum     Serum     Serum  Treatment  Mask  \
category                                                               
Serum        1.000000  0.212381  1.000000  1.000000        0.0   0.0   
Serum        0.212381  1.000000  0.212381  0.212381        0.0   0.0   
Serum        1.000000  0.212381  1.000000  1.000000        0.0   0.0   
Serum        1.000000  0.212381  1.000000  1.000000        0.0   0.0   
Treatment    0.000000  0.000000  0.000000  0.000000        1.0   0.0   
Mask         0.000000  0.000000  0.000000  0.000000        0.0   1.0   
Treatment    0.000000  0.000000  0.000000  0.000000        1.0   0.0   
Treatment    0.000000  0.000000  0.000000  0.000000        1.0   0.0   
Toner        0.000000  0.000000  0.000000  0.000000        0.0   0.0   
Treatment    0.000000  0.000000  0.000000  0.000000        1.0   0.0   
Eye care     0.000000  0.000000  0.000000  0.000000        0.0   0.0

# --------------------------
# Filtrage collaboratif bas√© sur les utilisateurs
# --------------------------

In [None]:
pivot = ratings.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(pivot)
print("\nüë• Table des notes (pivot user-item) :")
print(pivot)
user_id_test = 1
user_vec = pivot.loc[user_id_test].values.reshape(1, -1)
distances, indices = knn.kneighbors(user_vec, n_neighbors=3)  
similar_users = pivot.index[indices.flatten()].tolist()
similar_users = [u for u in similar_users if u != user_id_test]  

print(f"\nüîó Utilisateurs similaires √† l'utilisateur {user_id_test} : {similar_users}")



üë• Table des notes (pivot user-item) :
item_id  1001  1002  1003  1004  1005  1006  1007  1008  1009  1010  1011  \
user_id                                                                     
1         4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
2         0.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
3         0.0   0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
4         0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
5         0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0   0.0   0.0   
6         0.0   0.0   0.0   0.0   0.0   3.0   0.0   0.0   0.0   0.0   0.0   
7         0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0   0.0   0.0   0.0   
8         0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0   0.0   0.0   
9         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0   0.0   
10        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0   0.0   
11        0.0   0.0   0.0   0.0   

# --------------------------
# Approche Hybride
# --------------------------

In [None]:
def hybrid_recommend(user_id, top_n=3):
    rated_items = ratings[ratings['user_id'] == user_id]['item_id'].tolist()
    
    sim_scores = np.mean([content_sim[i - 1] for i in rated_items], axis=0)

    all_item_ids = ratings['item_id'].tolist()
    collab_series = pivot.loc[similar_users].mean()

    collab_scores = pd.Series(index=all_item_ids, data=0.0)
    for item_id in collab_series.index:
        if item_id in collab_scores.index:
            collab_scores[item_id] = collab_series[item_id]

    hybrid_scores = []
    for i, item_id in enumerate(all_item_ids):
        score = 0.5 * sim_scores[i] + 0.5 * collab_scores[item_id]
        hybrid_scores.append((item_id, score))

    recommendations = [(item_id, score) for item_id, score in hybrid_scores if item_id not in rated_items]
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations[:top_n]


# --------------------------
# Test
# --------------------------

In [13]:
print(f"\nüí° Recommandations hybrides pour l'utilisateur {user_id_test} :")
for rec in hybrid_recommend(user_id_test):
    title = ratings[ratings['item_id'] == rec[0]]['category'].values[0]
    print(f"- {category} (score: {rec[1]:.2f})")


üí° Recommandations hybrides pour l'utilisateur 1 :


IndexError: index 1000 is out of bounds for axis 0 with size 19

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import joblib

class RecoSystem:
    def __init__(self, data):
        self.data = data
        
        self.items = data[['item_id', 'product_name', 'category', 'feedback']].drop_duplicates(subset='item_id').reset_index(drop=True)
        
        self.ratings = data[['user_id', 'item_id', 'rating']].dropna()
        
        self.tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.tfidf.fit_transform(self.items['feedback'].fillna(''))
        self.content_sim = cosine_similarity(self.tfidf_matrix)
        
        self.pivot = self.ratings.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)
        
        self.knn = NearestNeighbors(metric='cosine', algorithm='brute')
        self.knn.fit(self.pivot)

    def hybrid_recommend(self, user_id, top_n=5):
        if user_id not in self.pivot.index:
            return []
        
        rated_items = self.ratings[self.ratings['user_id'] == user_id]['item_id'].tolist()
        sim_scores = np.mean([self.content_sim[i - 1] for i in rated_items if i - 1 < len(self.content_sim)], axis=0)
        
        distances, indices = self.knn.kneighbors(self.pivot.loc[[user_id]], n_neighbors=3)
        similar_users = [u for u in self.pivot.index[indices.flatten()].tolist() if u != user_id]
        
        collab_series = self.pivot.loc[similar_users].mean()
        all_item_ids = self.items['item_id'].tolist()
        collab_scores = pd.Series(index=all_item_ids, data=0.0)
        
        for item_id in collab_series.index:
            if item_id in collab_scores.index:
                collab_scores[item_id] = collab_series[item_id]
        
        hybrid_scores = []
        for i, item_id in enumerate(all_item_ids):
            score = 0.5 * sim_scores[i] + 0.5 * collab_scores[item_id]
            hybrid_scores.append((item_id, score))
        
        recommendations = [(item_id, score) for item_id, score in hybrid_scores if item_id not in rated_items]
        recommendations.sort(key=lambda x: x[1], reverse=True)
        
        return [
            {
                "item_id": item_id,
                "product_name": self.items[self.items['item_id'] == item_id]['product_name'].values[0],
                "category": self.items[self.items['item_id'] == item_id]['category'].values[0],
                "score": round(score, 2)
            } for item_id, score in recommendations[:top_n]
        ]




sephora_skincare_reviews = pd.read_csv("sephora_skincare_reviews.csv")


reco = RecoSystem(sephora_skincare_reviews)

joblib.dump(reco, 'reco_model.pkl')
print("‚úÖ Mod√®le sauvegard√© dans 'reco_model.pkl'")

user_id_test = sephora_skincare_reviews['user_id'].iloc[0]
print(f"\nRecommandations pour l'utilisateur {user_id_test} :")
print(reco.hybrid_recommend(user_id_test, top_n=5))


‚úÖ Mod√®le sauvegard√© dans 'reco_model.pkl'

Recommandations pour l'utilisateur 1 :
[{'item_id': 1003, 'product_name': 'Vitamin C Brightening Serum', 'category': 'Serum', 'score': 1.25}, {'item_id': 1002, 'product_name': 'Barrier Repair Cream', 'category': 'Moisturizer', 'score': 0.75}, {'item_id': 2001, 'product_name': 'Gentle Foaming Cleanser', 'category': 'Cleanser', 'score': 0.5}, {'item_id': 1327, 'product_name': 'Retinol Night Serum', 'category': 'Serum', 'score': 0.16}, {'item_id': 1828, 'product_name': 'Pore Refining Toner', 'category': 'Toner', 'score': 0.15}]


In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

data = pd.read_csv("sephora_skincare_reviews.csv")  
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['feedback'])
content_sim = cosine_similarity(tfidf_matrix)


pivot = data.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(pivot)

model_data = {
    "items": data,
    "tfidf": tfidf,
    "tfidf_matrix": tfidf_matrix,
    "content_sim": content_sim,
    "pivot": pivot,
    "knn": knn
}

joblib.dump(model_data, "recommandation_model_AI.pkl")
print("‚úÖ Mod√®le sauvegard√©")

‚úÖ Mod√®le sauvegard√©
