In [None]:
from pymongo  import MongoClient
from wordcloud import WordCloud
import pandas as pd
import re
import collections
from pathlib import Path
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import string
import scipy.stats as st
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn import decomposition, naive_bayes, preprocessing, model_selection, metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm.notebook import tqdm


### Connexion à la bdd

In [None]:
# Connect to the local MongoDB server and select the "avis" collection of "PLDAC".
client = MongoClient("localhost", 27017)
db = client.get_database("PLDAC")
collection = db.get_collection("avis")

In [None]:
# Materialise every document of the collection and load them into a DataFrame.
cursor = collection.find()
df = pd.DataFrame(list(cursor))
df.head()

In [None]:
# Number of distinct authors and titles; dropna=False keeps a missing value (if any)
# counted as one distinct entry.
num_users = df["author"].nunique(dropna=False)
num_items = df["title"].nunique(dropna=False)

print(f"there are {num_users} authors and {num_items} items")

In [None]:
# Number of missing values in each column.
print(df.isna().sum())

In [None]:
# Number of rows divided by the number of possible (title, author) pairs, in percent.
# NOTE: despite its name, `sparsity` holds the fill rate (density) of the rating matrix.
n_cells = num_items * num_users
sparsity = (len(df) / n_cells) * 100
print(f"Rating matrix is only {sparsity}% full")

### Répartition des notes

Find the count/mean/std/min/max of the notes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the ratings.
df["note"].describe()

Skewness empirique, mesure d'asymétrie:

L'asymétrie d'une distribution traduit la régularité ou non avec laquelle les observations se répartissent autour de la valeur centrale

skew = 0 -> symétrique

skew < 0 -> distribution étalée à gauche (oblique à droite) -> mean < median < mode

skew > 0 -> distribution étalée à droite (oblique à gauche) -> mode < median < mean

In [None]:
# Sample skewness of the ratings.
df["note"].skew()

In [None]:
# Location and spread of the ratings, kept as notebook-level variables.
notes = df["note"]
median, mean = notes.median(), notes.mean()
variance, std = notes.var(), notes.std()

for label, value in (
    ("mediane : ", median),
    ("moyenne : ", mean),
    ("variance : ", variance),
    ("ecart-type : ", std),
):
    print(label, value)

Distribution globale des notes

In [None]:
# sns.distplot is deprecated since seaborn 0.11 and removed in 0.14. histplot with
# kde=True, stat="density" and kde_kws={"cut": 3} draws the equivalent normalised
# histogram with its density curve.
sns.histplot(df["note"], bins=10, kde=True, stat="density", kde_kws={"cut": 3});

Distribution des notes par auteurs

In [None]:
# Mean rating given by each author.
user_means = df.groupby("author")["note"].mean()
# sns.distplot is deprecated since seaborn 0.11 and removed in 0.14; histplot with
# kde=True and stat="density" is the documented replacement.
sns.histplot(user_means, bins=10, kde=True, stat="density", kde_kws={"cut": 3});

Distribution des notes par jeux


In [None]:
# Mean rating received by each title.
item_means = df.groupby("title")["note"].mean()
# sns.distplot is deprecated since seaborn 0.11 and removed in 0.14; histplot with
# kde=True and stat="density" is the documented replacement.
sns.histplot(item_means, bins=10, kde=True, stat="density", kde_kws={"cut": 3});

Build Train/Test Set

In [None]:
# Deterministic split by row position: every 5th row (0, 5, 10, ...) goes to the test
# set, all the other rows go to the training set.
row_positions = range(len(df))
test_indexes = [pos for pos in row_positions if pos % 5 == 0]
train_indexes = [pos for pos in row_positions if pos % 5 != 0]

# Copies, so that columns added later do not write into `df`.
train_df = df.iloc[train_indexes].copy()
test_df = df.iloc[test_indexes].copy()

Global Training Mean, Global User Mean, Global Item Mean

In [None]:
# Baseline statistics, computed from the training rows only.
MEAN = train_df["note"].mean()

ratings_by_author = train_df.groupby("author")["note"]
ratings_by_title = train_df.groupby("title")["note"]
USER_MEANS = ratings_by_author.mean()
ITEM_MEANS = ratings_by_title.mean()


def mean_rating_pred(user_item):
    """Baseline predictor: return the global training mean for any input.

    Args:
        user_item: row-like object describing an (author, title) pair. It is
            accepted for interface parity with the other predictors and is not read.

    Returns:
        The notebook-level ``MEAN`` value.
    """
    return MEAN

def user_mean_rating_pred(user_item):
    """Predict a rating as the author's entry in ``USER_MEANS``.

    Args:
        user_item: mapping-like row exposing an ``"author"`` key.

    Returns:
        ``USER_MEANS[author]`` when the author is present in ``USER_MEANS``,
        otherwise the notebook-level ``MEAN``.
    """
    return USER_MEANS.get(user_item["author"], default=MEAN)

def item_mean_rating_pred(user_item):
    """Predict a rating as the title's entry in ``ITEM_MEANS``.

    Args:
        user_item: mapping-like row exposing a ``"title"`` key.

    Returns:
        ``ITEM_MEANS[title]`` when the title is present in ``ITEM_MEANS``,
        otherwise the notebook-level ``MEAN``.
    """
    return ITEM_MEANS.get(user_item["title"], default=MEAN)


In [None]:
# Apply each baseline predictor row by row to the (author, title) pairs of the test set
# and store the result in its own column.
baseline_predictors = {
    "mean_prediction": mean_rating_pred,
    "muser_prediction": user_mean_rating_pred,
    "mitem_prediction": item_mean_rating_pred,
}
for column, predictor in baseline_predictors.items():
    test_df[column] = test_df[["author", "title"]].apply(predictor, axis=1)

test_df.head()

In [None]:
# Load every document of the "details" collection into a DataFrame.
details = db.get_collection("details")
detail_documents = details.find()
df_details = pd.DataFrame(list(detail_documents))
df_details.head()


### Jeux les mieux notés

In [None]:
# Order the rows from highest to lowest 'Note' (rebinding instead of sorting in place).
df_details = df_details.sort_values(by="Note", ascending=False)
df_details[["titre", "Note"]]

In [None]:
# Lay the first 100 titles out as a 10 x 10 grid: one column per block of ten ranks.
# Each column is built as a Series so that a short or empty block (fewer than 100
# titles available) is padded with NaN instead of raising a length-mismatch ValueError.
titres = list(df_details.titre)
tmp_df = pd.DataFrame({
    f'{1 + (i - 1) * 10} - {i * 10}': pd.Series(titres[(i - 1) * 10 : i * 10], dtype=object)
    for i in range(1, 11)
})
# Show all ten rows: head(5) would hide ranks 6-10 of every block of ten.
tmp_df

### Jeux avec le plus d'avis

In [None]:
# Order the rows from most to fewest reviews (rebinding instead of sorting in place).
review_count_column = 'Nombre d\'avis'
df_details = df_details.sort_values(by=review_count_column, ascending=False)
df_details[['titre', review_count_column]]

In [None]:
# Lay the first 100 titles out as a 10 x 10 grid: one column per block of ten ranks.
# Each column is built as a Series so that a short or empty block (fewer than 100
# titles available) is padded with NaN instead of raising a length-mismatch ValueError.
titres = list(df_details.titre)
tmp_df = pd.DataFrame({
    f'{1 + (i - 1) * 10} - {i * 10}': pd.Series(titres[(i - 1) * 10 : i * 10], dtype=object)
    for i in range(1, 11)
})
tmp_df

### Vocabulaire

In [None]:
# Normalise the comment column to plain strings on a copy of the frame.
# Missing comments are replaced by "" first: astype(str) alone would turn NaN/None into
# the literal strings "nan"/"None", which would then be counted as real words.
df = df.copy()
df['comment'] = df['comment'].fillna('').astype(str)

# Fit a bag-of-words vectorizer on the comments to measure the raw vocabulary size.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.comment)

print('Taille initiale du vocabulaire :', len(vectorizer.vocabulary_))

In [None]:
# Word cloud built from the vocabulary keys, each key appearing once in the input text.
vocabulary_text = ' '.join(vectorizer.vocabulary_)
wc = WordCloud(background_color="white").generate(vocabulary_text)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Vocabulaire initial')
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Word cloud of the raw comments, concatenated into a single text.
raw_text = ' '.join(df.comment)
wc = WordCloud().generate(raw_text)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Word cloud corpus')
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

### 100 mots les plus fréquents

In [None]:
# Whitespace-tokenise each comment (same tokens, same order as splitting the joined text)
# and keep the 100 most frequent tokens with their counts.
words = [token for comment in df.comment for token in comment.split()]
word_counter = Counter(words)
top_100_words = dict(word_counter.most_common(100))

wc = WordCloud().generate_from_frequencies(top_100_words)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Word cloud 100 mots les plus fréquents')
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()


In [None]:
# Lay the most frequent words out as a 10 x 10 grid: one column per block of ten ranks.
# Each column is built as a Series so that a short or empty block (fewer than 100
# words available) is padded with NaN instead of raising a length-mismatch ValueError.
top_100_words = list(top_100_words)  # keep only the words, in frequency order
tmp_df = pd.DataFrame({
    f'{1 + (i - 1) * 10} - {i * 10}': pd.Series(top_100_words[(i - 1) * 10 : i * 10], dtype=object)
    for i in range(1, 11)
})
tmp_df

### Bigramme

In [None]:
# Count every bigram of the comments.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(df.comment)

# Column sums give the corpus-wide count of each bigram; sorting the negated counts
# orders the feature indices from most to least frequent.
bigram_frequencies = np.asarray(X.sum(axis=0))[0]
bigram_frequencies_sorted = np.argsort(-bigram_frequencies)

features = vectorizer.get_feature_names_out()

top_bigram_idx = bigram_frequencies_sorted[:100]
top_100_bigrams_df = pd.DataFrame({
    'bigram': [features[k] for k in top_bigram_idx],
    'frequency': [bigram_frequencies[k] for k in top_bigram_idx],
})
top_100_bigrams_df

In [None]:
# Lay the most frequent bigrams out as a 10 x 10 grid: one column per block of ten ranks.
# Each column is built as a Series so that a short or empty block (fewer than 100
# bigrams available) is padded with NaN instead of raising a length-mismatch ValueError.
bigrams = list(top_100_bigrams_df.bigram)
tmp_df = pd.DataFrame({
    f'{1 + (i - 1) * 10} - {i * 10}': pd.Series(bigrams[(i - 1) * 10 : i * 10], dtype=object)
    for i in range(1, 11)
})
tmp_df

### Trigramme

In [None]:
# Count every trigram of the comments.
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(df.comment)

# Column sums give the corpus-wide count of each trigram; sorting the negated counts
# orders the feature indices from most to least frequent.
trigram_frequencies = np.asarray(X.sum(axis=0))[0]
trigram_frequencies_sorted = np.argsort(-trigram_frequencies)

features = vectorizer.get_feature_names_out()

top_trigram_idx = trigram_frequencies_sorted[:100]
top_100_trigrams_df = pd.DataFrame({
    'trigram': [features[k] for k in top_trigram_idx],
    'frequency': [trigram_frequencies[k] for k in top_trigram_idx],
})
top_100_trigrams_df

In [None]:
# Lay the most frequent trigrams out as a 10 x 10 grid: one column per block of ten ranks.
# Each column is built as a Series so that a short or empty block (fewer than 100
# trigrams available) is padded with NaN instead of raising a length-mismatch ValueError.
trigrams = list(top_100_trigrams_df.trigram)
tmp_df = pd.DataFrame({
    f'{1 + (i - 1) * 10} - {i * 10}': pd.Series(trigrams[(i - 1) * 10 : i * 10], dtype=object)
    for i in range(1, 11)
})
tmp_df

### stopwords

In [None]:
# stopwords.words() needs the NLTK "stopwords" corpus on disk and raises LookupError
# when it is missing, so fetch it first; the call does nothing if it is already present.
nltk.download('stopwords', quiet=True)

# NLTK's French list, extended below with extra function words and verb forms.
stops_words_french = stopwords.words('french')
others_stops_words = ["a", "as", "ai", "au", "aux", "avec", "ce", "ces", "dans", "de", "des",
                      "du", "elle", "en", "et", "eux", "il", "je", "la", "le", "leur", "lui", 
                      "ma", "mais", "me", "même", "mes", "moi", "mon", "ne", "nos", "notre", 
                      "nous", "on", "ou", "par", "pas", "pour", "qu", "que", "qui", "sa", "se", 
                      "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", 
                      "vos", "votre", "vous", "c", "d", "j", "l", "à", "m", "n", "s", "t", "y", 
                      "été", "étée", "étées", "étés", "étant", "suis", "es", "est", "sommes", "êtes", 
                      "sont", "serai", "seras", "sera", "serons", "serez", "seront", "serais", "serait", 
                      "serions", "seriez", "seraient", "étais", "était", "étions", "étiez", 
                      "étaient", "fus", "fut", "fûmes", "fûtes", "furent", "sois", "soit", 
                      "soyons", "soyez", "soient", "fusse", "fusses", "fût", "fussions", 
                      "fussiez", "fussent", "ayant", "eu", "eue", "eues", "eus", "ai", "as", 
                      "avons", "avez", "ont", "aurai", "auras", "aura", "aurons", "aurez", 
                      "auront", "aurais", "aurait", "aurions", "auriez", "auraient", "avais", 
                      "avait", "avions", "aviez", "avaient", "eut", "eûmes", "eûtes", "eurent", 
                      "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "eût", 
                      "eussions", "eussiez", "eussent",
                      # additional connectors, demonstratives and quantifiers
                      "comme", "comment", "cependant", "parce", "dont", "aussi", "cette",
                      "aujourd", "hui", "dont", "ceci", "cela", "celle", "celui", "ceux", "celles",
                      "pourquoi", "quand", "tout", "toute", "tous", "toutes"]
# Merge both lists; the set removes duplicates and sorted() gives a stable order.
stops_words_french = sorted( list( set(stops_words_french + others_stops_words) ) )
len(stops_words_french)

In [None]:
def delete_stop_word(doc, stop_words=stops_words_french):
    """Strip punctuation and stop words from a document.

    Every run of non-word characters or underscores is replaced by a single space,
    the result is split on whitespace, and tokens whose lowercase form is in
    ``stop_words`` are dropped. Kept tokens retain their original casing.

    Args:
        doc: text to clean.
        stop_words: collection of stop words, compared against lowercased tokens
            (defaults to ``stops_words_french``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A list or tuple makes every `in` test a linear scan; convert once per call so
    # each token lookup is a hash lookup. Other container types are used as given.
    if isinstance(stop_words, (list, tuple)):
        stop_words = set(stop_words)
    tokens = re.sub(r'[\W_]+', ' ', doc).split()
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

In [None]:
# Cleaned version of every comment: punctuation and stop words removed.
corpus = df["comment"].map(delete_stop_word)

In [None]:
# Word cloud of the cleaned comments, concatenated into a single text.
clean_text = ' '.join(corpus)
wc = WordCloud().generate(clean_text)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Word cloud corpus')
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Whitespace-tokenise each cleaned comment (same tokens, same order as splitting the
# joined text) and keep the 100 most frequent tokens with their counts.
words = [token for document in corpus for token in document.split()]
word_counter = Counter(words)
top_100_words = dict(word_counter.most_common(100))

wc = WordCloud().generate_from_frequencies(top_100_words)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Word cloud 100 mots les plus fréquents')
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Lay the most frequent words out as a 10 x 10 grid: one column per block of ten ranks.
# Each column is built as a Series so that a short or empty block (fewer than 100
# words available) is padded with NaN instead of raising a length-mismatch ValueError.
top_100_words = list(top_100_words)  # keep only the words, in frequency order
tmp_df = pd.DataFrame({
    f'{1 + (i - 1) * 10} - {i * 10}': pd.Series(top_100_words[(i - 1) * 10 : i * 10], dtype=object)
    for i in range(1, 11)
})
tmp_df

### odds ratio

In [None]:
# Split the cleaned comments on the notebook-level `mean` rating:
# class 1 = ratings >= mean, class 2 = ratings < mean.
corpus_notes_positives = corpus[df.note >= mean]
corpus_notes_negatives = corpus[df.note < mean]

class1_words = ' '.join(corpus_notes_positives).split()
class2_words = ' '.join(corpus_notes_negatives).split()

class1_counter = Counter(class1_words)
class2_counter = Counter(class2_words)

# Totals and denominators do not change inside the loop (looking up a missing word in a
# Counter returns 0 without inserting it), so compute them once instead of once per word.
total_words_class1 = sum(class1_counter.values())
total_words_class2 = sum(class2_counter.values())
denominator_class1 = total_words_class1 + len(class1_counter)
denominator_class2 = total_words_class2 + len(class2_counter)

# For each class-1 word: ratio of its add-one-smoothed relative frequency in class 1
# to its add-one-smoothed relative frequency in class 2.
odds_ratios = {}
for word, freq_class1 in class1_counter.items():
    freq_class2 = class2_counter[word]
    odds_ratios[word] = ((freq_class1 + 1) / denominator_class1) / ((freq_class2 + 1) / denominator_class2)

sorted_odds_ratios = sorted(odds_ratios.items(), key=lambda x: x[1], reverse=True)
top_100_odds_ratios = dict(sorted_odds_ratios[:100])

wc = WordCloud().generate_from_frequencies(top_100_odds_ratios)
plt.figure(figsize=(10, 8))
plt.title('Word cloud 100 top odds ratio')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Split the cleaned comments on a fixed threshold:
# class 1 = ratings >= 5, class 2 = ratings < 5.
corpus_notes_positives = corpus[df.note >= 5]
corpus_notes_negatives = corpus[df.note < 5]

class1_words = ' '.join(corpus_notes_positives).split()
class2_words = ' '.join(corpus_notes_negatives).split()

class1_counter = Counter(class1_words)
class2_counter = Counter(class2_words)

# Totals and denominators do not change inside the loop (looking up a missing word in a
# Counter returns 0 without inserting it), so compute them once instead of once per word.
total_words_class1 = sum(class1_counter.values())
total_words_class2 = sum(class2_counter.values())
denominator_class1 = total_words_class1 + len(class1_counter)
denominator_class2 = total_words_class2 + len(class2_counter)

# For each class-1 word: ratio of its add-one-smoothed relative frequency in class 1
# to its add-one-smoothed relative frequency in class 2.
odds_ratios = {}
for word, freq_class1 in class1_counter.items():
    freq_class2 = class2_counter[word]
    odds_ratios[word] = ((freq_class1 + 1) / denominator_class1) / ((freq_class2 + 1) / denominator_class2)

sorted_odds_ratios = sorted(odds_ratios.items(), key=lambda x: x[1], reverse=True)
top_100_odds_ratios = dict(sorted_odds_ratios[:100])

wc = WordCloud().generate_from_frequencies(top_100_odds_ratios)
plt.figure(figsize=(10, 8))
plt.title('Word cloud 100 top odds ratio')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

### 100 mots les plus fréquents par note

In [None]:
# One word cloud per integer rating from 0 to 10, built from the 100 most frequent
# words of the cleaned comments that received exactly that rating.
for i in range(0, 11):
    corpus_notes = corpus[df.note == i]
    words = " ".join(corpus_notes).split()
    word_counter = Counter(words)

    top_100_words = dict(word_counter.most_common(100))

    # WordCloud raises ValueError on an empty frequency dict, so skip any rating that
    # has no words instead of aborting the whole loop.
    if not top_100_words:
        print(f'Aucun mot pour la note {i}')
        continue

    wc = WordCloud().generate_from_frequencies(top_100_words)
    plt.figure(figsize=(10, 8))
    plt.title(f'100 mots les plus fréquents - note {i}')
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()