In [1]:
!pip install --quiet transformers
!pip install --quiet datasets
!pip install --quiet transformers[torch]
!pip install --quiet accelerate -U
!pip install --quiet matplotlib
!pip install --quiet seaborn
!pip install --quiet -U scikit-learn
!pip install --quiet nltk
!pip install --quiet wandb
!pip install pyarrow
!pip install --upgrade pyarrow



In [2]:
from collections import defaultdict
from tqdm import tqdm
import requests
import torch

from transformers import pipeline
from transformers import GPT2Tokenizer, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
from transformers import GPT2LMHeadModel, AutoConfig
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

In [3]:
import os
import s3fs
import json

# Bucket that holds the project files on the S3-compatible object store.
BUCKET_OUT = "mamazo"

# The endpoint host is read from the AWS_S3_ENDPOINT environment variable
# (a KeyError is raised here if it is not set).
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

# Sanity check: list what the bucket contains (shown as the cell output).
fs.ls(BUCKET_OUT)

['mamazo/StatApp']

In [4]:
#with fs.open(LOGIN+"/StatApp/tokenized_128_top10.json", 'r') as file:
#   tokenized_test = Dataset.from_dict({"input_ids": json.load(file)})

# Entropy ?

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#import nltk
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word lists; the functions below read the English list
# through nltk.corpus.stopwords.words('english') when building TF-IDF vectorizers.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:

# Location of the JSON corpus inside the bucket: <bucket>/StatApp/<file>.
BUCKET_OUT = "mamazo"
file = 'top_500_sur_10000.json'
CORPUS_S3 = f"{BUCKET_OUT}/StatApp/{file}"

# The file is parsed as JSON and handed to Dataset.from_list to build `data`,
# the dataset used by the rest of the notebook.
with fs.open(CORPUS_S3, 'r') as f:
    data = Dataset.from_list(json.load(f))


In [7]:
def spectral_entropy(matrix, tol = 10**(-2)):
    """Exponential of the Shannon entropy of a matrix's normalized spectrum.

    The real parts of the eigenvalues of ``matrix`` are taken, every value
    below ``tol`` (negative values included) is set to 0, and the remainder is
    rescaled to sum to 1, giving weights ``p``.

    Parameters
    ----------
    matrix : array_like
        Square matrix (the callers in this notebook pass cosine-similarity
        matrices).
    tol : float, optional
        Eigenvalues strictly below this threshold are discarded.

    Returns
    -------
    score : float
        ``exp(-sum(p_i * log(p_i)))`` over the non-zero weights; callers
        print it as the "vendi score".
    contributions : numpy.ndarray
        ``p_i ** (-p_i)`` for every eigenvalue, in the order in which
        ``np.linalg.eig`` returned the eigenvalues. The product of these
        terms equals ``score``; discarded eigenvalues contribute 1.

    Raises
    ------
    ValueError
        If no eigenvalue reaches ``tol``: the spectrum cannot be normalized
        (previously this silently produced NaN through a 0/0 division).
    """
    # np.linalg.eig may return complex values with negligible imaginary parts.
    eigenvalues = np.real(np.linalg.eig(matrix)[0])

    # Drop numerical noise (and negative eigenvalues) in a single vectorized step.
    kept = np.where(eigenvalues < tol, 0.0, eigenvalues)

    total = kept.sum()
    if total == 0:
        raise ValueError(
            "spectral_entropy: no eigenvalue is >= tol, the spectrum cannot be normalized"
        )
    weights = kept / total

    # 0 * log(0) is taken to be 0, so only strictly positive weights enter the sum.
    positive = weights[weights > 0]
    entr = np.sum(positive * np.log(positive))

    # Per-eigenvalue factors; np.power(0, -0) == 1, so zero weights are neutral.
    contributions = np.power(weights, -weights)

    return np.exp(-entr), contributions

def select_n_highest(array, n):
    """Pick the ``n`` largest entries of ``array``, kept in positional order.

    Parameters
    ----------
    array : numpy.ndarray
        One-dimensional numeric array.
    n : int
        Number of entries to keep; if ``array`` is shorter, all entries are kept.

    Returns
    -------
    values : numpy.ndarray
        The selected entries, ordered by their position in ``array``.
    positions : numpy.ndarray
        The (ascending) positions of those entries in ``array``.
    """
    # Sorting the negated values ranks the entries from largest to smallest.
    descending = np.argsort(-array)
    # Keep the top-n positions, then put them back in ascending order.
    positions = np.sort(descending[:n])
    return array[positions], positions


In [8]:
# NOTE: greedy selection is costly: every step scores each remaining document
# with an eigendecomposition, so keep size_selected small on large corpora.

def maximize_entropy(corpus, threshold_entr = np.inf, max_features=5000, seed_value=345, size_selected=10, n_components = None, affiche_index = False):
    """Greedily build a sub-corpus whose similarity matrix has maximal spectral entropy.

    Two seed documents are chosen (one at random, plus its neighbour), then
    documents are added one at a time: at each step the document whose
    addition yields the highest ``spectral_entropy`` score is kept.

    The TF-IDF / LSA embedding is fitted once on the whole corpus and the
    candidate sub-corpora are scored on sub-blocks of the resulting cosine
    similarity matrix, instead of refitting the pipeline for every candidate.

    Parameters
    ----------
    corpus : mapping with a ``"text"`` entry
        ``corpus["text"]`` must be a sequence of documents (strings).
    threshold_entr : float, optional
        Selection stops as soon as the current score reaches this value.
    max_features : int, optional
        Vocabulary size limit passed to ``TfidfVectorizer``.
    seed_value : int, optional
        Seed for the choice of the first document and for ``TruncatedSVD``.
    size_selected : int, optional
        Target number of documents (capped at the corpus size; the two seed
        documents are always returned, even if this is smaller than 2).
    n_components : int or None, optional
        LSA dimension; defaults to ``round(sqrt(size_selected))`` (at least 1).
    affiche_index : bool, optional
        If true, print the indices of the selected documents.

    Returns
    -------
    sub_corpus : Dataset
        Single-column (``"text"``) dataset with the selected documents, in
        selection order.
    entr_tot : float
        Spectral-entropy score of the returned sub-corpus.

    Raises
    ------
    ValueError
        If the corpus holds fewer than two documents.
    """
    texts = list(corpus["text"])
    n = len(texts)
    if n < 2:
        raise ValueError("maximize_entropy needs a corpus with at least two documents")

    size_selected = min(size_selected, n)
    if n_components is None:
        n_components = max(1, round(size_selected ** 0.5))

    # Local generator: reproducible without reseeding NumPy's global state.
    # Drawing an index (rather than a text) stays correct with duplicate texts.
    rng = np.random.RandomState(seed_value)
    first = int(rng.choice(n))

    # The second seed document is the next one, or the previous one at the end.
    second = first - 1 if first == n - 1 else first + 1
    selected_indexes = [first, second]

    # Embed every document once; all candidate scores reuse this matrix.
    stop_words = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(texts)
    lsa_model = TruncatedSVD(n_components=n_components, n_iter=10, random_state=seed_value)
    lsa_matrix = lsa_model.fit_transform(tfidf_matrix)
    similarity_matrix = cosine_similarity(lsa_matrix, lsa_matrix)

    def _entropy_of(indexes):
        # Score of the sub-corpus made of the documents at `indexes`.
        block = similarity_matrix[np.ix_(indexes, indexes)]
        return spectral_entropy(block)[0]

    entr_tot = _entropy_of(selected_indexes)

    # Stop once enough documents are selected or the score is already high enough.
    while len(selected_indexes) < size_selected and entr_tot < threshold_entr:
        already_selected = set(selected_indexes)
        best_entr, best_index = -np.inf, None

        # Every unused document (including the last one) is a candidate.
        for j in range(n):
            if j in already_selected:
                continue
            # Build a fresh trial list so rejected candidates leave no trace.
            candidate_entr = _entropy_of(selected_indexes + [j])
            if candidate_entr > best_entr:
                best_entr, best_index = candidate_entr, j

        if best_index is None:
            # No candidate produced a comparable score (e.g. NaN everywhere).
            break

        selected_indexes.append(best_index)
        entr_tot = best_entr

    if affiche_index:
        print(selected_indexes)

    sub_corpus = Dataset.from_dict({"text": [texts[i] for i in selected_indexes]})
    return sub_corpus, entr_tot


In [9]:
def select_least_similar(corpus, max_features=5000, seed_value=345, size_selected=100, tol = 10**(-2), n_components=None, affiche_index=False, affiche_entrop = False, max_df=0.5, min_df=3, dataset=None):
    """Select ``size_selected`` rows using the spectral contributions of the corpus.

    The texts are embedded with TF-IDF followed by LSA (``TruncatedSVD``), the
    cosine-similarity matrix of the embeddings is passed to
    ``spectral_entropy``, and the positions of the ``size_selected`` largest
    per-eigenvalue contributions are used as row indices.

    NOTE(review): the contributions returned by ``spectral_entropy`` are
    indexed by eigenvalue, yet their positions are used here as document
    indices -- confirm that this mapping is intended.

    Parameters
    ----------
    corpus : iterable of str
        Documents to embed.
    max_features, max_df, min_df
        Passed to ``TfidfVectorizer``.
    seed_value : int, optional
        ``random_state`` of ``TruncatedSVD``.
    size_selected : int, optional
        Number of rows to return.
    tol : float, optional
        Eigenvalue threshold forwarded to ``spectral_entropy``.
    n_components : int or None, optional
        LSA dimension; defaults to ``round(sqrt(size_selected))``.
    affiche_index : bool, optional
        If true, print the selected indices.
    affiche_entrop : bool, optional
        If true, print the product of the selected contributions.
    dataset : object with a ``select(indices)`` method, optional
        Where the rows are taken from. Defaults to the notebook-level
        ``data`` dataset, whose rows must then line up with ``corpus``.

    Returns
    -------
    The result of ``select(selected_indices)`` on ``dataset`` (or ``data``).
    """
    if n_components is None:
        n_components = round((size_selected) ** 0.5)

    stop_words = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features, max_df=max_df, min_df=min_df)

    tfidf_matrix = vectorizer.fit_transform(corpus)
    lsa_model = TruncatedSVD(n_components=n_components, n_iter=10, random_state=seed_value)
    lsa_matrix = lsa_model.fit_transform(tfidf_matrix)
    similarity_matrix = cosine_similarity(lsa_matrix, lsa_matrix)

    # `tol` is now forwarded; it used to be accepted but silently ignored.
    _, vendi_val = spectral_entropy(similarity_matrix, tol)

    selected_vendi, selected_indices = select_n_highest(vendi_val, size_selected)

    if affiche_index:
        print(selected_indices)

    if affiche_entrop:
        print(np.prod(selected_vendi))

    # Fall back to the notebook-level dataset only when none is supplied.
    source = data if dataset is None else dataset
    return source.select(selected_indices)


In [10]:
#X = Dataset.from_dict(data[:10])

#print(select_least_similar(X["text"], size_selected= 9)["text"])

In [11]:
def select_least_similar2(corpus, num_batch, num_per_batch, max_features=5000, seed_value=345, tol=10**(-2), n_components=None, affiche_index=False, affiche_entrop=False, dataset=None):
    """Batched variant of the selection: keep ``num_per_batch`` rows per batch.

    The corpus is cut into ``num_batch`` consecutive batches of
    ``len(corpus) // num_batch`` documents (trailing documents that do not
    fill a batch are ignored). Each batch is embedded with TF-IDF + LSA, its
    cosine-similarity matrix is passed to ``spectral_entropy``, and the
    positions of the ``num_per_batch`` largest contributions are kept.

    NOTE(review): the contributions returned by ``spectral_entropy`` are
    indexed by eigenvalue, yet their positions are used here as document
    indices -- confirm that this mapping is intended.

    Parameters
    ----------
    corpus : sequence of str
        Documents; must support ``len``, slicing and integer indexing.
    num_batch : int
        Number of consecutive batches.
    num_per_batch : int
        Number of rows kept in every batch.
    max_features : int, optional
        Vocabulary size limit passed to ``TfidfVectorizer``.
    seed_value : int, optional
        ``random_state`` of ``TruncatedSVD``.
    tol : float, optional
        Eigenvalue threshold forwarded to ``spectral_entropy``.
    n_components : int or None, optional
        LSA dimension; defaults to ``round(sqrt(num_batch * num_per_batch))``.
    affiche_index : bool, optional
        If true, print the selected indices.
    affiche_entrop : bool, optional
        If true, print the score of the final selection.
    dataset : object with a ``select(indices)`` method, optional
        Where the rows are taken from. Defaults to the notebook-level
        ``data`` dataset, whose rows must then line up with ``corpus``.

    Returns
    -------
    The result of ``select(selected_indices)`` on ``dataset`` (or ``data``).

    Raises
    ------
    ValueError
        If there are more batches than documents (empty batches).
    """
    n = len(corpus)
    batch_size = n // num_batch
    if batch_size == 0:
        raise ValueError(
            f"select_least_similar2: cannot split {n} documents into {num_batch} non-empty batches"
        )

    if n_components is None:
        n_components = round((num_batch * num_per_batch) ** 0.5)

    stop_words = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)

    def _similarity(texts):
        # TF-IDF -> LSA -> cosine similarity, refitted on the given texts.
        tfidf_matrix = vectorizer.fit_transform(texts)
        lsa_model = TruncatedSVD(n_components=n_components, n_iter=10, random_state=seed_value)
        lsa_matrix = lsa_model.fit_transform(tfidf_matrix)
        return cosine_similarity(lsa_matrix, lsa_matrix)

    index_chunks = []
    for k in tqdm(range(num_batch)):
        start = k * batch_size
        sub_corpus = corpus[start:start + batch_size]

        _, vendi_val = spectral_entropy(_similarity(sub_corpus), tol)
        _, batch_indices = select_n_highest(vendi_val, num_per_batch)

        # Shift batch-local positions to positions in the full corpus.
        index_chunks.append(batch_indices + start)

    if index_chunks:
        selected_indices = np.concatenate(index_chunks).astype(int)
    else:
        selected_indices = np.array([], dtype=int)

    if affiche_index:
        print(selected_indices)

    if affiche_entrop:
        # Score the final selection as a corpus of its own.
        sub_corpus = [corpus[int(i)] for i in selected_indices]
        vendi, _ = spectral_entropy(_similarity(sub_corpus), tol)
        print("vendi score: ", vendi)

    # Fall back to the notebook-level dataset only when none is supplied.
    source = data if dataset is None else dataset
    return source.select(selected_indices)

In [12]:
# Rebuild a standalone Dataset from the slice data[:10000].
X = Dataset.from_dict(data[:10000])

# Cut the texts into 100 consecutive batches and keep 50 rows from each one;
# the returned Dataset is displayed as the cell output.
select_least_similar2(X["text"], num_batch = 100, num_per_batch = 50)

100%|██████████| 100/100 [01:28<00:00,  1.14it/s]


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 5000
})

In [13]:
def select_least_similar3(corpus, size_selected, num_subcorpus = 10, max_features = 5000, seed_value = 345, tol = 10**(-2), n_components = None, affiche_entrop = False, dataset = None):
    """Random-search selection: keep the best-scoring of several random subsets.

    ``num_subcorpus`` subsets of ``size_selected`` distinct documents are
    drawn at random. Each one is embedded with TF-IDF + LSA and scored with
    ``spectral_entropy`` on its cosine-similarity matrix; the subset with the
    highest score is returned.

    Parameters
    ----------
    corpus : sequence of str
        Documents; must support ``len`` and integer indexing.
    size_selected : int
        Number of documents per subset (must not exceed ``len(corpus)``).
    num_subcorpus : int, optional
        Number of random subsets to try.
    max_features : int, optional
        Vocabulary size limit passed to ``TfidfVectorizer``.
    seed_value : int, optional
        Base seed: subset ``i`` is drawn with seed ``seed_value + i``;
        ``TruncatedSVD`` always uses ``seed_value``.
    tol : float, optional
        Eigenvalue threshold forwarded to ``spectral_entropy``.
    n_components : int or None, optional
        LSA dimension; defaults to ``round(sqrt(size_selected))``.
    affiche_entrop : bool, optional
        If true, print the best score found.
    dataset : object with a ``select(indices)`` method, optional
        Where the rows are taken from. Defaults to the notebook-level
        ``data`` dataset, whose rows must then line up with ``corpus``.

    Returns
    -------
    The result of ``select(selected_indices)`` on ``dataset`` (or ``data``);
    the selection is empty if no subset scored above 0.
    """
    n = len(corpus)

    if n_components is None:
        n_components = round((size_selected) ** 0.5)

    stop_words = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)

    max_vendi = 0
    selected_indices = []

    for i in tqdm(range(num_subcorpus)):
        # A local generator gives the same draws as seeding NumPy's global
        # generator with this value, without touching global state.
        rng = np.random.RandomState(seed_value + i)
        random_indices = rng.choice(n, size_selected, replace=False)
        sub_corpus = [corpus[idx] for idx in random_indices]

        tfidf_matrix = vectorizer.fit_transform(sub_corpus)
        lsa_model = TruncatedSVD(n_components=n_components, n_iter=10, random_state=seed_value)
        lsa_matrix = lsa_model.fit_transform(tfidf_matrix)
        similarity_matrix = cosine_similarity(lsa_matrix, lsa_matrix)

        vendi, _ = spectral_entropy(similarity_matrix, tol)

        # Remember the best-scoring subset seen so far.
        if vendi > max_vendi:
            selected_indices = random_indices
            max_vendi = vendi

    if affiche_entrop:
        print(max_vendi)

    # Fall back to the notebook-level dataset only when none is supplied.
    source = data if dataset is None else dataset
    return source.select(selected_indices)


In [14]:
# Rebuild a standalone Dataset from the slice data[:1000].
X = Dataset.from_dict(data[:1000])

# Try 17 random subsets of 500 documents and keep the best-scoring one;
# affiche_entrop = True prints that best score before the Dataset is displayed.
select_least_similar3(X["text"], size_selected = 500, num_subcorpus =  17, tol = 10**(-2), affiche_entrop = True )

100%|██████████| 17/17 [02:50<00:00, 10.03s/it]

15.474200044772722





Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 500
})