In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from pprint import pprint
import gensim.corpora as corpora



# Charger le dataset
df = pd.read_csv('data.csv',nrows=10000)




# Filtrer les données pour le produit spécifique et les scores 1 et 5
data = df.loc[(df['ProductId'] == 'B000G6RYNE') & (df['Score'] == 1)]

print(data.head())









      Id   ProductId          UserId               ProfileName  \
508  509  B000G6RYNE  A3I5AT1101AS3A           Nikolette Tripp   
509  510  B000G6RYNE  A22LENLDTGQIU7                R. Yamaoka   
516  517  B000G6RYNE  A38KP1POQ191WT  Judy Schinske "Veronica"   
528  529  B000G6RYNE  A1BXG0K7UD9CTD         MicTrik "mictrik"   
537  538  B000G6RYNE  A18VDAH788BOAC                      Geeb   

     HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
508                     1                       2      1  1233360000   
509                     4                       7      1  1252713600   
516                     0                       1      1  1279065600   
528                    20                      27      1  1254009600   
537                     1                       3      1  1331856000   

                                            Summary  \
508                     Maybe the worst chips ever.   
509                   Surprise 1  It's different...   
516  

In [15]:
# Prétraitement des données

data.drop(['Id', 'ProductId','UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time',], axis=1, inplace=True)
data = data.dropna()
data['Text'] = data['Text'].apply(lambda x: re.sub('<.*?>', '', x))  # Suppression des balises HTML
data['Text'] = data['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))  # Suppression des caractères spéciaux
data['Text'] = data['Text'].apply(lambda x: x.lower())  # Mise en minuscules

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

data['Text'] = data['Text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))  # Suppression des stopwords
data['Text'] = data['Text'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))  # Lemmatisation

print (data['Text'])



# Créer une liste de documents tokenisés à partir de la colonne 'Text'
documents = [text.split() for text in data['Text']]

508    perhaps worst chip ever gone mouthfor entire l...
509    kettle chip look feel taste like lay chip used...
516    nasty greasy rich blood plus lacked major flav...
528    loved chip chip would buy discovered england b...
537    originally produced england best chip ever tas...
538    opening numerous bag found none chip flavoring...
541    ive bought local supermarket enjoyed although ...
543    kettle brand chip used goodoily crunchy flavor...
544    absolutely forget confirmed reviewer chip tota...
545    chip nasty thought someone spilled drink bag c...
547    bought brand trial since tired pingosit claim ...
550    ordered kettle chip following flavvorssalt fre...
551    purchased low salt indeed low salt however man...
554    chip greasy taste burntthere grease bottom bag...
555    dont waste money kettle brand potato chip boug...
556    defintely tasty madhouse munchies family favor...
557    love sour food one cant bear strong sour taste...
558    unless really really rea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(['Id', 'ProductId','UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time',], axis=1, inplace=True)


In [16]:

print(documents[:1][0][:30])

['perhaps', 'worst', 'chip', 'ever', 'gone', 'mouthfor', 'entire', 'life', 'sour', 'cream', 'onion', 'case', 'chive', 'chip', 'favorite', 'recently', 'kettle', 'brand', 'honey', 'dijon', 'mustard', 'took', 'slot', 'found', 'sour', 'cream', 'onion', 'try', 'themas', 'soon']


In [17]:

# Créer un dictionnaire
dictionary = corpora.Dictionary(documents)


# Créer un corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [18]:
print(corpus[:1][0][:30])



[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 8), (11, 1), (12, 1), (13, 1), (14, 4), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 3)]


In [19]:
from gensim.models import LdaMulticore, TfidfModel,LdaModel
from gensim.models import CoherenceModel

# Création du modèle TF-IDF
tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

# Paramètres à tester
num_topics_list = [5, 10, 15]  # Liste des nombres de topics à tester
passes_list = [1000, 1500, 2000]  # Liste des nombres de passes à tester

best_coherence_score = -1
best_lda_model = None
best_num_topics = 0
best_passes = 0

for num_topics in num_topics_list:
    for passes in passes_list:
        # Entraînement du modèle LDA
        lda_model = LdaModel(corpus=corpus_tfidf,
                             id2word=dictionary,
                             num_topics=num_topics,
                             passes=passes)
        
        # Calcul de la cohérence pour évaluer le modèle
        coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        
        # Comparaison avec le meilleur score obtenu jusqu'à présent
        if coherence_score > best_coherence_score:
            best_coherence_score = coherence_score
            best_lda_model = lda_model
            best_num_topics = num_topics
            best_passes = passes

# Affichage des meilleurs hyperparamètres et du meilleur modèle
print("Meilleurs hyperparamètres :")
print("Numéro de topics :", best_num_topics)
print("Nombre de passes :", best_passes)
print("Meilleur score de cohérence :", best_coherence_score)

# Affichage des topics du meilleur modèle
pprint(best_lda_model.print_topics())



Meilleurs hyperparamètres :
Numéro de topics : 15
Nombre de passes : 1500
Meilleur score de cohérence : 0.43305609484603763
[(0,
  '0.007*"unedible" + 0.007*"thai" + 0.006*"spicy" + 0.006*"item" + '
  '0.005*"product" + 0.005*"know" + 0.005*"quality" + 0.005*"get" + '
  '0.005*"stale" + 0.005*"probably"'),
 (1,
  '0.008*"fried" + 0.008*"hole" + 0.007*"rancid" + 0.007*"first" + '
  '0.007*"screwed" + 0.007*"new" + 0.006*"cream" + 0.006*"sour" + '
  '0.006*"onion" + 0.005*"bag"'),
 (2,
  '0.002*"munchies" + 0.002*"tasty" + 0.002*"darkburntmore" + '
  '0.002*"defintely" + 0.002*"family" + 0.002*"greasyoily" + 0.002*"light" + '
  '0.002*"madhouse" + 0.002*"broken" + 0.002*"oh"'),
 (3,
  '0.002*"munchies" + 0.002*"tasty" + 0.002*"darkburntmore" + '
  '0.002*"defintely" + 0.002*"family" + 0.002*"greasyoily" + 0.002*"light" + '
  '0.002*"madhouse" + 0.002*"broken" + 0.002*"oh"'),
 (4,
  '0.010*"msg" + 0.006*"instead" + 0.006*"label" + 0.005*"ingredient" + '
  '0.005*"premium" + 0.005*"thai" +

In [20]:
# Choisissez un document à analyser
document_index = 0
document = documents[document_index]

# Convertissez le document en une représentation vectorielle
vector = dictionary.doc2bow(document)

# Obtenez la distribution de probabilité des topics pour le document
topic_distribution = lda_model[vector]

# Triez les topics par ordre décroissant de probabilité
sorted_topics = sorted(topic_distribution, key=lambda x: x[1], reverse=True)

# Affichez les mots clés des topics les plus pertinents
num_keywords = 10  # Nombre de mots clés à afficher par topic

for topic in sorted_topics:
    topic_id = topic[0]
    topic_keywords = lda_model.show_topic(topic_id, num_keywords)
    topic_keywords = [keyword[0] for keyword in topic_keywords]
    
    print(f"Topic {topic_id + 1}:")
    print(", ".join(topic_keywords))
    print()



Topic 8:
hole, brand, first, screwed, health, cream, sour, good, style, onion

Topic 4:
favorite, low, never, cut, feel, hefty, crinkle, many, salt, oil

Topic 1:
case, ended, money, waste, garbage, cream, cheddar, potato, sour, dont



In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Charger les données depuis un fichier CSV
data = pd.read_csv('data.csv',nrows=20000)

# Diviser les données en variables indépendantes (X) et dépendante (y)
X = data['Text']
y = data['Score']

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Créer une représentation vectorielle des textes en utilisant TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Entraîner un modèle de classification (par exemple, SVM linéaire)
model = LinearSVC()
model.fit(X_train, y_train)

# Faire des prédictions sur les données de test
predictions = model.predict(X_test)

# Évaluer l'exactitude du modèle
accuracy = accuracy_score(y_test, predictions)
print("Exactitude : ", accuracy)


Exactitude :  0.69675


In [23]:
from transformers import pipeline
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")

   
# Supposons que vous ayez un DataFrame appelé 'data' avec une colonne 'texte'
texte_complet = " ".join(data['Text'][:10].tolist())
print(texte_complet)


print(summarizer(texte_complet, min_length=50, max_length=200)[0]['summary_text'])


RuntimeError: Failed to import transformers.models.bart.modeling_tf_bart because of the following error (look up to see its traceback):
No module named 'keras.saving.hdf5_format'

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(res)

{'score': 0.2117144614458084, 'start': 59, 'end': 84, 'answer': 'gives freedom to the user'}


In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [None]:
num_beams = 10
num_return_sequences = 5
context = "gives freedom to the user"
get_response(context,num_return_sequences,num_beams)

['The user has freedom.',
 'The user has the freedom to use.',
 'The user is given freedom.',
 'It gives the user freedom.',
 'The user has the freedom to use it.']

In [24]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
texte_complet = " ".join(data['Text'][:10].tolist())

summary_words = [
    "Reliable",
    "Efficient",
    "High-quality",
    "User-friendly",
    "Innovative",
    "Convenient",
    "Fast",
    "Secure",
    "Disappointing",
]

result = classifier(texte_complet, summary_words)
sorted_labels = sorted(result['labels'], key=lambda x: result['scores'][result['labels'].index(x)], reverse=True)
print(sorted_labels[:5])


['High-quality', 'User-friendly', 'Reliable', 'Convenient', 'Secure']


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

print(res['answer'])

gives freedom to the user


In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

num_beams = 10
num_return_sequences = 1
context = res['answer']
get_response(context,num_return_sequences,num_beams)

['The user has freedom.']

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

In [8]:
question = "What is the best way to get to the airport?"

inputs = tokenizer.encode(question, return_tensors="pt")
outputs = model.generate(inputs)
response = tokenizer.decode(outputs[0])
print(response)



<pad> The positive point is that the sand is very soft and the sand is very
