In [11]:
!pip install googletrans==4.0.0-rc1


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Obtaining dependency information for httpx==0.13.3 from https://files.pythonhosted.org/packages/54/b4/698b284c6aed4d7c2b4fe3ba5df1fcf6093612423797e76fbb24890dd22f/httpx-0.13.3-py3-none-any.whl.metadata
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Obtaining dependency information for hstspreload from https://files.pythonhosted.org/packages/b6/26/eaff7948f46de318ac3b86fc68d72106c41bcfdb46b77d55712c22565808/hstspreload-2024.6.1-py3-none-any.whl.metadata
  Downloading hstspreload-2024.6.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Obtaining dependency information for chardet==3.* from https://files.python

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import f
from nltk.corpus import names
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string

from googletrans import Translator


In [2]:
# Téléchargement des ressources nécessaires de NLTK
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('names')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_da

True

In [3]:

# Fonction de prétraitement pour nettoyer le texte
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # POS Tagging
    pos_tags = pos_tag(tokens)
    # NER Tagging
    ner_tags = ne_chunk(pos_tags)
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)) for token, pos_tag in pos_tags]
    # Remove stop words and punctuation
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token.lower() for token in lemmatized_tokens if token.lower() not in stop_words and token.lower() not in string.punctuation]
    return filtered_tokens, ner_tags

In [4]:

# Fonction pour obtenir la partie de la parole de WordNet
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [5]:

# Fonction pour la modélisation des sujets
def topic_modeling(texts, num_topics=3):
    count_vectorizer = CountVectorizer(stop_words='english')
    X = count_vectorizer.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)
    return lda, count_vectorizer

In [6]:
def translate_text(text, target_language):
    translator = Translator()
    translated_text = translator.translate(text, dest=target_language)
    return translated_text.text

In [16]:
# Interface utilisateur
user_input = input("Enter a sentence, paragraph, or upload a file: ")

# Prétraitement des données
tokens, ner_tags = preprocess_text(user_input)

# Affichage des résultats
print("POS Tags:")
print(pos_tag(tokens))
print("\nNER Tags:")
print(ner_tags)

# Modélisation des sujets
lda_model, count_vectorizer = topic_modeling([user_input])
print("\nTop Topics:")
for idx, topic in enumerate(lda_model.components_):
    print(f"Topic {idx + 1}:")
    feature_names = count_vectorizer.get_feature_names_out()
    top_words_indices = topic.argsort()[-10:][::-1]
    top_words = [feature_names[i] for i in top_words_indices]
    print(top_words)

Enter a sentence, paragraph, or upload a file: hello sidali, how are you
POS Tags:
[('hello', 'NN'), ('sidali', 'NN')]

NER Tags:
(S hello/NN sidali/NN ,/, how/WRB are/VBP you/PRP)

Top Topics:
Topic 1:
['sidali', 'hello']
Topic 2:
['sidali', 'hello']
Topic 3:
['hello', 'sidali']


In [18]:

target_language = input("Enter the target language (e.g., 'fr' for French): ")

# Traduction du texte
translated_text = translate_text(user_input, target_language)
print("\nTranslated Text:")
print(translated_text)

Enter the target language (e.g., 'fr' for French): ar

Translated Text:
مرحبا سيدي ، كيف حالك
