In [None]:
# 3.Se da un fisier care contine un text (format din mai multe propozitii) in limba romana - a se vedea fisierul ”data/texts.txt”. Se cere sa se determine si sa se vizualizeze:
#
# numarul de propozitii din text;
# numarul de cuvinte din text
# numarul de cuvinte diferite din text
# cel mai scurt si cel mai lung cuvant (cuvinte)
# textul fara diacritice
# sinonimele celui mai lung cuvant din text

In [9]:
import unidecode
import re
import requests
from bs4 import BeautifulSoup

def get_synonyms(word):
    url = f"https://dexonline.ro/intrare/{word}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Error: Could not access DEX Online for '{word}'.")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    synonyms = set()
    for span in soup.find_all("span", class_="badge-relation badge-relation-1"):
        link = span.find("a")
        if link:
            synonyms.add(link.text.strip())

    return list(synonyms)

def normalize_word(word):
    cleaned_word = re.sub(r'[^\w\s]', '', word)
    normalized_word = re.sub(r'(.)\1+', r'\1', cleaned_word)
    return normalized_word

def analyze_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    sentences = text.split('.')
    num_sentences = len([s for s in sentences if s.strip()])

    words = text.split()
    num_words = len(words)

    unique_words = set(words)
    num_unique_words = len(unique_words)

    sorted_words = sorted(words, key=len)
    shortest_word = sorted_words[0]
    longest_word = sorted_words[-1]

    text_without_diacritics = unidecode.unidecode(text)

    # Normalize the longest word to remove repeated characters
    normalized_longest_word = normalize_word(longest_word)

    synonyms = get_synonyms(normalized_longest_word)

    print("Number of sentences:", num_sentences)
    print("Number of words:", num_words)
    print("Number of unique words:", num_unique_words)
    print("Shortest word:", shortest_word)
    print("Longest word:", longest_word)
    print("Text without diacritics:", text_without_diacritics)
    print("Synonyms of the longest word:", synonyms)

analyze_text('data/texts.txt')

Number of sentences: 9
Number of words: 161
Number of unique words: 105
Shortest word: o
Longest word: ”Confiiiiiiiiiiiiiiiiiiiiiiiiiiiirm
Text without diacritics: Mesaj de informare: 
Cursul si laboratoarele de Inteligenta Artificiala vor fi o 
provocare pentru toti. Suntem convinsi ca veti realiza proiecte 
foarte interesante. Va incurajam sa adresati intrebari atunci 
cand ceva nu e clar, atat in mod live, cat si folosind platforma 
Teams, canalul "general". 
Daca ati citit pana aici, va rugam sa lasati un mesaj pe canalul 
general cu textul "Confiiiiiiiiiiiiiiiiiiiiiiiiiiiirm ca am citit 
textul pentru problema 3 din lab2". 
--
Mesaj de informare generat de ChatGPT:
Stimati cursanti,
Suntem incantati sa va avem in echipa noastra pentru Cursul si 
laboratoarele de Inteligenta Artificiala. Aceasta experienta va 
fi o adevarata provocare, dar suntem convinsi ca veti realiza 
proiecte extrem de interesante.
Va incurajam sa fiti activi si sa adresati intrebari atunci cand 
ceva nu este 