Proszę napisać program, który będzie odgadywać język, w jakim napisany jest tekst.

Przydatny będzie program z zadania 3 (ngramy) oraz miary odległości (metryki):

- kosinusowa (cosinusowa),
- euklidesowa,
- taksówkowa (Manhattan),
- maksimum.

In [1]:
#import tarfile
#with tarfile.open("teksty.tar", "r") as tar:
#    tar.extractall(path = "teksty")

In [2]:
import os
import string
import re
from collections import defaultdict
import math
import csv

In [3]:
def read_text_from_file(file_path):
    """Read a UTF-8 text file and reduce it to a lowercase stream of letters.

    Every character that is not in the letter whitelist below (ASCII letters
    plus a set of accented letters) is removed. That includes whitespace,
    line breaks, backslashes, digits and punctuation, so the result has no
    word separators at all.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The filtered text, lowercased, as one continuous string.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # The whitelist previously contained a plain "N" where "Ń" was intended,
    # so an uppercase "Ń" was deleted instead of being lowercased to "ń".
    # Explicit newline/backslash replacement and a final strip() are not
    # needed: the substitution already removes every non-letter character.
    text = re.sub(r"[^a-zA-ZÅåÄäÖöÀàáÈèéíÓóúñÑüÜẞßĄąĆćĘęŁłŃńŚśŻżŹźÙù]", "", text)
    return text.lower()

In [4]:
def concatenate(folder_path):
    """Join the cleaned text of every ``.txt`` file in a folder.

    Files are processed in sorted name order: ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the concatenation (and the
    n-grams spanning file boundaries) differ between runs or platforms.

    Args:
        folder_path: Directory containing the ``.txt`` files.

    Returns:
        The texts returned by ``read_text_from_file``, each followed by a
        single space. The result is an empty string when the folder holds no
        ``.txt`` files.
    """
    filenames = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.txt')
    )
    # join() builds the result in one pass instead of repeated string +=.
    return "".join(
        read_text_from_file(os.path.join(folder_path, name)) + " "
        for name in filenames
    )

In [5]:
# Maps each language name to the folder that is scanned for its .txt files.
folders = dict([
    ("english", "eng"),
    ("finnish", "fin"),
    ("german", "ger"),
    ("italian", "ita"),
    ("polish", "pol"),
    ("spanish", "spa"),
])

In [6]:
# Sanity check: show the first 100 characters of every language's corpus.
for language in folders:
    preview = concatenate(folders[language])[:100]
    print(f"{language.capitalize()} concatenate: {preview}")

English concatenate: harrypotterandthesorcerersstonechapteronetheboywholivedmrandmrsdursleyofnumberfourprivetdrivewerepro
Finnish concatenate: oalussaloijumalataivaanjamaanojamaaoliautiojatyhjäjapimeysolisyvyydenpäälläjajumalanhenkiliikkuivett
German concatenate: theodormommsenroemischegeschichtezweitesbuchvonderabschaffungdesroemischenkoenigtumsbiszureinigungit
Italian concatenate: wumingagilbertocentinoncnessundopoguerraglistoltichiamavanopaceilsempliceallontanarsidelfronteglisto
Polish concatenate: polskarzeczpospolitapolskapaństwopolozoneweuropiesrodkowejmiedzybaltykiemnapolnocyakarpatamiisudetam
Spanish concatenate: lanovelaconstadedosparteslaprimeraelingeniosohidalgodonquijotedelamanchafuepublicadaenlasegundasegun


In [7]:
def generate_ngrams(text, n):
    """Return all overlapping character n-grams of ``text``, left to right.

    Args:
        text: String to slice.
        n: Length of each n-gram.

    Returns:
        A list with one slice ``text[i:i + n]`` for every start index ``i``
        in ``range(len(text) - n + 1)``; empty when ``text`` is shorter
        than ``n``.
    """
    window_count = len(text) - n + 1
    grams = []
    for start in range(window_count):
        stop = start + n
        grams.append(text[start:stop])
    return grams

In [8]:
def normalize_profile(profile):
    """Scale the values of a profile so that they sum to 1.

    Args:
        profile: Mapping of n-gram to a numeric value (e.g. a count).

    Returns:
        A new plain ``dict`` with every value divided by the sum of all
        values. When that sum is 0 the input object itself is returned
        unchanged.
    """
    count_sum = sum(profile.values())
    if count_sum != 0:
        frequencies = {}
        for ngram, count in profile.items():
            frequencies[ngram] = count / count_sum
        return frequencies
    # Nothing to scale (empty or all-zero profile): hand back the input as is.
    return profile

In [9]:
def _empty_order_table():
    """Create the table for one language: n-gram order -> (n-gram -> int, default 0)."""
    return defaultdict(lambda: defaultdict(int))


# language -> n-gram order -> n-gram -> value; each level is created on first access.
language_ngrams = defaultdict(_empty_order_table)

In [10]:
# Build the reference n-gram profile of every language for orders 1 to 3.
for language, folder_path in folders.items():
    concatenated_text = concatenate(folder_path)

    for n in range(1, 4):
        # Count into a fresh table and then store it. Incrementing the stored
        # table in place would, when this cell is run a second time, add raw
        # counts on top of the already-normalized frequencies (or raise
        # KeyError, because normalization replaces the defaultdict by a dict).
        counts = defaultdict(int)
        for ngram in generate_ngrams(concatenated_text, n):
            counts[ngram] += 1
        language_ngrams[language][n] = counts

    # Show the first ten raw counts per order as a quick sanity check.
    print(f"{language.capitalize()}:")
    for n in range(1, 4):
        print(f"n = {n}: {list(language_ngrams[language][n].items())[:10]}")
    print("\n")

# Turn raw counts into relative frequencies so that corpora of different
# sizes become comparable. Only values are replaced, never keys, so
# assigning while iterating over the keys is safe.
for language in language_ngrams:
    for n in language_ngrams[language]:
        language_ngrams[language][n] = normalize_profile(language_ngrams[language][n])

English:
n = 1: [('h', 133727), ('a', 163084), ('r', 132895), ('y', 50986), ('p', 34460), ('o', 156195), ('t', 170508), ('e', 238949), ('n', 131455), ('d', 100850)]
n = 2: [('ha', 30038), ('ar', 24922), ('rr', 10863), ('ry', 13267), ('yp', 1363), ('po', 4261), ('ot', 10630), ('tt', 11960), ('te', 13878), ('er', 33580)]
n = 3: [('har', 9224), ('arr', 8767), ('rry', 8620), ('ryp', 507), ('ypo', 436), ('pot', 977), ('ott', 1809), ('tte', 2431), ('ter', 4876), ('era', 1912)]


Finnish:
n = 1: [('o', 175118), ('a', 422980), ('l', 187322), ('u', 156619), ('s', 238634), ('i', 348872), ('j', 88804), ('m', 106915), ('t', 282189), ('v', 72103)]
n = 2: [('oa', 3373), ('al', 29943), ('lu', 6766), ('us', 10563), ('ss', 15849), ('sa', 41107), ('lo', 5499), ('oi', 25232), ('ij', 9433), ('ju', 8056)]
n = 3: [('oal', 81), ('alu', 931), ('lus', 486), ('uss', 927), ('ssa', 9859), ('sal', 2576), ('alo', 1467), ('loi', 2253), ('oij', 271), ('iju', 752)]


German:
n = 1: [('t', 237693), ('h', 192673), ('e',

In [11]:
def distance(measure, test_dict, dict):
    """Compute the distance between two n-gram profiles.

    Both profiles are treated as sparse vectors: an n-gram missing from a
    profile counts as 0.

    Args:
        measure: One of ``"euclidean"``, ``"manhattan"``, ``"cosine"`` or
            ``"maximum"``.
        test_dict: Profile (n-gram -> value) of the text being classified.
        dict: Reference profile (n-gram -> value). The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        The distance as a number. For ``"cosine"`` this is
        ``1 - cosine similarity``, never below 0; if either profile has zero
        magnitude the result is 0. For ``"maximum"`` the result is 0 when
        both profiles are empty.

    Raises:
        ValueError: If ``measure`` is not one of the supported names.
    """
    reference = dict
    all_keys = set(test_dict.keys()).union(reference.keys())  # all n-grams seen in either profile

    if measure == "euclidean":
        squared_sum = sum(
            (test_dict.get(ngram, 0) - reference.get(ngram, 0)) ** 2
            for ngram in all_keys
        )
        return math.sqrt(squared_sum)

    if measure == "manhattan":
        return sum(
            abs(test_dict.get(ngram, 0) - reference.get(ngram, 0))
            for ngram in all_keys
        )

    if measure == "cosine":
        dot_product = sum(
            test_dict.get(ngram, 0) * reference.get(ngram, 0)
            for ngram in all_keys
        )
        magnitude_test = math.sqrt(sum(freq ** 2 for freq in test_dict.values()))
        magnitude_ref = math.sqrt(sum(freq ** 2 for freq in reference.values()))

        # NOTE(review): a zero vector yields distance 0 ("identical") here,
        # as in the original code; confirm that this is the intended result.
        if magnitude_test == 0 or magnitude_ref == 0:
            return 0

        cosine_similarity = dot_product / (magnitude_test * magnitude_ref)
        # Rounding error can push the similarity marginally above 1; clamp so
        # the distance never becomes a tiny negative number.
        return max(0.0, 1 - cosine_similarity)

    if measure == "maximum":
        # default=0 avoids "max() arg is an empty sequence" when both
        # profiles are empty.
        return max(
            (
                abs(test_dict.get(ngram, 0) - reference.get(ngram, 0))
                for ngram in all_keys
            ),
            default=0,
        )

    raise ValueError(
        f"Unknown distance measure: {measure!r} "
        "(expected 'euclidean', 'manhattan', 'cosine' or 'maximum')"
    )

In [12]:
def scale_distances(distances):
    """Min-max scale a mapping of distances to the range [0, 1].

    Args:
        distances: Mapping of language -> distance.

    Returns:
        A new dict with the same keys; each value is
        ``(value - min) / (max - min)`` rounded to two decimals. When all
        distances are equal every value is 0.0 (there is no spread to scale,
        and dividing by it would raise ``ZeroDivisionError``). An empty
        input yields an empty dict.
    """
    if not distances:
        return {}

    min_distance = min(distances.values())
    spread = max(distances.values()) - min_distance

    if spread == 0:
        # All candidates are equally far away: report them all at the minimum.
        return {language: 0.0 for language in distances}

    return {
        language: round((value - min_distance) / spread, 2)
        for language, value in distances.items()
    }

In [13]:
def predict_language(test_text, language_ngrams, n, metric):
    """Guess the language of a text by comparing n-gram profiles.

    Args:
        test_text: Text to classify.
        language_ngrams: Mapping language -> n-gram order -> reference
            profile (n-gram -> value).
        n: n-gram order to compare.
        metric: Name of the distance measure, passed to ``distance``.

    Returns:
        A tuple ``(predicted_language, scaled_distances)``. The first item is
        the language with the smallest distance to the text's profile; the
        second is the result of ``scale_distances`` applied to the distances
        of all languages.
    """
    counts = defaultdict(int)
    for ngram in generate_ngrams(test_text, n):
        counts[ngram] += 1
    test_profile = normalize_profile(counts)

    # Keep full precision here. Profiles hold relative frequencies, so raw
    # distances are often well below 1; rounding them to two decimals before
    # scaling (as was done previously) collapses distinct languages onto the
    # same value and can make the scaling range zero.
    distances = {
        language: distance(metric, test_profile, profiles[n])
        for language, profiles in language_ngrams.items()
    }

    # Choose the winner from the unrounded distances so that a tie created
    # only by rounding cannot decide the prediction.
    predicted_language = min(distances, key=distances.get)
    scaled_distances = scale_distances(distances)
    return predicted_language, scaled_distances

In [14]:
#selected_language = "finnish"  
#selected_folder = folders[selected_language]
#selected_file = "fin1.txt" 

#test_file_path = os.path.join(selected_folder, selected_file)

#test_text = read_text_from_file(test_file_path)

In [15]:
#n = 3
#metric = "manhattan"
#predicted_language, scaled_distances = predict_language(test_text, language_ngrams, n, metric)

In [16]:
#print(f"Predicted language: {predicted_language}")
#print("Dystans:", scaled_distances)

In [17]:
#n = 3
#metric = "euclidean"
#predicted_language, scaled_distances = predict_language(test_text, language_ngrams, n, metric)

In [18]:
#print(f"Predicted language: {predicted_language}")
#print("Dystans:", scaled_distances)

In [19]:
#n = 3
#metric = "maximum"
#predicted_language, scaled_distances = predict_language(test_text, language_ngrams, n, metric)

In [20]:
#print(f"Predicted language: {predicted_language}")
#print("Dystans:", scaled_distances)

In [21]:
#n = 3
#metric = "cosine"
#predicted_language, scaled_distances = predict_language(test_text, language_ngrams, n, metric)

In [22]:
#print(f"Predicted language: {predicted_language}")
#print("Dystans:", scaled_distances)

In [23]:
def _distance_rows(file_path, original_language, language_ngrams, metrics, n):
    """Build the CSV rows (one per metric and compared language) for one file."""
    test_text = read_text_from_file(file_path)
    rows = []
    for metric in metrics:
        _, distances = predict_language(test_text, language_ngrams, n, metric)
        for compare_language, distance_value in distances.items():
            rows.append({
                "File": os.path.basename(file_path),
                "Original Language": original_language,
                "Metric": metric,
                "Compared Language": compare_language,
                "Distance": f"{distance_value:.2f}",  # two decimal places
            })
    return rows


def process_all_files(folders, language_ngrams, metrics, n, file_path="test.txt",
                      output_path="language_distances.csv"):
    """Compute scaled distances to every language and write them to a CSV file.

    Args:
        folders: Mapping language -> folder with that language's ``.txt``
            files. Only used when ``file_path`` is falsy.
        language_ngrams: Reference profiles, passed to ``predict_language``.
        metrics: Iterable of distance-measure names to evaluate.
        n: n-gram order to compare.
        file_path: File whose language should be guessed. When falsy (e.g.
            ``None`` or ``""``), every ``.txt`` file in every folder of
            ``folders`` is processed instead, labelled with its folder's
            language.
        output_path: Destination CSV file; defaults to the previously
            hard-coded ``language_distances.csv``.

    Returns:
        None. The CSV has the columns File, Original Language, Metric,
        Compared Language and Distance.
    """
    results = []

    if file_path:
        # Single unknown file: its true language is not known.
        results.extend(_distance_rows(file_path, "?", language_ngrams, metrics, n))
    else:
        for language, folder_path in folders.items():
            # Sorted so the row order does not depend on the arbitrary order
            # in which os.listdir returns entries.
            for filename in sorted(os.listdir(folder_path)):
                if filename.endswith(".txt"):
                    results.extend(_distance_rows(
                        os.path.join(folder_path, filename),
                        language, language_ngrams, metrics, n,
                    ))

    with open(output_path, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["File", "Original Language", "Metric", "Compared Language", "Distance"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

# Distance measures to evaluate; each name is understood by distance().
metrics = [
    "euclidean",
    "manhattan",
    "cosine",
    "maximum",
]

In [24]:
# Compare bigram profiles (n = 2); file_path is left at the function's default.
n = 2
process_all_files(
    folders=folders,
    language_ngrams=language_ngrams,
    metrics=metrics,
    n=n,
)

FileNotFoundError: [Errno 2] No such file or directory: 'kot.txt'