In [1]:
import math
import pickle

from nltk import word_tokenize
from nltk.util import ngrams



In [4]:
# Load the pickled count dictionaries for every language.
# unigram_dicts[lang] and bigram_dicts[lang] hold the tables read from
# 'unigram_<lang>_dict.pkl' and 'bigram_<lang>_dict.pkl' respectively.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
languages = ['English', 'French', 'Italian']
unigram_dicts, bigram_dicts = {}, {}
V = 0    # vocabulary size: sum of the unigram dictionary sizes over all languages

for lang in languages:
    unigram_path = f'unigram_{lang}_dict.pkl'
    bigram_path = f'bigram_{lang}_dict.pkl'

    with open(unigram_path, 'rb') as file:
        unigram_dicts[lang] = pickle.load(file)
        V += len(unigram_dicts[lang])

    with open(bigram_path, 'rb') as file:
        bigram_dicts[lang] = pickle.load(file)

In [8]:
def calculate_language_probability(sentence, unigram_dicts, bigram_dicts, V):
    """Return the language whose bigram model scores `sentence` highest.

    The sentence is tokenized with ``word_tokenize`` and split into adjacent
    token pairs. For every language, each pair ``(w1, w2)`` is scored with
    add-one (Laplace) smoothing, ``(count(w1, w2) + 1) / (count(w1) + V)``,
    and the per-pair scores are combined in log space.

    Args:
        sentence: Text to classify.
        unigram_dicts: Mapping of language -> {token: count}.
        bigram_dicts: Mapping of language -> {(token, token): count}.
        V: Vocabulary size used in the smoothing denominator.

    Returns:
        The key of ``unigram_dicts`` with the highest score. When all
        languages tie (for example, a sentence with fewer than two tokens
        yields no bigrams), the first key in iteration order is returned.
    """
    tokens = word_tokenize(sentence)
    bigrams = list(ngrams(tokens, 2))
    log_probabilities = {}
    for lang, unigram_counts in unigram_dicts.items():
        bigram_counts = bigram_dicts[lang]
        # Sum log-probabilities instead of multiplying raw probabilities:
        # a product of many tiny factors underflows to 0.0 on long sentences,
        # which makes every language tie. log is monotonic, so the arg-max
        # is the same whenever the product would have been representable.
        log_probabilities[lang] = sum(
            math.log(
                (bigram_counts.get(bigram, 0) + 1)
                / (unigram_counts.get(bigram[0], 0) + V)
            )
            for bigram in bigrams
        )

    return max(log_probabilities, key=log_probabilities.get)

In [18]:
test_filename = 'data/data/LangId.test.txt'

# Reading test file. The encoding is passed explicitly so accented characters
# are decoded the same way on every platform instead of depending on the
# locale default (assumes the test file is UTF-8, like the other text files
# this notebook reads and writes).
with open(test_filename, 'r', encoding='utf-8') as file:
    test_data = file.readlines()

# Opening the output file and writing one "<line number> <language>" entry per
# test sentence; line numbers are 1-based.
with open('LangId.predict.txt', 'w', encoding='utf-8') as file:
    for idx, sentence in enumerate(test_data):
        predicted_lang = calculate_language_probability(sentence.strip(), unigram_dicts, bigram_dicts, V)
        file.write(f'{idx+1} {predicted_lang} \n')

In [19]:
# File paths
# File paths: predictions written by this notebook and the reference solution
file1_path = 'LangId.predict.txt'
file2_path = 'data/data/LangId.sol.txt'

incorrect_lines = []    # 1-based numbers of the lines whose prediction differs from the solution
correct_count = 0
total_lines = 0

# Computing the accuracy by comparing the two files line by line.
# zip stops at the shorter file, so only lines present in both are compared.
with open(file1_path, 'r', encoding='utf-8') as file1, open(file2_path, 'r', encoding='utf-8') as file2:
    for line_num, (line1, line2) in enumerate(zip(file1, file2), start=1):
        # Compare stripped lines so trailing spaces/newlines do not count as mismatches
        if line1.strip() != line2.strip():
            incorrect_lines.append(line_num)
        else:
            correct_count += 1
        total_lines += 1

# Guard against empty input: with nothing to compare there is no meaningful
# ratio, so report 0.0 instead of raising ZeroDivisionError.
accuracy = (correct_count / total_lines) * 100 if total_lines else 0.0

In [20]:
# Report the accuracy (in percent) and the 1-based numbers of the misclassified lines.
print(f"Accuracy: {accuracy}")
print(f"Incorrectly classified lines: {incorrect_lines}")

Accuracy: 97.66666666666667
Incorrectly classified lines: [24, 44, 187, 191, 247, 277, 279]
