In [84]:
import glob
from itertools import islice
from pathlib import Path

import numpy as np
import pandas as pd

In [85]:
# create a generator that reads in a sequence of characters
# and yields sliding tuples of (letter, next letter)
def window(seq, n=2):
    """Slide a window of width ``n`` over ``seq`` and yield each position.

    Parameters
    ----------
    seq : iterable
        Items to scan; it is consumed once, front to back.
    n : int, default 2
        Number of consecutive items per window.

    Yields
    ------
    tuple
        Overlapping ``n``-tuples in order of appearance, e.g. ``"abc"`` with
        ``n=2`` gives ``('a', 'b')`` then ``('b', 'c')``. Nothing is yielded
        when ``seq`` holds fewer than ``n`` items.
    """
    stream = iter(seq)
    # prime the window with the first n items (fewer if seq runs out early)
    frame = tuple(islice(stream, n))
    if n == len(frame):
        yield frame
    # every further item pushes the oldest one out of the window
    for newest in stream:
        frame = (*frame[1:], newest)
        yield frame

In [94]:
# create a function that takes in a list of characters and
# calculates the Bayesian unigram prior smoothing probabilities:
# (count of bigram + prob(second char)) / (count(first char) + 1)
def bayesian_unigram_prior_smoothing_prob(char_list, lang, prior_weight=1):
    """Compute log bigram probabilities with Bayesian unigram-prior smoothing.

    For every bigram ``(state1, state2)`` observed in ``char_list`` the
    smoothed probability is::

        (count(state1, state2) + prior_weight * P(state2))
        / (count(state1) + prior_weight)

    where ``P(state2)`` is the relative frequency of ``state2`` among all
    tokens and ``count(state1)`` is the number of occurrences of ``state1``.

    Parameters
    ----------
    char_list : iterable
        Ordered tokens of one text. It is materialised into a list once, so
        one-shot iterators are accepted as well as lists.
    lang : str
        Language label; the result column is named ``lang + '_log_p_smooth'``.
    prior_weight : float, default 1
        Strength of the unigram prior. The default of 1 gives
        ``(count + P(state2)) / (count(state1) + 1)``; 0 disables smoothing.

    Returns
    -------
    pandas.DataFrame
        Columns ``state1``, ``state2`` and ``<lang>_log_p_smooth`` (natural
        log of the smoothed probability), one row per distinct observed
        bigram. Bigrams that never occur in ``char_list`` are not included.

    Raises
    ------
    ValueError
        If ``prior_weight`` is negative.

    Notes
    -----
    Prints the first rows of the result as a progress preview.
    """
    if prior_weight < 0:
        raise ValueError("prior_weight must be non-negative")

    tokens = list(char_list)

    # unigram statistics; rename_axis names the key column explicitly so the
    # result does not depend on how value_counts labels its index
    chars = pd.DataFrame(tokens, columns=['character'])
    char_freqs = (chars['character'].value_counts()
                  .rename_axis('state1')
                  .reset_index(name='state1_count'))
    char_probs = (chars['character'].value_counts(normalize=True)
                  .rename_axis('state2')
                  .reset_index(name='state2_prob'))

    # adjacent token pairs (the same bigrams window(tokens, 2) would yield)
    pairs = pd.DataFrame(list(zip(tokens, tokens[1:])), columns=['state1', 'state2'])
    bigrams = pairs.groupby(['state1', 'state2']).size().reset_index(name='bigram_count')

    # pull in the second-token probability and the first-token count
    bigrams = pd.merge(bigrams, char_probs, on='state2')
    bigrams = pd.merge(bigrams, char_freqs, on='state1')

    # smoothed probability, stored as a log probability under a
    # language-specific column name
    smoothed = ((bigrams['bigram_count'] + prior_weight * bigrams['state2_prob'])
                / (bigrams['state1_count'] + prior_weight))
    col_name = lang + '_log_p_smooth'
    bigrams[col_name] = np.log(smoothed)

    # keep only required columns
    bigrams = bigrams[['state1', 'state2', col_name]].copy()
    print(bigrams.head())

    return bigrams

In [95]:
# For each language file train/<language>.txt, calculate the log Bayesian
# unigram prior smoothing probabilities and store them under the language name.
training_probs = {}
# sorted() makes the processing order (and thus the column order and
# tie-breaking when the dev files are scored) independent of the file system
for dq_file in sorted(glob.glob('train/*.txt')):
    # language label = file name without directory or extension
    language = Path(dq_file).stem
    # NOTE(review): no encoding is passed, so decoding uses the platform
    # default -- confirm the corpus encoding
    with open(dq_file, 'r') as dq_text:
        # the whole text as single characters (newlines included), wrapped in
        # start/end markers
        character_list = ["<<START>>"] + list(dq_text.read()) + ["<<END>>"]
    print("Number of characters in", language, "file:", len(character_list))
    prob_df = bayesian_unigram_prior_smoothing_prob(character_list, language)
    training_probs[language] = prob_df

Number of characters in dut file: 2498
  state1   state2  dut_log_p_smooth
0     \n       \n         -3.648464
1     \n                 -3.663161
2     \n  <<END>>         -3.663161
3     \n        N         -3.659566
4     \n        O         -1.583670
Number of characters in eng file: 2044
  state1   state2  eng_log_p_smooth
0     \n       \n         -3.538851
1     \n                 -3.554859
2     \n  <<END>>         -3.554859
3     \n        N         -3.553881
4     \n        P         -2.861712
Number of characters in esper file: 2044
  state1   state2  esper_log_p_smooth
0     \n       \n           -3.538851
1     \n                   -3.554859
2     \n  <<END>>           -3.554859
3     \n        A           -2.860979
4     \n        L           -3.554859
Number of characters in frn file: 2275
  state1   state2  frn_log_p_smooth
0     \n       \n         -2.507045
1     \n                 -3.610478
2     \n  <<END>>         -3.610478
3     \n        C         -1.664506
4 

In [125]:
# Now that we have the training log probabilities, we can attempt
# to determine the languages of the test documents
LOG_PROB_SUFFIX = "_log_p_smooth"  # suffix of each language's score column
print("Dev File Language : Predicted Language")
for dq_file in sorted(glob.glob('dev/*.txt')):
    # true language = file name without directory or extension
    language = Path(dq_file).stem
    with open(dq_file, 'r') as dq_text:
        # the whole text as single characters (newlines included), wrapped in
        # start/end markers
        character_list = ["<<START>>"] + list(dq_text.read()) + ["<<END>>"]
    test_bigrams = pd.DataFrame(window(character_list), columns=['state1', 'state2'])

    # attach one log-probability column per trained language
    # NOTE(review): these are inner merges, so a test bigram is scored only if
    # it was observed in EVERY training language; all other bigrams are
    # dropped from the comparison -- confirm this is intended
    scored = test_bigrams
    for probs in training_probs.values():
        scored = pd.merge(scored, probs, on=['state1', 'state2'])

    if scored.empty or scored.shape[1] <= 2:
        # nothing to compare (no shared bigrams or no trained models)
        predicted = "unknown"
    else:
        # the language whose summed log probability is highest wins
        best_column = scored.iloc[:, 2:].sum().idxmax()
        # strip the column suffix instead of splitting on "_" so language
        # names containing underscores survive
        predicted = best_column.rsplit(LOG_PROB_SUFFIX, 1)[0]
    print(language, ":", predicted)

Dev File Language : Predicted Language
dut : dut
eng : eng
esper : spn
frn : frn
ger : ger
spn : spn


In [None]:
# pretty good.. not perfect.