In [1]:
import numpy as np
import pandas as pd
from string import ascii_lowercase as letters
from collections import Counter
import os
import math

In [2]:
def parseFile(filePath):
    with open(filePath) as f:
        language = filePath.split('/')[0]
        text = f.read().lower()
        letters_counter = dict((l, text.count(l)) for l in letters)
        #print (letters_counter)
        all_letters = sum(value for value in letters_counter.values())
        letters_frequency = np.array([value / all_letters for value in letters_counter.values()])
        letters_frequency = np.append(letters_frequency, language)
        #print(letters_frequency)
        return letters_frequency

def parseFilesInFolder(folderName, network):
    path = folderName + '/'
    for filename in os.listdir(path):
        filePath = path + filename
        if not filename.startswith('.'):
            print(filePath)
            network.fit(filePath)
        
def trainModel(folders, network):
    for folder in folders:
        parseFilesInFolder(folder, network)

In [3]:
class Perceptron:
    weights_learning_rate = 0.3
    threshold_learning_rate = 0.3
    threshold = 1
    
    def __init__(self, dimensions, language):
        self.weights = np.ones(dimensions).astype(float)
        self.language = language
    
    def fit(self, observation):
        net_value = self.predict(observation[0:-1])
        #print(self.language, 'net: ', net_value)
        prediction = int(net_value >= self.threshold)
        correct_answer = int(observation[-1] == self.language)
        if prediction != correct_answer: # update parameters
            self.weights = self.weights + (correct_answer - int(prediction)) \
            * self.weights_learning_rate * observation[0:-1].astype(float)
            
            length = np.dot(self.weights, self.weights)
            length = math.sqrt(length)
            self.weights /= length
            
            self.threshold = self.threshold - (correct_answer - int(prediction)) \
            * self.threshold_learning_rate
    
    def predict(self, observation):
        net_value = np.dot(self.weights, observation.astype(float))
        print(self.language, 'net: ', net_value)
        return net_value
    
class SingleLayerNetwork:
    def __init__(self, languages):
        self.perceptrons = [Perceptron(26, language) for language in languages]
        self.languages = languages
        
    def fit(self, filePath):
        observation = parseFile(filePath)
        for perceptron in self.perceptrons:
            perceptron.fit(observation)
    
    def predict(self, filePath):
        observation = parseFile(filePath)
        predictions = [perceptron.predict(observation[0:-1]) for perceptron in self.perceptrons]
        max_index = predictions.index(max(predictions))
        print(self.languages[max_index])

In [4]:
folders = ['niemiecki', 'angielski', 'hiszpanski', 'esperanto']
network = SingleLayerNetwork(folders)

In [22]:
for i in range(20):
    trainModel(folders, network)

niemiecki/warszawa.txt
niemiecki net:  0.1595581844399213
angielski net:  0.09198788058483366
hiszpanski net:  0.13431753118274004
esperanto net:  0.034209872634389946
niemiecki/lublin.txt
niemiecki net:  0.15909165843301726
angielski net:  0.09977836628910293
hiszpanski net:  0.10714975104856644
esperanto net:  0.033905765271109445
niemiecki/madeira.txt
niemiecki net:  0.16543473474368947
angielski net:  0.08790435577265379
hiszpanski net:  0.12198206313207953
esperanto net:  0.029861256553799266
niemiecki/madagaskar.txt
niemiecki net:  0.15734774356062087
angielski net:  0.0916489738993487
hiszpanski net:  0.11728085529069276
esperanto net:  0.036601568026971884
angielski/madryt.txt
niemiecki net:  0.07534764555276523
angielski net:  0.1322869868319768
hiszpanski net:  0.10289868089926303
esperanto net:  0.07710053403724318
angielski/venus.txt
niemiecki net:  0.09919175977802251
angielski net:  0.11569227992698736
hiszpanski net:  0.11216425955396367
esperanto net:  0.064833242065114

niemiecki net:  0.057876880974704926
angielski net:  0.09931784808388514
hiszpanski net:  0.08982014485866319
esperanto net:  0.1512480324300996
esperanto/bangladesz.txt
niemiecki net:  0.07203381833104201
angielski net:  0.09264868085864095
hiszpanski net:  0.10022076474556783
esperanto net:  0.13595592946392998
esperanto/belgio.txt
niemiecki net:  0.07569372013851085
angielski net:  0.08530360196350657
hiszpanski net:  0.08169976193936394
esperanto net:  0.14002602652723323
niemiecki/warszawa.txt
niemiecki net:  0.1595581844399213
angielski net:  0.09198788058483366
hiszpanski net:  0.10560421357976488
esperanto net:  0.034209872634389946
niemiecki/lublin.txt
niemiecki net:  0.15909165843301726
angielski net:  0.09977836628910293
hiszpanski net:  0.0944282069096368
esperanto net:  0.033905765271109445
niemiecki/madeira.txt
niemiecki net:  0.16543473474368947
angielski net:  0.08790435577265379
hiszpanski net:  0.10952620506055354
esperanto net:  0.029861256553799266
niemiecki/madagas

niemiecki net:  0.0910219036719871
angielski net:  0.12746266143107396
hiszpanski net:  0.08968496231051215
esperanto net:  0.06670472240878991
hiszpanski/napoleon.txt
niemiecki net:  0.09442533800649701
angielski net:  0.06713068420220024
hiszpanski net:  0.14438572013421358
esperanto net:  0.09342654564700059
hiszpanski/londres.txt
niemiecki net:  0.09702123871907183
angielski net:  0.08227049439360541
hiszpanski net:  0.1437250857115095
esperanto net:  0.08506042632145724
hiszpanski/madrid.txt
niemiecki net:  0.09884823599144747
angielski net:  0.07257690708387526
hiszpanski net:  0.15047415850928614
esperanto net:  0.09019380533485924
hiszpanski/malta.txt
niemiecki net:  0.08573131162148714
angielski net:  0.0772170772484615
hiszpanski net:  0.14999773163665203
esperanto net:  0.09854495904100397
hiszpanski/varsovia.txt
niemiecki net:  0.09848325586177986
angielski net:  0.0671166410384138
hiszpanski net:  0.151735494883749
esperanto net:  0.09615495441034821
esperanto/cezaro.txt
n

hiszpanski net:  0.09026581218793002
esperanto net:  0.1512480324300996
esperanto/bangladesz.txt
niemiecki net:  0.07203381833104201
angielski net:  0.09264868085864095
hiszpanski net:  0.09793553266894219
esperanto net:  0.13595592946392998
esperanto/belgio.txt
niemiecki net:  0.07569372013851085
angielski net:  0.08530360196350657
hiszpanski net:  0.0981921444037231
esperanto net:  0.14002602652723323
niemiecki/warszawa.txt
niemiecki net:  0.1595581844399213
angielski net:  0.09198788058483366
hiszpanski net:  0.0962569199194262
esperanto net:  0.034209872634389946
niemiecki/lublin.txt
niemiecki net:  0.15909165843301726
angielski net:  0.09977836628910293
hiszpanski net:  0.08379314133420598
esperanto net:  0.033905765271109445
niemiecki/madeira.txt
niemiecki net:  0.16543473474368947
angielski net:  0.08790435577265379
hiszpanski net:  0.09911070608793979
esperanto net:  0.029861256553799266
niemiecki/madagaskar.txt
niemiecki net:  0.15734774356062087
angielski net:  0.091648973899

niemiecki net:  0.06896079908233319
angielski net:  0.09658162713158983
hiszpanski net:  0.09134062031999167
esperanto net:  0.1404155189196873
esperanto/romo.txt
niemiecki net:  0.057876880974704926
angielski net:  0.09931784808388514
hiszpanski net:  0.0894412021050607
esperanto net:  0.1512480324300996
esperanto/bangladesz.txt
niemiecki net:  0.07203381833104201
angielski net:  0.09264868085864095
hiszpanski net:  0.09725095729533773
esperanto net:  0.13595592946392998
esperanto/belgio.txt
niemiecki net:  0.07569372013851085
angielski net:  0.08530360196350657
hiszpanski net:  0.09780290055813506
esperanto net:  0.14002602652723323
niemiecki/warszawa.txt
niemiecki net:  0.1595581844399213
angielski net:  0.09198788058483366
hiszpanski net:  0.0904881620336404
esperanto net:  0.034209872634389946
niemiecki/lublin.txt
niemiecki net:  0.15909165843301726
angielski net:  0.09977836628910293
hiszpanski net:  0.0775117586222026
esperanto net:  0.033905765271109445
niemiecki/madeira.txt
ni

In [26]:
network.predict('angielski/madryt.txt')

niemiecki net:  0.07534764555276523
angielski net:  0.1322869868319768
hiszpanski net:  0.09562169170987245
esperanto net:  0.07710053403724318
angielski
