In [0]:
!pip install wikipedia
!pip install unidecode

In [0]:
import re
import wikipedia as wiki
from unidecode import unidecode

In [0]:
wordLength = 12
articles = {

                'en':['actor', 'alcohol', 'cheque', 'cancer', 'chocolate', 'debate', 'hobby', 'melon', 'propaganda',
                      'religion', 'violin', 'england', 'MediaWiki'],

                'cs': ['praha', 'evropa', 'pyreneje', 'voda', 'housle', 'Náboženství', 'Příroda', 'Ekosystém',
                    'vzdělání', 'Irsko', 'Dům', 'Zpěvák', 'Zeus', 'Mykény', 'Starověké_Řecko', 'Renesance',
                    'Andrej_Babiš', 'Správa_železniční_dopravní_cesty', 'Kraje_v_Česku', 'Česko', 'Slezsko',
                    'Latina', 'Spojené_království', 'Římský_senát'],

                'de': ['Deutsche_Sprache', 'Deutschland', 'Kommunistische_Partei_der_Sowjetunion', 'Wasser',
                    'Festkörper', 'Seele', 'Geist', 'Dreifaltigkeit', 'Große', 'Christentum', 'Gott'],

                'sv': ['Svenska', 'Sverige', 'Danmark', 'Europeiska_unionen', 'Medeltiden', 'Feodalism', 'Kung',
                    'Kejsare', 'Monarki', 'Valmonarki', 'Choklad', 'Mjölk', 'Prolaktin', 'Kvinna', 'Eldvapen',
                    'Kina', 'Götar', 'Romantiken', 'Ideologi', 'Tänkande', 'Pedagogik', 'Sekund', 'Solen', 'Väder',
                    'Mellanöstern', 'Väte', 'Anatomi', 'Hjärta', 'Puls', 'Grekiska', 'Cypern'],

                'fr': ['Français', 'Langues_romanes', 'Charlemagne', 'Traité_de_Verdun', 'Louis_le_Pieux',
                    'Son_(physique)', 'Zoologie', 'Intelligence_animale', 'Intelligence', 'Tautologie',
                    'Pléonasme', 'Figure_de_style']

                # 'it': ['Lingua_italiana', 'Graffiti_(archeologia)', 'Impero_romano', 'Romolo_Augusto', 'Diritto_romano',
                #     'Europa', 'Continente', 'Islanda', 'Cioccolato', 'Alimento', 'Plantae', 'Aroma', 'Olfatto',
                #     'Organi_di_senso', 'Organismo_vivente', 'Epigenetica', 'Fenotipo', 'Composto_chimico',
                #     'Legame_covalente', 'Atomo', 'Materia_(fisica)', 'Energia', 'Fisica']
                 }


tags = {
    'en' : 'english',
    'fr' : 'french',
    'de' : 'german',
    'cs' : 'czech',
    'sv' : 'swedish'
}


In [0]:
'''
Returns a list of words present in all pages of the given language tag
'''
def generateDict(tag, maxLength):
    
    wiki.set_lang(tag)

    for article in articles[tag]:

        page = wiki.WikipediaPage(article)
        content = unidecode(page.content)
        wordList = generateWordList(content, maxLength)

    return wordList

In [0]:

'''
Generate a list of words from the given page content
'''
def generateWordList(pageContent, maxLength):

    words = re.sub(r'[^a-zA-Z ]', '', pageContent)
    words = words.lower()
    wordList = words.split()

    shortWords = []
    for word in wordList:
        if len(word) <= maxLength:
            shortWords.append(word)

    return shortWords

In [0]:

'''
Encode the words in their vector form.
abcdef.....xyz
000100.....000
Represents the character d
'''
def convertDictToVector(dic, wordLength):
    vecList = []

    for word in dic:
        vec = ''

        for i in word:

            currentLetter = i
            index = ord(currentLetter) - 97     #a = 0
            letter = ('0' * index) + '1' + ('0' * (25-index))
            vec += letter
        
        if len(word) < wordLength:

            for i in range(wordLength - len(word)):
                vec += '0' * 26

        vecList.append(vec)

    return vecList

In [0]:

'''
Creates the encoding for the output (languages)
'''
def createLanguageVector(tagIndex, numberOfLanguages):
    vec = ('0' * tagIndex) + '1' + ('0' * (numberOfLanguages - tagIndex - 1))
    return vec

In [0]:
import numpy as np
import pandas as pd

In [40]:
actualWords = []
wordVectors = []
languageVectors = []
currentLanguage = 0

for tag in articles.keys():
    print('Generating dataset for ' + tags[tag])

    dic = generateDict(tag, wordLength)
    print(len(dic), 'words in', tags[tag])
    print(currentLanguage)
    for word in dic:
        actualWords.append(word)
    
    vectors = convertDictToVector(dic, wordLength)
    for vector in vectors:
        wordVectors.append(vector)

    outputVector = createLanguageVector(currentLanguage, len(articles))
    for i in range(len(vectors)):
        languageVectors.append(outputVector)
    
    print(outputVector)
    
    currentLanguage += 1

Generating dataset for english
5923 words in english
0
10000
Generating dataset for czech
2746 words in czech
1
01000
Generating dataset for german
11510 words in german
2
00100
Generating dataset for swedish
2333 words in swedish
3
00010
Generating dataset for french
12531 words in french
4
00001


In [41]:
print(set(languageVectors))
print(languageVectors.count('10000'))
print(languageVectors.count('01000'))
print(languageVectors.count('00100'))
print(languageVectors.count('00010'))
print(languageVectors.count('00001'))

{'00100', '01000', '00001', '10000', '00010'}
5923
2746
11510
2333
12531


In [42]:
'''
Create a dataframe to save as csv for visualization
col1    col2-6  col7-32
Word    Output  Vectorized word
'''
print('Creating a dataframe for dataset')
c=0
arr = []
for i in range(len(actualWords)):
    entry = []
    entry.append(actualWords[i])

    for digit in languageVectors[i]:
        entry.append(float(digit))

    for digit in wordVectors[i]:
        entry.append(float(digit))

    if entry[2] == 1.0:
        c+=1
    arr.append(entry)
print(c)

Creating a dataframe for dataset
2746


In [43]:
arr = np.array(arr)
#np.save('arr.npy', arr)

print('Storing dataframe as a .csv file')
df = pd.DataFrame(arr)
df.to_csv('data.csv')

Storing dataframe as a .csv file


In [0]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn.model_selection import train_test_split

In [0]:
#data = np.load('arr.npy')

In [45]:
print(df.head())
print(df.tail())

         0    1    2    3    4    5    6    7    8    9    ...  308  309  310  \
0  mediawiki  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1         is  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2          a  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3       free  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        and  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

   311  312  313  314  315  316  317  
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 318 columns]
                0    1    2    3    4    5    6    7    8    9    ...  308  \
35038  linguistique  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0   
35039       portail  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0   
35040            de  

In [46]:
'''
data[0]
print(data[0,0])
print(data[0, 1:6])
print(data[0, 6:])
'''

labels = arr[:, 1:6]
inputs = arr[:, 6:]

print(arr.shape)
#print(data[11011,0])
#print(labels[11011])
#print(inputs[11011])

#print((labels[:,1] == '1.0').sum())

(35043, 318)


In [50]:
x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.15)

#print(data.shape)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(29786, 312)
(29786, 5)
(5257, 312)
(5257, 5)


In [51]:
print(x_train[0])   
print(y_train[0])

l = np.ndarray.tolist(x_train[0])
print(l)

['0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
 '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0

In [0]:
network = Sequential()
network.add(Dense(200, input_dim=26*wordLength, activation='sigmoid'))
network.add(Dense(150, activation='sigmoid'))
network.add(Dense(100, activation='sigmoid'))
network.add(Dense(100, activation='sigmoid'))
network.add(Dense(len(articles), activation='softmax'))

network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [53]:

filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
tboard = TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
callbacks_list = [checkpoint, tboard]

network.fit(x_train, y_train, epochs=100, batch_size=1000, validation_data=(x_test, y_test), callbacks=callbacks_list)

Train on 29786 samples, validate on 5257 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.80000, saving model to weights.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.80000
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.80000
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.80000
Epoch 5/100

Epoch 00005: val_acc improved from 0.80000 to 0.82819, saving model to weights.hdf5
Epoch 6/100

Epoch 00006: val_acc improved from 0.82819 to 0.83797, saving model to weights.hdf5
Epoch 7/100

Epoch 00007: val_acc improved from 0.83797 to 0.84619, saving model to weights.hdf5
Epoch 8/100

Epoch 00008: val_acc improved from 0.84619 to 0.85109, saving model to weights.hdf5
Epoch 9/100

Epoch 00009: val_acc improved from 0.85109 to 0.85809, saving model to weights.hdf5
Epoch 10/100

Epoch 00010: val_acc improved from 0.85809 to 0.86099, saving model to weights.hdf5
Epoch 11/100

Epoch 00011: val_acc improved from 0.86099 to 0.86559, saving model to 

<keras.callbacks.History at 0x7f665ba01358>