In [1]:
import pandas as pd
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import re
import keras
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
# text pre-processing functions
def clean_text(text):
    text = text.replace('\nBULLET::::', ' ')
    text = text.replace('BULLET::::-', ' ')
    text = text.replace('BULLET::::', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('\n\n', ' ')
    text = text.replace(r',', '')
    text = text.replace('.', '')
    text = text.replace(' - ', '')
    text = text.replace('-', '')
    text = text.replace('&nbsp;', ' ')
    text = text.replace('Page', ' ')
    text = text.replace(':', ' ')
    text = text.replace(';', ' ')
    text = text.replace('"', '')
    text = text.replace("'", '')
    text = text.replace('(', '')
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = text.replace(')', '')
    text = text.strip()
    text = re.sub(r'\d+.', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text

def get_tamil_stop_words():
    swdf1 = pd.read_csv("data/TamilNLP_TamilStopWords.txt",  header=None) 
    swdf2 = pd.read_csv("data/custom_tamil_stop_words.txt",  header=None) 
    sw1 = swdf1[0].tolist()
    sw2 = swdf2[0].tolist()
    tamil_stop_words = list(set(sw1 + sw2))
    return tamil_stop_words

In [3]:
# Load the Tamil wiki lablled data
df = pd.read_csv("data/cleaned_tamil_wiki_text_data.csv")
df = df.sample(frac=1)
df.shape

(16250, 12)

In [4]:
# Check how many articles are more than 300 words, 
df['word_count'] = df.text.apply(lambda x: len(str(x).split()))
more_than_300 =  df['word_count'] > 300
df3 = df[more_than_300]
df3.shape

(2157, 13)

In [5]:
# Split the data into train and test
train_set,test_set = train_test_split(df,test_size=0.15,random_state=50)
X_train = train_set.text
X_test = test_set.text
y_train = train_set.parent_category
y_test = test_set.parent_category

In [6]:
# convert text into embedded vectors
maxlen = 325
tokenizer = Tokenizer(num_words=10000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(X_train)

X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

# Pad the text to a constant length
X_train = pad_sequences(X_train_tokens, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test_tokens, padding='post', maxlen=maxlen)

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

403044


In [7]:
# Encode the labels, note the labels_index so we can recover the labels when we predict
category_encoding_test = pd.factorize(y_test)
category_encoding = pd.factorize(y_train)
labels, labels_index = category_encoding
labels_test, labels_index_test = category_encoding_test
y_train_encoded = keras.utils.to_categorical(labels)
y_test_encoded = keras.utils.to_categorical(labels_test)
labels_index

Index(['சமூகம்', 'மானிடவியல்', 'சமயம்', 'கணினியியல்', 'உயிரியல்', 'மொழி',
       'ஊடகவியல்', 'கணிதம்', 'இயற்பியல்', 'வேதியியல்', 'திரைப்படம்', 'சட்டம்',
       'கல்வி', 'பண்பாடு', 'வரலாறு', 'கட்டிடக்கலை', 'அரசியல்', 'புவியியல்',
       'வணிகவியல்', 'உளவியல்', 'இசை', 'வானியல்', 'நலம்', 'இலக்கியம்',
       'தொழினுட்பம்'],
      dtype='object')

In [8]:
# Build embeddings
embeddings_dictionary = dict()
## Note part of github repo, pls download it from 
## https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
glove_file = open('data/cc.ta.300.vec', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [9]:
from keras.layers import Dense, Flatten, LSTM, Conv1D, SpatialDropout1D, MaxPooling1D, Dropout, Activation, GlobalMaxPooling1D
model_conv = Sequential()
model_conv.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=maxlen))
model_conv.add(Conv1D(100, 5))
#model_conv.add(Flatten())
model_conv.add(GlobalMaxPooling1D())
model_conv.add(Dense(125, activation='relu'))
model_conv.add(Dense(25, activation='softmax'))
model_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_conv.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 325, 300)          120913200 
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 321, 100)          150100    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 125)               12625     
_________________________________________________________________
dense_2 (Dense)              (None, 25)                3150      
Total params: 121,079,075
Trainable params: 121,079,075
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Fit the model
epochs = 12
batch_size = 650
model_conv.fit(X_train, y_train_encoded, validation_split=0.2, epochs = epochs, batch_size=batch_size)

  num_elements)


Train on 11049 samples, validate on 2763 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f7c2d1e5390>

In [11]:
loss, accuracy = model_conv.evaluate(X_test, y_test_encoded, verbose=2)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 1.230517


In [12]:
tamil_stop_words = get_tamil_stop_words()

In [13]:
# Load external validation dataset
validation_df = pd.read_csv("data/validation.csv")
validation_df['text'] = validation_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (tamil_stop_words)]))
validation_df['text'] = validation_df.apply(lambda row: clean_text(row['text']),axis=1)
validation_df.head(2)

Unnamed: 0.1,Unnamed: 0,title,source,parent_category,text
0,,சுமத்ரான் காண்டாமிருக இனம் முற்றிலும் அழிந்துவ...,https://www.bbc.com/tamil/science-50534560,உயிரியல்,மனிதர்களின் வேட்டைகளினாலும் வாழிட பரப்பு அழிப்...
1,,பிளாஸ்டிக்கில் இருந்து எண்ணெய் எடுக்கும் முயற்...,https://www.bbc.com/tamil/global-50199538,வேதியியல்,ஆய்வகத்தில் உலைக்கலனை கென்னத் போயிப்பெல்மெயர் ...


In [14]:
# predict on external data validation set
V_train = tokenizer.texts_to_sequences(validation_df.text)
V_train = pad_sequences(V_train, padding='post', maxlen=maxlen)
validation_predictions = model_conv.predict_classes(V_train)
validation_predictions

array([24, 18,  8, 21, 22,  0,  0,  4,  9,  8, 13, 24,  1, 23])

In [15]:
# calculate accuracy
j = 0
correct = 0
for y_1 in validation_predictions:
    actual = validation_df.iloc[j].parent_category
    predicted = labels_index[y_1]
    #print ("actual: " + actual + " " + "predicted: " + predicted)
    if actual == predicted:
        correct = correct + 1
    j = j + 1
print("validation accuracy: " + str(correct/j))

validation accuracy: 0.35714285714285715


In [20]:
# predict one case as a sanity check
test_id = 4565
df.iloc[test_id]

parent_category                                                                 நலம்
Unnamed: 1                                                                     12536
Unnamed: 0                                                                    107760
id                                                                            141616
url                                       https://ta.wikipedia.org/wiki?curid=141616
title                                                                 கருணைக்கிழங்கு
text                               கருணைக்கிழங்கு கருனை அல்லது பூமி சல்லரைக்கிழங்...
categories                                              ['காய்கறிகள்', 'கிழங்குகள்']
parent_category.1                                                               நலம்
parent_category_recursion_depth                                                    3
is_cricketer                                                                   False
is_temple                                                        

In [21]:
# predict one case as a sanity check
input_text = [df.iloc[test_id].text]
seq = tokenizer.texts_to_sequences(input_text)
padded = pad_sequences(seq, maxlen=maxlen)
ynew = model_conv.predict(padded)
y_classes = ynew.argmax(axis=-1)
labels_index[y_classes[0]]

'நலம்'