In [30]:
import string

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv("csv_data/salt-train-v1.csv")
data.dropna(axis=0, inplace=True)

In [31]:
"""
 * transform_txt_col -- transforms the text column
 * 
 * @text <str>: text to transform
 * 
 * Description: 
 * Removes the trailing full stop
 * Removes words less than 3 characters
 * Changes words to lowercase
 *
 * Return: transformed text
"""
def transform_txt_col(text: str)-> str:
    
    text = text.strip(".")
    text = " ".join([ word.lower() for word in text.split(" ") if len(word) > 2 ])
    text = remove_non_ascii_chars(text)
                     
    return text

                     
"""
 * remove_non_ascii_chars -- deletes characters
 * 
 * @text <str>: text to delete characters from 
 *
 * Depends: none_ascii_chars <list of non-ascii characters>
 *
 * Return: formatted text
"""
def remove_non_ascii_chars(text: str)->str:
    for character in none_ascii_chars:
        if character in text:
            text = text.replace(character, "")
    return text


In [32]:
# get uncommon characters
ascii_chars = set(string.ascii_lowercase)
full_corpus_uniq_chars = set("".join([text for text in data.text])) # all characters in the text data
none_ascii_chars = full_corpus_uniq_chars.difference(ascii_chars)
none_ascii_chars.remove(" ") # remove the \0 character

In [33]:
data['text'] = data['text'].apply(transform_txt_col)

data['len_text'] = data['text'].str.len()
data['unique_chars'] = data['text'].str.join("").apply(set).apply(len) # no of unique chars in text

In [34]:
data.head()

Unnamed: 0,text,lang,len_text,unique_chars
0,was not ghost refugee camp,eng,26,16
1,enkambi yabanoonyiboobubudamu teyaliiwo bulimba,lug,47,15
2,obedo kem goba goba,ach,19,9
3,mam arai ekabi erai ekwam,teo,25,9
4,eri aani ndra kembe emunyale eyini aazu inzoru,lgg,46,15


In [5]:
# remove empty text
data = data[data.text != ""]

In [6]:
# data.describe()

In [7]:
data[data.lang == "eng"].describe()

Unnamed: 0,len_text,unique_chars
count,23947.0,23947.0
mean,48.094083,17.092454
std,16.810643,2.466982
min,6.0,5.0
25%,36.0,16.0
50%,47.0,17.0
75%,59.0,19.0
max,130.0,24.0


In [35]:
data[data.lang == "lug"].describe()

Unnamed: 0,len_text,unique_chars
count,23947.0,23947.0
mean,52.932225,16.215184
std,20.113512,2.496002
min,6.0,4.0
25%,38.0,15.0
50%,51.0,16.0
75%,66.0,18.0
max,153.0,24.0


In [9]:
# data[data.lang == "ach"].describe()

In [10]:
# data[data.lang == "teo"].describe()

In [11]:
# data[data.lang == "lgg"].describe()

In [36]:
data[data.lang == "nyn"].describe()

Unnamed: 0,len_text,unique_chars
count,23947.0,23947.0
mean,56.25452,16.257485
std,21.544976,2.232146
min,5.0,5.0
25%,41.0,15.0
50%,54.0,16.0
75%,69.0,18.0
max,185.0,24.0


In [13]:
train_data = data[['text', 'lang']]

In [14]:
replace_dict = {
    "eng": 0,
    "lug": 1,
    "ach": 2,
    "teo": 3,
    "lgg": 4,
    "nyn": 5
}

train_data['lang'] = train_data.lang.map(replace_dict)

In [15]:
# train_data.head()

In [17]:
texts = train_data['text']
labels = train_data['lang']

In [18]:
max_len = 150  # Maximum length of each sequence (number of characters)

tokenizer = Tokenizer(char_level=True) 
tokenizer.fit_on_texts(texts)

char_index = tokenizer.word_index
print(f"Found {len(char_index)} unique characters.")

sequences = tokenizer.texts_to_sequences(texts)
print(sequences[0])

text_data = pad_sequences(sequences, maxlen=max_len)
print(text_data[0])

labels = to_categorical(labels, num_classes=6)
labels = np.array(labels)


Found 28 unique characters.
[18, 2, 13, 1, 6, 5, 10, 1, 15, 19, 5, 13, 10, 1, 9, 4, 23, 7, 15, 4, 4, 1, 21, 2, 12, 20]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0 18  2 13  1  6  5 10  1 15 19  5 13 10  1  9  4 23  7 15  4
  4  1 21  2 12 20]


In [19]:
labels[0]

array([1., 0., 0., 0., 0., 0.], dtype=float32)

In [20]:
text_data.shape, labels.shape

((143680, 150), (143680, 6))

In [21]:
# Initialize the model
model = Sequential()

# Add an Embedding layer to handle character embeddings
model.add(Embedding(len(char_index) + 1, 32, input_length=max_len))  # Add +1 for the unknown character
model.add(LSTM(64))  # LSTM layer with 64 units
model.add(Dense(6, activation='sigmoid'))  # Output layer with sigmoid activation for binary classification


In [22]:

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [23]:
# Training the model
epochs = 5
batch_size = 32
model.fit(text_data, labels, epochs=epochs, batch_size=batch_size, validation_split=0.1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fa929dd68e0>

In [24]:
# preparing the test data

test_data = pd.read_csv("csv_data/salt-test-v1.csv")

test_data['text'] = test_data['text'].apply(transform_txt_col)
test_data_texts = test_data['text']
test_data_texts = tokenizer.texts_to_sequences(test_data_texts)
test_data_texts = pad_sequences(test_data_texts, maxlen=max_len)

test_data['lang'] = test_data['lang'].map(replace_dict)
test_data_labels = test_data['lang']
test_data_labels = to_categorical(test_data_labels, num_classes=6)
test_data_labels = np.array(test_data_labels)


In [25]:
loss, accuracy = model.evaluate(test_data_texts, test_data_labels)

print(f"Loss: {loss}")


Loss: 0.0250179935246706


In [26]:
new_text = "nibwigya"


new_sequence = tokenizer.texts_to_sequences([new_text])
new_data = pad_sequences(new_sequence, maxlen=max_len)

predictions = model.predict(new_data)
print(predictions)

predicted_class = np.argmax(predictions, axis=1)
print(predicted_class)

[[0.19496532 0.94518447 0.27659863 0.02539157 0.02784669 0.9657331 ]]
[5]


In [29]:
# import pickle
# with open("tokenizer.pickle", "wb") as _tok:
# #     pickle.dump(tokenizer, _tok)
    
# with open("model_1.pickle", "wb") as _model:
# #     pickle.dump(model, _model)
    