In [29]:
import random

class Language:
    ENGLISH = 'ENGLISH'
    SWAHILI = 'SWAHILI'

class Phrase:
    def __init__(self, text, lang):
        self.text = text
        self.lang = lang
        self.language = self.get_language()
    
    def get_language(self):
        if self.lang == 'en':
            return Language.ENGLISH
        elif  self.lang == 'sw':
            return  Language.SWAHILI
        else:
            pass
    
class PhraseContainer:
    def __init__(self, phrases):
        self.phrases = phrases
    
    def get_text(self):
        return [x.text for x in self.phrases]
    
    def get_language_which(self):
        return [x.language for x in self.phrases]
    
    def evenly_distribute(self):
        random.shuffle(self.phrases)
            
    
    

#### Load Data

In [30]:
import json

filename = 'corpora.json'

phrases = []
with open (filename) as f:
    data = json.load(f)
    for dictionary in data:
        phrases.append(Phrase(dictionary['text'], dictionary['lang_id']))

#### Prep Data

In [31]:
from sklearn.model_selection import train_test_split

random.shuffle(phrases)
training, test = train_test_split(phrases, test_size=0.33, random_state=42)

train_container = PhraseContainer(training)

test_container = PhraseContainer(test)

In [40]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_language_which()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_language_which()

print(train_y.count(Language.ENGLISH))
print(train_y.count(Language.SWAHILI))

18743
11187


In [41]:
training[978].text

'A light part song, or madrigal, with a fa la burden or chorus,-- most common with the Elizabethan madrigal composers.'

#### Bag of words vectorization

In [42]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x_vectors[0].toarray())

[[0. 0. 0. ... 0. 0. 0.]]


#### Classification

#### Linear SVM

In [51]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

print(test_x[990])

clf_svm.predict(test_x_vectors[990])

diwanimjumbe aliyechaguliwa na wananchi wa kata ili awawakilishe katika baraza la jiji, manispaa au halmashauri ya wilaya


array(['SWAHILI'], dtype='<U7')

#### Decision Tree

In [44]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

print(test_x[0])
clf_dec.predict(test_x_vectors[0])

One who, or that which, gives energy, or acts in producing aneffect.


array(['ENGLISH'], dtype='<U7')

#### Naive Bayes

In [52]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

print(test_x[2])
clf_gnb.predict(test_x_vectors[2])

isotopuatomi yenye uzito tofauti na atomi nyingine


array(['SWAHILI'], dtype='<U7')

#### Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['ENGLISH'], dtype='<U7')

#### Evaluation

In [54]:
# Mean accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.9883334463813335
0.9750390015600624
0.9752424879603879
0.9881299599810079


In [55]:
# F1 scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Language.ENGLISH, Language.SWAHILI])
# f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Language.ENGLISH, Language.SWAHILI])
# f1_score(test_y, clf_gnb.predict(test_x_vectors), average=None, labels=[Language.ENGLISH, Language.SWAHILI])
# f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Language.ENGLISH, Language.SWAHILI])


array([0.99065522, 0.98447653])

In [70]:
test_set = ['tazama mwanakondoo wa Mungu', 'behold the lamb of God', ' mimi', ' because of']

new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['SWAHILI', 'ENGLISH', 'SWAHILI', 'ENGLISH'], dtype='<U7')

#### Tuning of the model(with Grid Search)

In [72]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

#### Save Model

In [73]:
import pickle

with open('C:\\Users\\Admin\\Documents\\DATA SCIENCE\\Data Analysis And Visualisation\\Ten Libraries\\language_detection\\language_detector.pkl', 'wb') as f:
    pickle.dump(clf, f)

#### Load model

In [74]:
with open('C:\\Users\\Admin\\Documents\\DATA SCIENCE\\Data Analysis And Visualisation\\Ten Libraries\\language_detection\\language_detector.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [75]:
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])

One who, or that which, gives energy, or acts in producing aneffect.


array(['ENGLISH'], dtype='<U7')

In [85]:
print(test_x[2])
loaded_clf.predict(test_x_vectors[2])

isotopuatomi yenye uzito tofauti na atomi nyingine


array(['SWAHILI'], dtype='<U7')

In [99]:
test_set = []
sentence = input("Enter a phrase to predict whether it is in english or swahili: ")
test_set.clear()
test_set.append(sentence)
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['ENGLISH'], dtype='<U7')

In [94]:
### very accurate