In [79]:
# Import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import itertools

from copy import deepcopy
from tqdm import tqdm
import time
tqdm.pandas()

#set plot style
sns.set()

#from wordcloud import WordCloud 

import nltk
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import urllib

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/admin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [80]:
# Load files
df_train= pd.read_csv('//Users/admin/Documents/Data Science/Explore/Advanced Classification/Hackaton/south-african-language-identification/train_set.csv')
df_test= pd.read_csv('//Users/admin/Documents/Data Science/Explore/Advanced Classification/Hackaton/south-african-language-identification/test_set.csv')
samplesubmission = pd.read_csv('//Users/admin/Documents/Data Science/Explore/Advanced Classification/Hackaton/south-african-language-identification/sample_submission.csv')


In [81]:
train = deepcopy(df_train)
test = deepcopy(df_test)

In [82]:
# Preview train dataset
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [134]:
# Preview test dataset
test.head()

Unnamed: 0,index,text,tokens,lemma,stop_word,stop_word_str
0,1,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...",umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,2,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...",i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,3,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,...","[the, province, of, kwazulu-natal, department,...","[province, kwazulu-natal, department, transpoi...",the province of kwazulu-natal department of tr...
3,4,o netefatša gore o ba file dilo ka moka tše le...,"[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[netefatša, gore, ba, file, dilo, ka, moka, tš...",o netefatša gore o ba file dilo ka moka tše le...
4,5,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...",khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [84]:
# Preview sample submission file
samplesubmission.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl



## Exploratory Data Analysis (EDA)


In [85]:
# Check size and shape of datasets
train.shape, test.shape

((33000, 2), (5682, 2))

In [86]:
value_counts = train["lang_id"].value_counts()
value_counts.name = "Raw Number"

value_normd = train["lang_id"].value_counts(normalize=True)*100
value_normd.name = "Percentage"

display(pd.concat([value_counts, value_normd], axis=1))

Unnamed: 0,Raw Number,Percentage
xho,3000,9.090909
eng,3000,9.090909
nso,3000,9.090909
ven,3000,9.090909
tsn,3000,9.090909
nbl,3000,9.090909
zul,3000,9.090909
ssw,3000,9.090909
tso,3000,9.090909
sot,3000,9.090909


## Data Engineering

In [87]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [88]:
def cleaner(language):
    language = language.lower()
    
    language = re.sub(r'@[A-Za-z0-9]+', '', language) #Remove @mentions
    language = re.sub(r':[\s]+', '', language)
    language = re.sub(r'#', '', language) #Remove # symbol
    language = re.sub(r'rt[\s]+', '', language) #Remove RT
    language = re.sub(r'https?:\/\/\S+', '', language) #Remove hyper-links
    
    # strip punctuation and special characters
    language = re.sub(r"[,.;':@#?!\&/$]+\ *", " ", language)
    # strip excess white-space
    language = re.sub(r"\s\s+", " ", language)
    
    return language.lstrip(" ")

In [89]:
train["text"] = train["text"].apply(cleaner)
test["text"] = train["text"].apply(cleaner)
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


# Tokenization

In [90]:
tokeniser = TreebankWordTokenizer()
train['tokens'] =train['text'].apply(tokeniser.tokenize)
test['tokens'] =train['text'].apply(tokeniser.tokenize)

## Lemmatization

In [91]:
# Create WordNetLemmatizer object
lemmatizer = WordNetLemmatizer()

def train_lemma(tokens, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in tokens]    

In [92]:
train['lemma'] = train['tokens'].apply(train_lemma, args=(lemmatizer, ))
test['lemma'] = test['tokens'].apply(train_lemma, args=(lemmatizer, ))

In [93]:
def remove_stop_words(tokens):    
    return [t for t in tokens if t not in stopwords.words('english')]

In [94]:
train['stop_word'] = train['tokens'].apply(remove_stop_words)
test['stop_word'] = test['tokens'].apply(remove_stop_words)

In [95]:
train.head()

Unnamed: 0,lang_id,text,tokens,lemma,stop_word
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik..."
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe..."
2,eng,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,...","[the, province, of, kwazulu-natal, department,...","[province, kwazulu-natal, department, transpoi..."
3,nso,o netefatša gore o ba file dilo ka moka tše le...,"[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[netefatša, gore, ba, file, dilo, ka, moka, tš..."
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew..."


In [96]:
# Converting list to string
train['stop_word_str'] = [' '.join(map(str,l)) for l in train['lemma']]
train.head()

Unnamed: 0,lang_id,text,tokens,lemma,stop_word,stop_word_str
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...",umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...",i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,...","[the, province, of, kwazulu-natal, department,...","[province, kwazulu-natal, department, transpoi...",the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,"[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[netefatša, gore, ba, file, dilo, ka, moka, tš...",o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...",khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [97]:
test['stop_word_str'] = [' '.join(map(str,l)) for l in test['lemma']]
test.head()

Unnamed: 0,index,text,tokens,lemma,stop_word,stop_word_str
0,1,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...",umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,2,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...",i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,3,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,...","[the, province, of, kwazulu-natal, department,...","[province, kwazulu-natal, department, transpoi...",the province of kwazulu-natal department of tr...
3,4,o netefatša gore o ba file dilo ka moka tše le...,"[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[netefatša, gore, ba, file, dilo, ka, moka, tš...",o netefatša gore o ba file dilo ka moka tše le...
4,5,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...",khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [98]:
#create a new dataframe with the sentiment and lemma columns
train2=train[['lang_id', 'stop_word_str']]
train2.head()

Unnamed: 0,lang_id,stop_word_str
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


## Vectorization

In [99]:
X, y= train2.iloc[:,0], train2.iloc[:,1]

In [100]:
from sklearn.model_selection import train_test_split

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [102]:
vect = CountVectorizer(max_features=1000)
X_count = vect.fit_transform(train2['stop_word_str'].values.astype(str))
X_count_test = vect.fit_transform(test['stop_word_str'].values.astype(str))

In [103]:
X_count.shape

(33000, 1000)

In [104]:
X = X_count.toarray()

In [115]:
X_test_pred= X_count_test.toarray()

In [116]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit label encoder and return encoded labels
y = le.fit_transform(train2['lang_id'])

In [117]:
y

array([9, 9, 1, ..., 1, 9, 4])

In [129]:
lang_labels = list(le.classes_)

In [118]:
#Splitting Data into Training and Testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

In [119]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26400, 1000)
(6600, 1000)
(26400,)
(6600,)


In [120]:
from sklearn.linear_model import LogisticRegression

In [121]:
#Creating instance for LogisticRegression
lr = LogisticRegression(multi_class='ovr')

In [122]:
#Fitting the model
lr.fit(X_train, y_train)

LogisticRegression(multi_class='ovr')

In [123]:
# Form a prediction set
y_pred = lr.predict(X_test)

In [124]:
y_pred

array([10,  3,  5, ...,  3,  2,  2])

In [None]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test, target_names=type_labels))

import warnings
warnings.filterwarnings('ignore')

In [132]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=lang_labels))
#print(f'{classification_report(y_test, y_pred)}')

import warnings
warnings.filterwarnings('ignore')

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       632
         eng       0.99      1.00      1.00       595
         nbl       0.92      0.89      0.91       607
         nso       1.00      0.99      1.00       573
         sot       1.00      1.00      1.00       572
         ssw       0.90      0.95      0.92       647
         tsn       0.99      1.00      0.99       568
         tso       1.00      1.00      1.00       576
         ven       1.00      1.00      1.00       601
         xho       0.93      0.91      0.92       598
         zul       0.89      0.87      0.88       631

    accuracy                           0.96      6600
   macro avg       0.97      0.97      0.97      6600
weighted avg       0.96      0.96      0.96      6600



In [140]:
predict= y_pred_pred = lr.predict(X_test_pred)

In [141]:
predict

array([5, 5, 2, ..., 4, 3, 5])