# **South African Language Identification**

## EDSA 2021 Classification Hackaton

## 1. Importing Packaging

In [1]:
# Libraries for data loading, data manipulation and data visulisation
import nltk
import re
import csv
import string
from PIL import Image
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# Downloads
#nlp = spacy.load('en')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Libraries for data preparation and model building
from collections import Counter
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, wordnet  
from sklearn.feature_extraction.text import CountVectorizer   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report, accuracy_score
#from sklearn.datasets import load_climate_change_tweet

# Setting global constants to ensure notebook results are reproducible
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
sns.set(rc={'figure.figsize':(12,8)})

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\36390\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\36390\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\36390\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\36390\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## 2. Loading Data

In [2]:
# Loading the South African Language Identification train and test data from csv files

df_train = pd.read_csv("train_set.csv")
df_test = pd.read_csv("test_set.csv")

## 3. Data Preprocessing

In [3]:
#view train dataset
df_train.head(10)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [4]:
# Checking the shape of the data_train set
df_train.shape

(33000, 2)

In [5]:
#view test dataset
df_test.head(10)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
5,6,"Ke feela dilense tše hlakilego, tša pono e tee..."
6,7,<fn>(762010101403 AM) 1495 Final Gems Birthing...
7,8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...
8,9,u-GEMS uhlinzeka ngezinzuzo zemithi yezifo ezi...
9,10,"So, on occasion, are statistics misused."


In [6]:
# Checking the shape of the data set
df_test.shape

(5682, 2)

In [7]:
#combine train and test datasets
df=pd.concat([df_train,df_test])
df.head()

Unnamed: 0,lang_id,text,index
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,
2,eng,the province of kwazulu-natal department of tr...,
3,nso,o netefatša gore o ba file dilo ka moka tše le...,
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,


## Data size and structure

In [8]:
#looking at the information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38682 entries, 0 to 5681
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   lang_id  33000 non-null  object 
 1   text     38682 non-null  object 
 2   index    5682 non-null   float64
dtypes: float64(1), object(2)
memory usage: 1.2+ MB


### Removing noise from the train and test data sets

In [9]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df_train['text'] = df_train['text'].replace(to_replace = pattern_url, value = subs_url, 
                                            regex = True)
df_test['text'] = df_test['text'].replace(to_replace = pattern_url, value = subs_url,
                                          regex = True)

#remove unusual charaters 
unusual_char = ['tÃƒÂ¢Ã¢â€šÂ¬Ã‚Â¦ï¿½ï¿½ï¿½ï¿½ï™']
normal_char = 'abcdefghijklmnopqrstuvwxyz123456789 \n'

def remove_unusual_char(post):
    return ''.join([l for l in post if l in normal_char])

df_train['text'] = df_train['text'].apply(remove_unusual_char)
df_train['text'].iloc[0]

df_test['text'] = df_test['text'].apply(remove_unusual_char)
df_test['text'].iloc[0]

#string of punctuations
print(string.punctuation)

#function to remove punctuation
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

df_train['text'] = df_train['text'].apply(remove_punctuation)
df_test['text'] = df_test['text'].apply(remove_punctuation)

# Remove all words below 3 characters
df_train['text'] = df_train['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
df_test['text'] = df_test['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Changing from uppercase to lowercase
df_train['text'] = df_train['text'].str.lower()
df_test['text'] = df_test['text'].str.lower()


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [10]:
# df_train without noise
df_train.head(10)

Unnamed: 0,lang_id,text
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,idha kuba nobulumko bokubeka umsebenzi naphi k...
2,eng,province kwazulunatal department transport inv...
3,nso,netefata gore file dilo moka dumelelanego tona...
4,ven,khomishini ndinganyiso mbeu maana mulayo khomi...
5,nso,dinyakiio dirwa gabedi ngwaga lebelela dipheto...
6,tsn,kgetse nngwe nngwe faposiwang tshekong temana ...
7,ven,mbadelo laelwa kwama mahatulele khothe madzhis...
8,nso,maloko dikhuduthamaga ikarabela mongwe mongwe ...
9,tsn,dirisiwa lebone tshwanetse bontsha lesedi beny...


In [11]:
# df_test without noise
df_test.head(10)

Unnamed: 0,index,text
0,1,masepala maemo kgethegileng letlelela kgato
1,2,zakwaziswa ngokufaneleko nakungafuneka eminye ...
2,3,shivhumbeo tshi fana ngano vhathu
3,4,inja nelikati betingevakala kutsi titsini nati...
4,5,inste buitelandse valuta
5,6,feela dilense hlakilego pono goba pedi lefelel...
6,7,fn76211143 1495 inal irthing ptionstxtfn
7,8,tjhafatso konteraka mosebetsi bonnete hore tso...
8,9,uhlinzeka ngezinzuzo zemithi yezifo ezingaphel...
9,10,occasion statistics misused


In [12]:
#splitting the data
X = df_train["text"]
y = df_train["lang_id"]

In [13]:
X.shape

(33000,)

In [14]:
# Split the data into training and testing test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train.head(10)

7223     tshedimoo mokgwa wona diinstituene sethaba por...
5196     toboketsa motheo reng moifo tshwereng maemong ...
6547     terreinontwikkelingsplan onderverdelingsplan w...
26890    ifreelance lengumagazini weinternet loniketelw...
30152    khumbula isikolo sakho kufuneka sivelise umgaq...
8557     impikiswano iyothulwa emnyangweni wezindaba za...
26111    strategiese doel loodsprojek horskool frikkie ...
24311    huna mauvha tiwaho dokhumenthe bviswaho tshime...
2214     population census most complex massive exercis...
20380    umongameli umandela wadlala indima eqakathekil...
Name: text, dtype: object

In [16]:
TFID = TfidfVectorizer(max_df=0.9,
                       ngram_range=(1, 5),
                       analyzer= 'char',
                       max_features=10000)
X_vec = TFID.fit_transform(X_train)

#transform X_train and X_test
A = TFID.transform(X_train)
B = TFID.transform(X_test)

In [17]:
# Fitting the Linear SVC model into the train dataset
lsvc = LinearSVC()
lsvc.fit(A, y_train)


LinearSVC()

In [26]:
# Fitting the Logistic Regression Classifier model into the train dataset
lm_full = LogisticRegression()
lm_full.fit(A, y_train)

LogisticRegression()

In [27]:
# Fitting the K-Nearest Neighbors model into the train dataset 
knn = KNeighborsClassifier()
knn.fit(A, y_train)

KNeighborsClassifier()

In [28]:
# Decision Tree
tree = DecisionTreeClassifier( random_state=42)
tree.fit(A, y_train)

DecisionTreeClassifier(random_state=42)

In [29]:
# Use trained model to run prediction on validation data
lsvc_pred= lsvc.predict(B)
#lm_full_pred= lm_full.predict(B)
#knn_pred= knn.predict(B)
#tree_pred= tree.predict(B)

# Score
print("Linear SVC Metrics")
print(metrics.classification_report(y_test, lsvc_pred))

#print("Logistic Regression Classifier Metrics")
#print(metrics.classification_report(y_test, lm_full_pred))

#print("K-Nearest Neighbors Metrics")
#print(metrics.classification_report(y_test, knn_pred))

#print("Decision Tree Metrics")
#print(metrics.classification_report(y_test, tree_pred))

Linear SVC Metrics
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       1.00      1.00      1.00       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      1.00      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       1.00      1.00      1.00       609
         zul       1.00      1.00      1.00       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [30]:
df_test['text'].shape

(5682,)

In [31]:
test_pred = lsvc.predict(TFID.transform(df_test['text']))

In [32]:
pred_df_train = pd.DataFrame(data=df_test['index'], columns=['index'])
pred_df_train.insert(1, 'lang_id', test_pred, allow_duplicates=False)

pred_df_train.head(10)

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
5,6,nso
6,7,eng
7,8,sot
8,9,zul
9,10,eng


In [None]:
# conversion to csv file
pred_df_train.to_csv('submission.csv', index=False)