In [1]:
# Import packages

# Database manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud

# Natural language Toolkit packages.
# Necessary libraries and modules that are 
# going to help us do the data processing 
# from the nltk library.
import nltk

nltk.download(['punkt','stopwords'])
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import string

# Regular expression
import re

# to make bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages to create models
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


from sklearn.svm import LinearSVC  
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# Model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

# Save trained models
import pickle


import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to C:\Users\Malibongwe
[nltk_data]     Shange\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Malibongwe
[nltk_data]     Shange\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Malibongwe
[nltk_data]     Shange\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Import dataset

df = pd.read_csv("train_set.csv")
df1 = pd.read_csv("test_set.csv")
pd.set_option('display.max_colwidth',None)
pd.set_option('display.max_rows', None)

In [4]:
df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months
3,nso,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone i ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso


In [5]:
df["lang_id"].value_counts()

ssw    3000
tso    3000
eng    3000
nso    3000
ven    3000
zul    3000
nbl    3000
xho    3000
sot    3000
afr    3000
tsn    3000
Name: lang_id, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [7]:
def clean_text(text):
    text = re.sub(r'[-!@#$(),n"%^*?:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    return text
df['cleaned_text'] = df['text'].apply(clean_text)
df1['cleaned_text'] = df1['text'].apply(clean_text)

In [8]:
df.head()

Unnamed: 0,lang_id,text,cleaned_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika,umgaqo siseko we za amalu giselelo kumaziko axhasa ulawulo lwesi i zi ku ye okuthath i xaxheba kwabafazi ezi ziquka phakathi kwezi ye zazo ikomisho i yokuli ga a gokwesi i ikomisho i yamalu gelo olu tu lomza tsi afrika
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo,i dha iya kuba obulumko bokubeka umsebe zi aphi a kwisebe gokusekwe kwiimfu o zokusebe za zalo emva kokubo a a omsebe zi ku ye oka ye ima ya o yakhe ukuba ula dulo lom tu o jalo alufa eleka ga i dha mayibize u cedo olufa elekileyo elu gelwe i layo
2,eng,the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months,the provi ce of kwazulu atal departme t of tra sport i vites te ders from established co tractors experie ced i bridge co structio for the co structio of the kwajolwayo tugela river pedestria bridge ear tugela ferry the duratio of the project will be mo ths
3,nso,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj,o etefatša gore o ba file dilo ka moka tše le dumelela ego ka tšo a mohlala maleri a magolo a a šomišwago go fihlelela meago g e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone i ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso,khomishi i ya di ga yiso ya mbeu yo ewa maa a u ya ga mulayo wa khomishi i ya di ga yiso ya mbeu u thetshelesa mbilaelo dzi e dza tshimbilela a a tshialula u ya ga mbeu aho e i ivhea sa foramu ya thu go u ya ga mulayo wa di ga yiso


In [9]:
df1.head()

Unnamed: 0,index,text,cleaned_text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlelela kgato eo.",mmasepala fa maemo a a kgethegile g a letlelela kgato eo.
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye imitlolo engezelelako ukuqedelela ukutloliswa kwesibawo sakho.,uzakwaziswa gokufa eleko aku gafu eka emi ye imitlolo e gezelelako ukuqedelela ukutloliswa kwesibawo sakho.
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fa a a ga o dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini naticocisana.,kube i ja elikati beti gevakala kutsi titsi i aticocisa a.
4,5,Winste op buitelandse valuta.,wi ste op buitela dse valuta.


In [10]:
X = df["cleaned_text"]
y = df["lang_id"]

In [11]:
# Split 1 : Normal data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 , random_state = 42)

In [12]:
# Declare TfidfVectorizer
TDIF_vector_1 = TfidfVectorizer(max_df=0.9, ngram_range=(1, 5), analyzer= 'char',max_features=10000)

In [13]:
LR_model_1 = LogisticRegression()
LR_model_1 = Pipeline([('tfidf' ,TDIF_vector_1), ('cfl', LogisticRegression(max_iter=1000))])

In [14]:
LR_model_1.fit(X_train, y_train)
y_pred = LR_model_1.predict(X_test)

In [15]:
f1score_4 = f1_score(y_test, y_pred, average= 'weighted')
print('f1 score:', f1score_4)

f1 score: 0.9975773317158672


In [16]:
# Classification Report
report = metrics.classification_report(y_test, y_pred)

In [17]:
print(report)

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.99      0.99      0.99       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      1.00      1.00       584
         tsn       0.99      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       1.00      1.00      1.00       609
         zul       0.99      0.99      0.99       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [18]:
y_test = LR_model_1.predict(df1['cleaned_text'])
results = pd.DataFrame({'index' : df1['index'],'lang_id' : y_test})
results.to_csv('submission.csv', index = False)

In [19]:
results

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
5,6,nso
6,7,eng
7,8,sot
8,9,zul
9,10,eng
