In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('train_set.csv')

In [3]:
df.head(15)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [4]:
df_test = pd.read_csv('test_set.csv', index_col= 0)
df_test.head(10)

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
3,Tshivhumbeo tshi fana na ngano dza vhathu.
4,Kube inja nelikati betingevakala kutsi titsini...
5,Winste op buitelandse valuta.
6,"Ke feela dilense tše hlakilego, tša pono e tee..."
7,<fn>(762010101403 AM) 1495 Final Gems Birthing...
8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...
9,u-GEMS uhlinzeka ngezinzuzo zemithi yezifo ezi...
10,"So, on occasion, are statistics misused."


In [24]:
def clean(text):
    text = re.sub(r'-','',text)
    text = re.sub(r',','',text)
    text = re.sub(r'\d+','',text)
    text = re.sub(r'\<fn>','',text)
    text = re.sub(r'</fn>','',text)
    text = re.sub(r'( AM)','',text)
    text = re.sub(r'()','',text)
    return text
df_test['text'] = df_test['text'].apply(clean)
df_test.head(9)

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
1,Mmasepala fa maemo a a kgethegileng a letlelel...
2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
3,Tshivhumbeo tshi fana na ngano dza vhathu.
4,Kube inja nelikati betingevakala kutsi titsini...
5,Winste op buitelandse valuta.
6,Ke feela dilense tše hlakilego tša pono e tee ...
7,() Final Gems Birthing Options_ZULU.txt
8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...
9,uGEMS uhlinzeka ngezinzuzo zemithi yezifo ezin...


In [6]:
y = df['lang_id']
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=42)

In [7]:
count_vectorizer = CountVectorizer()
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

In [7]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)
tfidf_df_test = tfidf_vectorizer.transform(df_test['text'].values)

In [8]:
svc = SVC()

In [9]:
svc.fit(tfidf_train, y_train)
y_predict = svc.predict(tfidf_test)
y_predict

array(['sot', 'nso', 'eng', ..., 'afr', 'nso', 'eng'], dtype=object)

In [10]:
f1_score_cv_svc = f1_score(y_test, y_predict, average='macro')
f1_score_cv_svc

0.9951233168220898

In [12]:
df_test['text'] = df_test['text'].apply(clean)
count_df_test = count_vectorizer.transform(df_test['text'].values)
prd = svc.predict(count_df_test)

In [13]:
prd

array(['ssw', 'nbl', 'ssw', ..., 'sot', 'sot', 'zul'], dtype=object)

In [14]:
sub = pd.DataFrame({"index": df_test.index, "lang_id": prd})
sub.to_csv('sub_3.csv', index = False)
sub.head(20)

Unnamed: 0,index,lang_id
0,1,ssw
1,2,nbl
2,3,ssw
3,4,ssw
4,5,ssw
5,6,nso
6,7,ssw
7,8,sot
8,9,zul
9,10,zul


In [15]:
df_sub = pd.read_csv('sub_3.csv', index_col = 0)
df_sub

Unnamed: 0_level_0,lang_id
index,Unnamed: 1_level_1
1,ssw
2,nbl
3,ssw
4,ssw
5,ssw
...,...
5678,ssw
5679,nso
5680,sot
5681,sot


In [16]:
df_test['text'] = df_test['text'].apply(clean)
df_test.head(9)

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
1,Mmasepala fa maemo a a kgethegileng a letlelel...
2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
3,Tshivhumbeo tshi fana na ngano dza vhathu.
4,Kube inja nelikati betingevakala kutsi titsini...
5,Winste op buitelandse valuta.
6,Ke feela dilense tše hlakilego tša pono e tee ...
7,() Final Gems Birthing Options_ZULU.txt
8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...
9,uGEMS uhlinzeka ngezinzuzo zemithi yezifo ezin...


In [27]:
rfc = RandomForestClassifier()

In [41]:
rfc.fit(tfidf_train, y_train)
rfc_pred = rfc.predict(tfidf_test)
rfc_pred

array(['sot', 'nso', 'eng', ..., 'afr', 'nso', 'eng'], dtype=object)

In [42]:
f1_score = f1_score(y_test, rfc_pred, average="macro")
f1_score

TypeError: 'numpy.float64' object is not callable

In [39]:
count_df_test = count_vectorizer.transform(df_test['text'].values)
rfc_pr = rfc.predict(count_df_test)
rfc_pr

array(['tsn', 'nbl', 'ven', ..., 'sot', 'sot', 'ssw'], dtype=object)

In [40]:
rfc_prd = rfc.predict(tfidf_df_test)
rfc_prd

array(['tsn', 'nbl', 'ven', ..., 'sot', 'sot', 'ssw'], dtype=object)

In [None]:
tfidf_df_test

In [31]:
sub = pd.DataFrame({"index": df_test.index, "lang_id": rfc_prd})
#sub.to_csv('sub_456csv', index = False)
sub.head(20)

Unnamed: 0,index,lang_id
0,1,zul
1,2,zul
2,3,ven
3,4,ssw
4,5,zul
5,6,nso
6,7,zul
7,8,sot
8,9,zul
9,10,eng
