In [49]:
import pandas as pd
import re
import string
import matplotlib.pyplot as plt

from sklearn import feature_extraction
from sklearn import  linear_model
from sklearn import pipeline
from sklearn import metrics

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB


from keras import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense, Dropout



## Loading Datasets

In [50]:
# Read the labelled training set; the path is relative to the working directory
train = pd.read_csv(filepath_or_buffer='train_set.csv')


In [51]:
# Read the test set; the path is relative to the working directory
test = pd.read_csv(filepath_or_buffer='test_set.csv')

## Creating a copy

In [52]:
# Work on a deep copy so the original `train` frame stays untouched
df = train.copy(deep=True)
df

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [53]:
# Work on a deep copy so the original `test` frame stays untouched
df_test = test.copy(deep=True)
df_test

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
...,...,...
5677,5678,You mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


## Drop duplicates

In [54]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# The explicit .copy() makes `df` an independent frame, so the column assignments in
# the text-cleaning cells do not raise SettingWithCopyWarning.
df = df.drop_duplicates(keep='first').copy()


## Text Cleaning

In [55]:
# Replace every character that is not an ASCII letter or '#' with a space in the train text.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the text unchanged.
# NOTE(review): the ASCII-only character class also strips accented letters such as 'š' - confirm this is intended.
df['text'] = df['text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [56]:

# Replace every character that is not an ASCII letter or '#' with a space in the test text.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the text unchanged.
# NOTE(review): the ASCII-only character class also strips accented letters such as 'š' - confirm this is intended.
df_test['text'] = df_test['text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [57]:
# Keep only the tokens longer than three characters in each train text
def _keep_long_words(sentence):
    """Return `sentence` rebuilt from its whitespace-separated tokens of length > 3."""
    return ' '.join(token for token in sentence.split() if len(token) > 3)

df['text'] = df['text'].map(_keep_long_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [58]:


# Delete every run made of a lowercase 'r' followed by one or more ASCII letters
def text_cleaning(text):
  """Return `text` with each match of the pattern "r[a-zA-Z]+" removed.

  NOTE(review): the pattern is a plain string beginning with a literal 'r', so
  words are truncated at their first 'r' that is followed by a letter
  (e.g. "province" -> "p"). This looks like a misplaced raw-string prefix;
  confirm the intended pattern before relying on the cleaned text.
  """
  pattern = re.compile("r[a-zA-Z]+")
  return pattern.sub("", text)

In [59]:
# Run text_cleaning over every train text, then preview the first rows
df['text'] = df['text'].apply(text_cleaning)
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,lang_id,text
0,xho,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,kuba nobulumko bokubeka umsebenzi naphi kwiseb...
2,eng,p kwazulu natal depa t invites tende f establi...
3,nso,netefat go file dilo moka dumelelanego mohlala...
4,ven,khomishini ndinganyiso mbeu maana mulayo khomi...


In [60]:
# Show the distinct cleaned train texts
pd.unique(df['text'])

array(['umgaqo siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi af',
       'kuba nobulumko bokubeka umsebenzi naphi kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga mayibize uncedo olufanelekileyo elungelweni layo',
       'p kwazulu natal depa t invites tende f established cont expe b const const kwajolwayo tugela  pedest b near tugela fe du p will months',
       ...,
       'closing date submission completed tende august late submissions will conside submissions must enclosed sealed envelope add chief executive officer above add',
       'nawuphina umntu ofunyenwe enetyala phantsi kwalo mthetho ujongene ujungene nesohlwayo sokudliwa imali okanye ukuvalelwa ixesha elingadluliyo kwiinyanga ezintandathu okanye kuzo zombini isohlwayo sokudl

##  Exploratory Data Analysis (EDA)

## Model Building

In [61]:
# Separate the features and the target variable.
# Both are taken from the raw `train` frame, not from the cleaned copy `df`.
X, y = train['text'], train['lang_id']

In [62]:
# Build TF-IDF features from word unigrams and bigrams, ignoring terms that occur in
# fewer than 2 documents and dropping English stop words.
# TfidfVectorizer is reached through the already-imported `feature_extraction` package:
# the bare name is never imported in this notebook, so the previous call raised
# NameError on a fresh kernel.
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

In [63]:
# Hold out 10% of the vectorized train data as a validation set, stratified on the label
X_train, X_val, y_train, y_val = train_test_split(
    X_vectorized,
    y,
    test_size=.1,
    random_state=11,
    shuffle=True,
    stratify=y,
)

## Logistic Regression

In [64]:
#first model

In [65]:

# Fit a one-vs-rest logistic regression and score it on the validation split
modelstart = time.time()  # the timer covers fitting, prediction and scoring

logreg = LogisticRegression(
    C=1000,
    solver='saga',
    multi_class='ovr',
    max_iter=10,
    random_state=42,
).fit(X_train, y_train)
y_pred = logreg.predict(X_val)

# Weighted F1 on the validation set, rounded to two decimals
logreg_f1 = round(f1_score(y_val, y_pred, average='weighted'), 2)

print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds" % (time.time() - modelstart))

# Classification report as a table, one row per report entry
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).T
results



Accuracy 0.9960606060606061
Model Runtime: 6.39 seconds




Unnamed: 0,precision,recall,f1-score,support
afr,0.996678,1.0,0.998336,300.0
eng,0.993377,1.0,0.996678,300.0
nbl,0.986799,0.996667,0.991708,300.0
nso,0.996667,0.996667,0.996667,300.0
sot,1.0,0.996667,0.998331,300.0
ssw,0.996667,0.996667,0.996667,300.0
tsn,0.996667,0.996667,0.996667,300.0
tso,1.0,1.0,1.0,300.0
ven,1.0,1.0,1.0,300.0
xho,0.993355,0.996667,0.995008,300.0


## Linear SVC

In [66]:
#second model Linear svc

In [67]:
from sklearn.svm import LinearSVC

# Fit a linear SVC with default settings and score it on the validation split
modelstart = time.time()  # the timer covers fitting, prediction and scoring

linsvc = LinearSVC().fit(X_train, y_train)
y_pred = linsvc.predict(X_val)

# Weighted F1 on the validation set, rounded to two decimals
linsvc_f1 = round(f1_score(y_val, y_pred, average='weighted'), 2)

print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds" % (time.time() - modelstart))

# Classification report as a table, one row per report entry
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).T
results

Accuracy 0.9966666666666667
Model Runtime: 2.53 seconds


Unnamed: 0,precision,recall,f1-score,support
afr,0.996678,1.0,0.998336,300.0
eng,0.993377,1.0,0.996678,300.0
nbl,0.986799,0.996667,0.991708,300.0
nso,0.996667,0.996667,0.996667,300.0
sot,1.0,0.996667,0.998331,300.0
ssw,0.996667,0.996667,0.996667,300.0
tsn,0.996667,0.996667,0.996667,300.0
tso,1.0,1.0,1.0,300.0
ven,1.0,1.0,1.0,300.0
xho,1.0,0.996667,0.998331,300.0


## Naive Bayes

In [68]:
#third model naive bayes

In [71]:

# Fit a multinomial Naive Bayes classifier and score it on the validation split
modelstart = time.time()  # the timer covers fitting, prediction and scoring

# Train the model
nb_model = MultinomialNB(
    alpha=1.0,
    fit_prior=True,
    class_prior=None,
).fit(X_train, y_train)

# Predict the validation labels
y_pred = nb_model.predict(X_val)

# Weighted F1 on the validation set, rounded to two decimals
nb_model_f1 = round(f1_score(y_val, y_pred, average='weighted'), 2)

# Report accuracy and elapsed time
print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds" % (time.time() - modelstart))

# Classification report as a table, one row per report entry
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).T
results

Accuracy 0.9990909090909091
Model Runtime: 0.27 seconds


Unnamed: 0,precision,recall,f1-score,support
afr,0.996678,1.0,0.998336,300.0
eng,0.993377,1.0,0.996678,300.0
nbl,1.0,1.0,1.0,300.0
nso,1.0,1.0,1.0,300.0
sot,1.0,1.0,1.0,300.0
ssw,1.0,1.0,1.0,300.0
tsn,1.0,0.996667,0.998331,300.0
tso,1.0,1.0,1.0,300.0
ven,1.0,1.0,1.0,300.0
xho,1.0,0.996667,0.998331,300.0


## Submission

In [70]:
# Vectorize the raw test text with the vectorizer fitted on the train data
test_nb = test['text']
test_vect = vectorizer.transform(test_nb)
# Predict the language label of each test text with the Naive Bayes model
y_pred = nb_model.predict(test_vect)
# Store the predicted labels in a new column of the test data
test['lang_id'] = y_pred
# Keep only the columns that go into the submission file
submission = test[['index', 'lang_id']]
# Save the csv file to be submitted
submission.to_csv('nb_test_submission.csv', index=False)
# Preview the submitted rows; as the last expression of the cell it is actually displayed
submission.head()