In [1]:
# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import nltk
import string
import re
#from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
#import xgboost as xgb

# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
%matplotlib inline

# Style
sns.set(font_scale=1.5)


In [2]:
# Labelled training data; later cells read its 'text' and 'lang_id' columns.
train = pd.read_csv('train_set.csv')

In [3]:
# Unlabelled data to predict; later cells read its 'index' and 'text' columns.
test = pd.read_csv('test_set.csv')

In [4]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [5]:
def clean_text(text):
    """Normalise one raw text sample before it is vectorised.

    The cleaning steps are:

    1. replace anything that looks like an HTML/XML tag (``<...>``) with a
       space, including tags that are broken across several lines;
    2. lower-case the text;
    3. collapse every run of whitespace (spaces, tabs, newlines) into a single
       space and strip leading/trailing whitespace.

    Digits are left in place.

    Parameters
    ----------
    text : str
        Raw text of a single sample.

    Returns
    -------
    str
        The cleaned, lower-cased, single-spaced text.
    """
    # re.DOTALL lets '.' match newlines, so a tag that wraps onto the next
    # line is removed as a whole instead of being left behind in the text.
    without_tags = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    lowered = without_tags.lower()
    # str.split() with no argument splits on any whitespace run (newlines
    # included), so no separate newline replacement is needed.
    return ' '.join(lowered.split())

In [6]:
# Run the same cleaning over the text column of both data sets.
for frame in (train, test):
    frame['text'] = frame['text'].apply(clean_text)

In [7]:
# Features are the cleaned texts; the target is the language label.
X, y = train['text'], train['lang_id']

In [8]:
# Hold out 20% of the labelled data for validation. The split is seeded so that
# a fresh kernel run reproduces the same partition, and therefore the same
# validation scores, instead of drawing a new random split every time.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.20,
                                                  random_state=42)

In [9]:
# Candidate models for the baseline comparison. Estimators that take a seed
# are seeded with 42.
svm_clf = LinearSVC(random_state=42)

# One-vs-rest logistic regression; the iteration cap is raised to 4000.
logreg_clf = LogisticRegression(C=1e5,
                                max_iter=4000,
                                multi_class='ovr',
                                n_jobs=1,
                                random_state=42)

knn_clf = KNeighborsClassifier(n_neighbors=5)
nb_clf = MultinomialNB()

classifiers = [svm_clf, logreg_clf, knn_clf, nb_clf]

In [10]:
def models_building(classifiers, X_train, y_train, X_val, y_val):
    """Fit each classifier on TF-IDF features and summarise validation scores.

    Every estimator is placed behind its own ``TfidfVectorizer``
    (``min_df=1``, ``max_df=0.9``, ``ngram_range=(1, 2)``) in a pipeline,
    fitted on the training split and scored on the validation split.

    Parameters
    ----------
    classifiers : iterable
        Estimators to compare; each becomes the final step of a pipeline.
    X_train, y_train
        Training texts and their labels, passed to ``Pipeline.fit``.
    X_val, y_val
        Validation texts (passed to ``Pipeline.predict``) and the true labels
        the predictions are scored against.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, labelled with the estimator's class name, with
        the columns ``'F1-Macro'``, ``'F1-Accuracy'`` (micro-averaged F1),
        ``'F1-Weighted'`` and ``'Execution Time'`` (seconds spent on fit plus
        validation predict). If several estimators share a class name, the
        later ones are labelled ``Name_2``, ``Name_3``, ... so that no result
        is silently overwritten.
    """
    # Column label -> `average` argument of f1_score, in output column order.
    f1_averages = (('F1-Macro', 'macro'),
                   ('F1-Accuracy', 'micro'),
                   ('F1-Weighted', 'weighted'))

    models_summary = {}

    for clf in classifiers:
        # Vectoriser + classifier in one pipeline, so the TF-IDF vocabulary is
        # learned from the training split only.
        clf_text = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                                       max_df=0.9,
                                                       ngram_range=(1, 2))),
                             ('clf', clf)])

        # Time fit + predict with a monotonic, high-resolution clock.
        start_time = time.perf_counter()
        clf_text.fit(X_train, y_train)
        predictions = clf_text.predict(X_val)
        run_time = time.perf_counter() - start_time

        # Row label: the class name, disambiguated when it is already taken.
        base_name = clf.__class__.__name__
        row_label = base_name
        duplicate_no = 2
        while row_label in models_summary:
            row_label = '{}_{}'.format(base_name, duplicate_no)
            duplicate_no += 1

        scores = {label: metrics.f1_score(y_val, predictions, average=avg)
                  for label, avg in f1_averages}
        scores['Execution Time'] = run_time
        models_summary[row_label] = scores

    return pd.DataFrame.from_dict(models_summary, orient='index')

In [11]:
# Fit and score every candidate, then rank them by macro-averaged F1 (best first).
classifiers_df = models_building(classifiers,
                                 X_train, y_train,
                                 X_val, y_val)
ordered_df = classifiers_df.sort_values(by='F1-Macro', ascending=False)
ordered_df

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
MultinomialNB,0.998447,0.998485,0.998485,7.721965
LinearSVC,0.997227,0.997273,0.997275,12.100551
LogisticRegression,0.997227,0.997273,0.997275,228.900311
KNeighborsClassifier,0.966428,0.96697,0.966789,14.654906


In [14]:
# Smoothing strengths to try for the Naive Bayes classifier.
param_grid = {'alpha': [0.1, 1, 5, 10]}

# Step 1: TF-IDF over unigrams and bigrams.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.9)

# Step 2: 5-fold grid search over alpha, scored by weighted F1.
# NOTE(review): the search is the last pipeline step, so it runs on features
# from a vectoriser that was already fitted on all of X_train.
alpha_search = GridSearchCV(MultinomialNB(),
                            param_grid=param_grid,
                            scoring='f1_weighted',
                            cv=5,
                            n_jobs=-1)

tuned_mnb = Pipeline([('tfidf', tfidf_step),
                      ('mnb', alpha_search)])

# Fit on the training split, then predict the held-out validation split.
tuned_mnb.fit(X_train, y_train)
y_pred_mnb = tuned_mnb.predict(X_val)

# Per-language precision / recall / F1 on the validation split.
print(classification_report(y_val, y_pred_mnb))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       644
         eng       0.99      1.00      1.00       579
         nbl       1.00      0.99      0.99       597
         nso       1.00      1.00      1.00       630
         sot       1.00      1.00      1.00       575
         ssw       1.00      1.00      1.00       622
         tsn       1.00      1.00      1.00       613
         tso       1.00      1.00      1.00       607
         ven       1.00      1.00      1.00       571
         xho       1.00      0.99      1.00       596
         zul       0.99      0.99      0.99       566

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [16]:
# Pair each test row's 'index' with its predicted language and write the file.
submission_df = pd.DataFrame({'index': test['index'],
                              'lang_id': tuned_mnb.predict(test['text'])})
submission_df.to_csv('submission_Muhammed_NB.csv', index=False)