In [66]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC


# Metrics for Model Evaluation
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV
from scikitplot.metrics import plot_roc, plot_confusion_matrix
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report



In [67]:
data_train = pd.read_csv("train_set.csv")
samp_sub = pd.read_csv("sample_submission.csv")
data_test = pd.read_csv("test_set.csv")



In [68]:
df_test = data_test.copy()



In [69]:
# View the train data
data_train.head()



Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [70]:
# count the values of the language id:
data_train["lang_id"].value_counts()



eng    3000
nso    3000
tso    3000
ssw    3000
ven    3000
zul    3000
nbl    3000
xho    3000
tsn    3000
sot    3000
afr    3000
Name: lang_id, dtype: int64

In [71]:
# Look for the null values in the dataset
data_train.isnull().sum()



lang_id    0
text       0
dtype: int64

In [72]:
# View the test data
data_test.head()



Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [73]:
# View the sample data, i.e. how we will have to submit...
samp_sub.head()



Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [74]:
# View the unique values of the train dataset language id
data_train["lang_id"].unique()



array(['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso',
       'sot', 'afr'], dtype=object)

In [75]:
# View the unique values of the test dataset index
data_test["index"].unique()



array([   1,    2,    3, ..., 5680, 5681, 5682], dtype=int64)

In [76]:
samp_sub["index"].unique()



array([1, 2], dtype=int64)

In [77]:
# Create a copy of the train dataset
df = data_train.copy()




In [78]:

df['lang_id'].unique()

array(['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso',
       'sot', 'afr'], dtype=object)

In [79]:
import re
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = re.sub('@[^\s]+','USER', text)
    text = text.lower().replace("ё", "е")
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return text.strip()
df["text1"] = [preprocess_text(t) for t in df['text']]

In [80]:
df

Unnamed: 0,lang_id,text,text1
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulu natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefat a gore o ba file dilo ka moka t e le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [81]:
# Seperate features and tagret variables
y = df['lang_id']
X = df['text1']

In [82]:
# Turning text into something your model can read
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_vectorized = vectorizer.fit_transform(X)

In [83]:
# Split the train data to create validation dataset
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.2,shuffle=True, stratify=y, random_state=11)#changed test size to 0.1 from 0.3

In [84]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_val)
nb_model_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))

report = classification_report(y_val, y_pred, output_dict=True)


Accuracy 0.9986363636363637


In [85]:
import re
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = re.sub('@[^\s]+','USER', text)
    text = text.lower().replace("ё", "е")
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return text.strip()
df_test["text1"] = [preprocess_text(t) for t in data_test['text']]

In [86]:
test_nb = df_test["text1"]
test_vect = vectorizer.transform(test_nb)
# Predict the sentiment using the test data
y_pred = nb_model.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_nb_model_submission.csv', index=False)

In [87]:
# Linear SVC

In [88]:

linsvc = LinearSVC()
linsvc.fit(X_train, y_train)
y_pred = linsvc.predict(X_val)
linsvc_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))

report = classification_report(y_val, y_pred, output_dict=True)
report

Accuracy 0.9971212121212121


{'afr': {'precision': 0.9983361064891847,
  'recall': 1.0,
  'f1-score': 0.9991673605328892,
  'support': 600},
 'eng': {'precision': 0.9966777408637874,
  'recall': 1.0,
  'f1-score': 0.9983361064891847,
  'support': 600},
 'nbl': {'precision': 0.9933444259567388,
  'recall': 0.995,
  'f1-score': 0.9941715237302248,
  'support': 600},
 'nso': {'precision': 0.9966777408637874,
  'recall': 1.0,
  'f1-score': 0.9983361064891847,
  'support': 600},
 'sot': {'precision': 1.0,
  'recall': 0.9983333333333333,
  'f1-score': 0.9991659716430359,
  'support': 600},
 'ssw': {'precision': 0.9950166112956811,
  'recall': 0.9983333333333333,
  'f1-score': 0.9966722129783694,
  'support': 600},
 'tsn': {'precision': 1.0,
  'recall': 0.9966666666666667,
  'f1-score': 0.9983305509181971,
  'support': 600},
 'tso': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 600},
 'ven': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 600},
 'xho': {'precision': 0.9966555183946488,
  're

In [89]:
test_nb = df_test["text1"]
test_vect = vectorizer.transform(test_nb)
# Predict the sentiment using the test data
y_pred = linsvc.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_linsvc.csv', index=False)

In [90]:
# Hyperparameter models

In [101]:
params = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
         }

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=15, verbose=5)
multinomial_nb_grid.fit(X_train, y_train)

Fitting 15 folds for each of 10 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   59.9s finished


GridSearchCV(cv=15, estimator=MultinomialNB(), n_jobs=-1,
             param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1.0]},
             verbose=5)

In [102]:
test_nb = df_test["text1"]
test_vect = vectorizer.transform(test_nb)
# Predict the sentiment using the test data
y_pred = multinomial_nb_grid.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_multinomial.csv', index=False)

In [104]:
random_state = 42
kf = KFold(n_splits=10,
           random_state=random_state,
           shuffle=True)  # Define number of KFolds

NameError: name 'KFold' is not defined

In [103]:
# Specify the range of 'C' parameters for LinearSVC
params = {'C': [0.1, 0.5, 1, 5, 10]}

# Setting the GridSearch for the best parameters
clf = GridSearchCV(LinearSVC(max_iter=4000, multi_class='ovr'),
                   param_grid=params, cv=kf,
                   scoring=make_scorer(f1_score, average='macro'))

# Fit the gridsearch on the dataset
clf = clf.fit(X_train, y_train)


NameError: name 'kf' is not defined

In [105]:
# Seperate features and tagret variables
y = df['lang_id'].values
X = df['text1'].values


In [106]:
# Split the train data to create validation dataset
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=.2,shuffle=True, stratify=y, random_state=11)#changed test size to 0.1 from 0.3

In [107]:
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [108]:
tfidf_pipenb = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=3, max_features=None,
                              strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                              ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)),
    ('mnb', MultinomialNB())
])


In [109]:
tfidf_pipenb.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(min_df=3, ngram_range=(1, 3), smooth_idf=1,
                                 strip_accents='unicode', sublinear_tf=1,
                                 token_pattern='\\w{1,}', use_idf=1)),
                ('mnb', MultinomialNB())])

In [110]:
y_pred_nb = tfidf_pipenb.predict(X_val)

In [111]:
print(classification_report(y_val, y_pred_nb))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       600
         eng       0.99      1.00      1.00       600
         nbl       1.00      0.99      1.00       600
         nso       1.00      1.00      1.00       600
         sot       1.00      1.00      1.00       600
         ssw       1.00      1.00      1.00       600
         tsn       1.00      1.00      1.00       600
         tso       1.00      1.00      1.00       600
         ven       1.00      1.00      1.00       600
         xho       1.00      1.00      1.00       600
         zul       0.99      0.99      0.99       600

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [112]:
test_nb = df_test["text1"]
#test_vect = vectorizer.transform(test_nb)
# Predict the sentiment using the test data
y_pred = tfidf_pipenb.predict(test_nb)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_pipeline_nb_model_submission.csv', index=False)