In [1]:
# imports for Natural Language  Processing
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from string import punctuation


# feature extractioin
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split

# classification models
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Hyperparameter tunning methods
#import parfit.parfit as pf
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline


# Metrics for Model Evaluation
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

from scikitplot.metrics import plot_roc, plot_confusion_matrix
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer


In [2]:
# View the data
data_train = pd.read_csv("train_set.csv")
samp_sub = pd.read_csv("sample_submission.csv")
data_test = pd.read_csv("test_set.csv")

In [3]:
#Checking if there are missing values in the Train dataset
data_train.isna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   lang_id  33000 non-null  bool 
 1   text     33000 non-null  bool 
dtypes: bool(2)
memory usage: 64.6 KB


In [4]:
#Cheching if there are missing values in the Test dataset
data_test.isna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   index   5682 non-null   bool 
 1   text    5682 non-null   bool 
dtypes: bool(2)
memory usage: 11.2 KB


In [5]:
# Make a copy of the available data
df_test = data_test.copy()

In [319]:
# View the train data
data_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [320]:
# count the values of the language id:
data_train["lang_id"].value_counts()

nbl    3000
sot    3000
ssw    3000
afr    3000
nso    3000
zul    3000
xho    3000
tso    3000
ven    3000
eng    3000
tsn    3000
Name: lang_id, dtype: int64

In [321]:
# Look for the null values in the dataset
data_train.isnull().sum()

lang_id    0
text       0
dtype: int64

In [322]:
# View the test data
data_test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [323]:
# View the sample data, i.e. how we will have to submit...
samp_sub.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [324]:
# View the unique values of the train dataset language id
data_train["lang_id"].unique()

array(['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso',
       'sot', 'afr'], dtype=object)

In [327]:
# Create a copy of the train dataset
df = data_train.copy()

In [328]:
# View the copy 
df

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


# Model Evaluation

In [331]:
# Seperate features and tagret variables
y = df['lang_id']
X = df['text']

In [332]:
# Turning text into something your model can read
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_vectorized = vectorizer.fit_transform(X)

In [333]:
# Split the train data to create validation dataset
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.3,shuffle=True, stratify=y, random_state=11)#changed test size to 0.1 from 0.3

# Logistic Regression

In [334]:
logreg = LogisticRegression(C=1000, multi_class='ovr', solver='saga', random_state=42, max_iter=10)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)
logreg_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))

report = classification_report(y_val, y_pred)
print(report)

Accuracy 0.9956565656565657
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       900
         eng       1.00      1.00      1.00       900
         nbl       0.99      0.99      0.99       900
         nso       1.00      1.00      1.00       900
         sot       1.00      1.00      1.00       900
         ssw       0.99      1.00      0.99       900
         tsn       1.00      1.00      1.00       900
         tso       1.00      1.00      1.00       900
         ven       1.00      1.00      1.00       900
         xho       0.99      0.99      0.99       900
         zul       0.99      0.98      0.98       900

    accuracy                           1.00      9900
   macro avg       1.00      1.00      1.00      9900
weighted avg       1.00      1.00      1.00      9900



In [213]:
# Save the and Take it to Kaggle
test_logreg = data_test['text']
test_vect = vectorizer.transform(test_logreg)
# Predict the sentiment using the test data
y_pred = logreg.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_logreg_submission.csv', index=False)

# Random Forest

In [215]:
rf = RandomForestClassifier(max_features=4, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
rf_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))

report = classification_report(y_val, y_pred)
print(report)

Accuracy 0.9957575757575757
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       900
         eng       1.00      1.00      1.00       900
         nbl       0.99      0.98      0.99       900
         nso       1.00      1.00      1.00       900
         sot       1.00      1.00      1.00       900
         ssw       0.99      1.00      1.00       900
         tsn       1.00      1.00      1.00       900
         tso       1.00      1.00      1.00       900
         ven       1.00      1.00      1.00       900
         xho       1.00      0.99      0.99       900
         zul       0.98      0.99      0.98       900

    accuracy                           1.00      9900
   macro avg       1.00      1.00      1.00      9900
weighted avg       1.00      1.00      1.00      9900



In [216]:
# Save the csv model for the random Forest model and take it to Kaggle
test_rf = data_test['text']
test_vect = vectorizer.transform(test_logreg)
# Predict the sentiment using the test data
yrf_pred = rf.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = yrf_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_RandomForest_submission.csv', index=False)

# Hyper parameter tuning

# Logistic Regression

In [None]:
# Improve the LR model using the GridVSearch
param_grid = {'C': [1000], #[100,1000]
              'max_iter': [100], #[10,100]
              'multi_class': ['multinomial'], #['ovr', 'multinomial']
              'random_state': [42],
              'solver': ['lbfgs']} #['saga','lbfgs']
grid_LR = GridSearchCV(LogisticRegression(), param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_LR.fit(X_train, y_train)
y_pred = grid_LR.predict(X_val)
print("Best parameters:")
lr_params = grid_LR.best_params_
print(grid_LR.best_params_)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred))

In [None]:
# Save the Grid V
test_lrh = data_test['text']
test_vect = vectorizer.transform(test_lrh)
# Predict the sentiment using the test data
ylrh_pred = grid_LR.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = ylrh_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_lrhp_submission.csv', index=False)

# LinearSVC

In [None]:
LSVC_param_grid = {'LSVC_clf__C': [1, 1.01, 1.02, 1.03],
                   'LSVC_tfidf__max_df': (0.9, 0.999),
                   'LSVC_tfidf__min_df': (0,0.00001, 0.001),
                   'LSVC_tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5)]}


# Using the Linear SVC model above, we perform the gridsearch
LSVC_searchCV = GridSearchCV(LinearSVC(), cv=5, param_grid=LSVC_param_grid, verbose=3, scoring='f1_weighted', n_jobs=-1, refit=True)
LSVC_searchCV.fit(X, y)

In [None]:
test_LSVC= data_test['text']
test_vect = vectorizer.transform(test_LSVC)
# Predict the sentiment using the test data
yLSVC_pred = grid_LSVC.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = yLSVC_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_LSVC_submission.csv', index=False)

# LinearSVC model

In [None]:

svm = SVC(kernel='linear')
# Fit the model
svm.fit(X_train, y_train)
y_pred = svm.predict(X_val)
svm_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))

report = classification_report(y_val, y_pred)
report

In [None]:
# Save the model
test_svm = data_test['text']
test_vect = vectorizer.transform(test_svm)
# Predict the sentiment using the test data
ysvm_pred = svm.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = ysvm_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_svm_submission.csv', index=False)

# SGD Classifier

In [None]:
param_grid = {'alpha': [0.0001], 'max_iter': [1000],
              'n_iter_no_change': [5], 'tol': [0.01]}
grid_sgdc = GridSearchCV(SGDClassifier(), param_grid,
                         scoring='f1_weighted', cv=5, n_jobs=-1)
grid_sgdc.fit(X_train, y_train)
y_pred = grid_sgdc.predict(X_val)
sgdc_params = grid_sgdc.best_params_
print(grid_sgdc.best_params_)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred))

In [None]:
test_sgdc= data_test['text']
test_vect = vectorizer.transform(test_sgdc)
# Predict the sentiment using the test data
ysgdc_pred = grid_sgdc.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] = ysgdc_pred
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_grid_sgdc_submission.csv', index=False)

In [236]:
# Remove punctuation
def preprocess(tweet):
    tweet = tweet.lower()
    random_characters = ['â','¢','‚','¬','Â','¦','’',"It's",'Ã','..','Å']
    #tokenizer = word_tokenize(preserve_case=True, reduce_len=True)
    tweet = word_tokenize(tweet)
    stopwords_list = set(random_characters+list(punctuation))
    tweet = [word for word in tweet if word not in stopwords_list]
    tweet = re.sub(r'#([^\s]+)', r'\1', " ".join(tweet))
    tweet = re.sub(r'@([^\s]+)', r'\1', "".join(tweet))
    return tweet

In [285]:
train = df.copy()

In [286]:
# Splitting the labels and features
train['processed'] = train['text'].apply(preprocess)
X = train['processed']
y = train['lang_id']

In [288]:
# printing out cleaned text
index = 40
for text in X[0:5]:
    print(str(index)+": " + text)
    print('\n')
    index += 1

40: umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika


41: i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo


42: the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months


43: o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj


44: khomishini ya n

In [289]:
# printing out cleaned tweet
index = 1
for tweet in X[0:5]:
    print(str(index)+": " + tweet)
    print('\n')
    index += 1

1: umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika


2: i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo


3: the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months


4: o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj


5: khomishini ya ndinga

In [290]:
test = df_test.copy()
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [291]:
# preprocess testing data by applying our function
test['processed'] = test['text'].apply(preprocess)

In [244]:
# Splitting the labels and fetures into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05,random_state=42,stratify=y)

In [245]:
from sklearn.pipeline import Pipeline

# Logistic regression

In [247]:
#creating a pipeline with a tfidf vectorizer and a logistic regression model
LR_model = Pipeline([('tfidf',TfidfVectorizer()),('classify',(LogisticRegression(C=1.0,solver='lbfgs',random_state=42,max_iter=200)))])

#fitting the model
LR_model.fit(X_train, y_train)

#Apply model on test data
y_pred_lr = LR_model.predict(X_test)

In [248]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       150
         eng       0.99      1.00      1.00       150
         nbl       0.99      0.99      0.99       150
         nso       1.00      0.99      1.00       150
         sot       1.00      1.00      1.00       150
         ssw       0.99      0.99      0.99       150
         tsn       0.99      1.00      1.00       150
         tso       1.00      1.00      1.00       150
         ven       1.00      1.00      1.00       150
         xho       1.00      0.99      1.00       150
         zul       0.99      0.98      0.98       150

    accuracy                           1.00      1650
   macro avg       1.00      1.00      1.00      1650
weighted avg       1.00      1.00      1.00      1650



In [253]:
# Saving the test dataset
test_LR= test['processed']
test_vect = vectorizer.transform(test_LR)
# Predict the sentiment using the test data
y_pred_lr = LR_model.predict(test_LR)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] =y_pred_lr
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_PipeLogisticR_submission.csv', index=False)

# Linear SVC

In [258]:
#creating a pipeline with the tfid vectorizer and a linear svc model
svc = Pipeline([('tfidf',TfidfVectorizer()),('classify',LinearSVC(C=1))])

#fitting the model
svc.fit(X_train, y_train)

#apply model on test data
y_pred_svc = svc.predict(X_test)

In [259]:
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       150
         eng       0.99      1.00      0.99       150
         nbl       0.99      0.99      0.99       150
         nso       1.00      0.99      1.00       150
         sot       0.99      1.00      1.00       150
         ssw       0.99      1.00      1.00       150
         tsn       0.99      0.99      0.99       150
         tso       1.00      1.00      1.00       150
         ven       1.00      1.00      1.00       150
         xho       1.00      0.99      0.99       150
         zul       0.99      0.99      0.99       150

    accuracy                           1.00      1650
   macro avg       1.00      1.00      1.00      1650
weighted avg       1.00      1.00      1.00      1650



In [260]:
test_svc= test['processed']
test_vect = vectorizer.transform(test_LR)
# Predict the sentiment using the test data
y_pred_svc = svc.predict(test_svc)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
data_test['lang_id'] =y_pred_svc
# Look into the data that will be submitted on Kaggle as csv
data_test[['index', 'lang_id']].head()
# save the csv file and submit it.
data_test[['index', 'lang_id']].to_csv('test_svc.csv', index=False)

# Decision Tree

In [150]:
#creating a pipeline with the DecisionTreeClassifier 
DT = Pipeline([('tfidf',TfidfVectorizer()),('classify',(DecisionTreeClassifier(max_depth=150,random_state=42, splitter='best')))])

#fitting the model
DT.fit(X_train, y_train)

#Apply model on test data
y_pred_DT = DT.predict(X_test)

In [151]:
print(classification_report(y_test, y_pred_DT))

              precision    recall  f1-score   support

         afr       0.99      0.99      0.99       150
         eng       0.98      1.00      0.99       150
         nbl       0.86      0.85      0.86       150
         nso       0.97      0.95      0.96       150
         sot       0.97      0.99      0.98       150
         ssw       0.94      0.85      0.90       150
         tsn       0.95      0.94      0.95       150
         tso       0.99      0.99      0.99       150
         ven       0.99      0.99      0.99       150
         xho       0.79      0.93      0.85       150
         zul       0.86      0.79      0.83       150

    accuracy                           0.93      1650
   macro avg       0.94      0.93      0.93      1650
weighted avg       0.94      0.93      0.93      1650



# Random Forest

In [153]:
#creating a pipeline with the RandomForest classifier  
RF_model = Pipeline([('tfidf', TfidfVectorizer()),('clf', (RandomForestClassifier(max_depth=200, random_state=42,n_estimators=10)))])

#fitting the model
RF_model.fit(X_train, y_train)

#Apply model on test data
y_pred_RF = RF_model.predict(X_test)

In [154]:
print(classification_report(y_test, y_pred_RF))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       150
         eng       0.97      1.00      0.99       150
         nbl       0.92      0.89      0.90       150
         nso       0.99      0.97      0.98       150
         sot       0.99      1.00      0.99       150
         ssw       0.98      0.97      0.97       150
         tsn       0.97      0.99      0.98       150
         tso       1.00      1.00      1.00       150
         ven       1.00      1.00      1.00       150
         xho       0.90      0.97      0.93       150
         zul       0.94      0.88      0.91       150

    accuracy                           0.97      1650
   macro avg       0.97      0.97      0.97      1650
weighted avg       0.97      0.97      0.97      1650



In [None]:
y = train['lang_id'].values
X = train['text'].values

In [None]:
# vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5,
# stop_words="english")
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             smooth_idf=True,
                             max_df=0.3,
                            # min_df=1,
                             strip_accents='ascii',
                             ngram_range=(1, 2))
X_vectorized = vectorizer.fit_transform(X)

In [None]:
print('Shape of the vectorised data: {}'.format(X_vectorized.shape))

In [None]:
testx = test['text']
test_vect = vectorizer.transform(testx.values)
test_vect.data

In [None]:
# initializing a variable to the desired SMOTE
smote = SMOTE(random_state=2)#sampling_strategy='minority')

# fit SMOTE to training dataset
X_smote, y_smote = smote.fit_resample(X_vectorized, y)

In [None]:
X_smote.shape

In [None]:
y_smote.shape

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_vectorized, y,
                                                  #X_smote,y_smote,
                                                  test_size=.1,
                                                  random_state=42
                                                  )

# Hyper tuned LinearSVC

In [None]:
# Specify the range of 'C' parameters for LinearSVC
params = {'C': [0.1, 0.5, 1, 5, 10]}

# Setting the GridSearch for the best parameters
clf = GridSearchCV(LinearSVC(max_iter=4000, multi_class='ovr'),
                   param_grid=params, cv=kf,
                   scoring=make_scorer(f1_score, average='macro'))

# Fit the gridsearch on the dataset
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print('Getting the Best Model Performance' + '\n')
print('Accuracy: {}'.format(accuracy_score(y_val, y_pred)))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))
print('\n' + classification_report(y_val, y_pred))

In [None]:
testx = test['text']
test_vect = vectorizer.transform(testx.values)
# Predict the sentiment using the test data
y_pred = clf.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
test[['index', 'lang_id']].head()
# save the csv file and submit it.
test[['index', 'lang_id']].to_csv('test_LinearSVC_submission.csv', index=False)

# Complement Naive Bayes

### The best performing model

In [None]:
# Complement Naive Bayes
cnb = ComplementNB()
cnb.fit(X_train, y_train)
y_pred = cnb.predict(X_val)

print('Getting the Best Model Performance' + '\n')
print('Accuracy: {}'.format(accuracy_score(y_val, y_pred)))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))
print('\n' + classification_report(y_val, y_pred))

In [None]:
# Carrying out cross-validation and checking F1 score for different classifiers
random_state = 42
kf = KFold(n_splits=10,
           random_state=random_state,
           shuffle=True)  # Define number of KFolds

In [None]:
params = {'alpha': [0.1, 0.5, 1, 10],
          'norm': [True, False]}

clf2 = GridSearchCV(ComplementNB(),
                    param_grid=params,
                    cv=kf,
                    scoring=make_scorer(f1_score,
                                        average='macro'))
# Fit the gridsearch on the dataset
clf2 = clf2.fit(X_train, y_train)

In [None]:
y_pred = clf2.predict(X_val)

print('Getting the Best Model Performance' + '\n')
print('Accuracy: {}'.format(accuracy_score(y_val, y_pred)))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))
print('\n' + classification_report(y_val, y_pred))

In [None]:
testx = test['text']
test_vect = vectorizer.transform(testx.values)
# Predict the sentiment using the test data
y_pred = clf2.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
test[['index', 'lang_id']].head()
# save the csv file and submit it.
test[['index', 'lang_id']].to_csv('test_ComplementNB_submission.csv', index=False)

# Complement Naive Bayes

In [None]:
cnb = ComplementNB(alpha=clf2.best_params_['alpha'],
                   norm=clf2.best_params_['norm'])
cnb.fit(X_train, y_train)
y_pred = cnb.predict(X_val)

cnb_tuned = ComplementNB()
cnb_tuned.fit(X_train, y_train)
y_pred_tuned = cnb_tuned.predict(X_val)

In [None]:
# Saving the model
testx = test['text']
test_vect = vectorizer.transform(testx.values)
# Predict the sentiment using the test data
y_pred = cnb.predict(test_vect)
# Assign a new column on the test data by using ...
# the predicted sentiment from the tweets from test data
test['lang_id'] = y_pred
# Look into the data that will be submitted on Kaggle as csv
test[['index', 'lang_id']].head()
# save the csv file and submit it.
test[['index', 'lang_id']].to_csv('test_ComplementNBtuned_submission.csv', index=False)