In [1]:
#Load packages
# data loading and manipulation 
import pandas as pd
from nltk import tokenize
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# data modeling and evaluation packages
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
#Load data
data = pd.read_csv('train_set.csv')
sample_submission = pd.read_csv('sample_submission.csv')
test_data = pd.read_csv('test_set.csv')

In [4]:
data.head(10)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [5]:
sample_submission.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [32]:
test_data

Unnamed: 0,index,text,cleaned_text,predicted_lang
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",mmasepala fa maemo a a kgethegileng a letlelel...,tsn
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...,nbl
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu,ven
3,4,Kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...,ssw
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta,afr
...,...,...,...,...
5677,5678,You mark your ballot in private.,you mark your ballot in private,eng
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...,ge o ka kgetha ka bowena go se šomiše mofani k...,nso
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ...",e ka kopo etsa kgetho ya hao ka hloko hobane h...,sot
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ...",tb ke bokudi ba pmb mme morero o tla lefella t...,sot


In [7]:
#Preprocessing
data.shape

(33000, 2)

In [8]:
data.isnull().sum()

lang_id    0
text       0
dtype: int64

In [13]:
#Remove punctuations and put all in lower case
def clean_text(text):
    # Remove punctuation using regular expression
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    
    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()
    
    return cleaned_text

In [14]:
data['cleaned_text'] = data['text'].apply(clean_text)

In [15]:
data.head(10)

Unnamed: 0,lang_id,text,cleaned_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...,fa le dirisiwa lebone le tshwanetse go bontsha...


In [17]:
# Split the data into train and test sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

In [18]:
#Convert the text data into numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['cleaned_text'])
X_test = vectorizer.transform(test_df['cleaned_text'])

**Trying many models to see their accuracy**

In [20]:
#Train and Evaluate Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize models
models = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Nearest Neighbors': KNeighborsClassifier(),
    'Linear SVM': SVC(kernel='linear'),
    'RBF SVM': SVC(kernel='rbf'),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier()
}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, train_df['lang_id'])
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(test_df['lang_id'], y_pred)
    classification_rep = classification_report(test_df['lang_id'], y_pred)
    
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_rep)
    print("=" * 50)

Model: Multinomial Naive Bayes
Accuracy: 0.9985
Classification Report:
               precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.99      1.00      1.00       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      1.00      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       1.00      1.00      1.00       609
         zul       1.00      0.99      0.99       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600

Model: Logistic Regression
Accuracy: 0.9948
Classification Report:
               precision    recall  f1-score   s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: AdaBoost
Accuracy: 0.5518
Classification Report:
               precision    recall  f1-score   support

         afr       1.00      0.98      0.99       583
         eng       0.99      0.98      0.99       615
         nbl       0.29      0.03      0.05       583
         nso       0.54      0.92      0.68       625
         sot       0.62      0.98      0.76       618
         ssw       0.25      0.97      0.39       584
         tsn       0.44      0.03      0.05       598
         tso       0.67      0.34      0.45       561
         ven       0.70      0.68      0.69       634
         xho       0.73      0.10      0.18       609
         zul       0.00      0.00      0.00       590

    accuracy                           0.55      6600
   macro avg       0.57      0.55      0.48      6600
weighted avg       0.57      0.55      0.48      6600



In [28]:
#Make predictions on test data using Multinomial Naive Bayes since it's the best performing model above.
# Clean and preprocess the text in test_data similar to what you did for the training data
test_data['cleaned_text'] = test_data['text'].apply(clean_text)

# Use the same vectorizer as before to transform the test data
X_test_new = vectorizer.transform(test_data['cleaned_text'])
# Make predictions
y_pred_new = models['Multinomial Naive Bayes'].predict(X_test_new)

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'index': test_data['index'],  # Keep the original index from test_df
    'lang_id': y_pred_new
})

In [29]:
# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [30]:
submission = pd.read_csv('submission.csv')

In [31]:
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
