In [34]:
# Libraries for data loading, data manipulation and modelling
import numpy as np
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import string


# Reading in data

In [35]:
# Load the training and test CSV files (in that order) into pandas DataFrames
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
df_test.head()  # preview the first rows of the test set

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


# Data preprocessing

In [36]:
# Normalise case in both frames.
# The test frame's 'text' column is overwritten with its lower-cased form ...
df_test['text'] = df_test['text'].str.lower()
# ... while the training frame keeps the raw 'text' column and gains a new
# lower-cased 'Text' column, which the later cells use as the model input.
df_train['Text'] = df_train['text'].str.lower()


In [37]:

# Remove fully duplicated rows (keeping the first occurrence), then discard any
# row that still contains a missing value. The row index is not reset.
df_train = df_train.drop_duplicates(keep='first').dropna()


In [38]:
# Translation table that deletes every ASCII punctuation character.
# Built once at definition time instead of on every call, since the function
# is applied row-by-row to whole DataFrame columns.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all characters in ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        The string to clean. Non-string values (for example the ``NaN`` /
        ``None`` that pandas uses for a missing cell) are treated as empty
        text.

    Returns
    -------
    str
        ``text`` without ASCII punctuation, or ``''`` when ``text`` is not a
        string.
    """
    if not isinstance(text, str):
        # Missing cells reach this function as float NaN or None, neither of
        # which has .translate(); return an empty document instead of raising
        # so a single blank row cannot abort the whole column transform.
        return ''
    return text.translate(_PUNCTUATION_TABLE)


# Strip punctuation element-wise from the model-input column of each frame:
# the lower-cased 'Text' column for training, the 'text' column for test.
df_test['text'] = df_test['text'].map(remove_punctuation)
df_train['Text'] = df_train['Text'].map(remove_punctuation)


In [39]:

# Model input: the cleaned sentences; target: the language label of each sentence
X = df_train['Text']
y = df_train['lang_id']

# Count-based encoder configured for n-grams of length 1 and 2
vectorizer = CountVectorizer(ngram_range=(1, 2))


# Model development

In [42]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the vectorizer on the training portion only, then encode the held-out
# portion with that same fitted vectorizer
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier on the training counts
classifier = MultinomialNB(alpha=0.2).fit(X_train_vectorized, y_train)

# Predict the held-out rows and print the per-class precision / recall / F1 table
y_pred = classifier.predict(X_test_vectorized)
print("Classification Report: \n", classification_report(y_test, y_pred))




Classification Report: 
               precision    recall  f1-score   support

         afr       1.00      1.00      1.00       553
         eng       1.00      1.00      1.00       603
         nbl       1.00      1.00      1.00       473
         nso       1.00      1.00      1.00       559
         sot       1.00      1.00      1.00       606
         ssw       1.00      1.00      1.00       481
         tsn       1.00      1.00      1.00       543
         tso       1.00      1.00      1.00       548
         ven       1.00      1.00      1.00       509
         xho       1.00      1.00      1.00       503
         zul       1.00      0.99      1.00       612

    accuracy                           1.00      5990
   macro avg       1.00      1.00      1.00      5990
weighted avg       1.00      1.00      1.00      5990



# Submission

In [41]:
# Encode the cleaned test sentences with the vectorizer fitted on the training split
test = vectorizer.transform(df_test['text'])

# Predict one language label per test row, in row order
y_pred = classifier.predict(test)

daf = pd.DataFrame(y_pred, columns=['lang_id'])

output = pd.DataFrame({'index': df_test['index']})

# Attach the predictions by position rather than by index label.
# `output` carries df_test's row index while `daf` always has a fresh 0..n-1
# index, so an index-based join would silently fill `lang_id` with NaN if
# df_test were ever filtered, sorted or re-indexed before this cell.
submission = output.assign(lang_id=daf['lang_id'].to_numpy())
submission.to_csv('submissionA.csv', index=False)