In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing

##Data Cleaning
import nltk #
import string #to remove all punctuation marks in the corpus/ document
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re

from sklearn.model_selection import train_test_split

# Porter stemmer instance. NOTE(review): `ps` is not referenced by any of the visible cells.
ps = nltk.PorterStemmer()

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score
# Set plot style: apply seaborn's default theme to matplotlib figures.
sns.set()

# Loading Data 

In [None]:
# Read the three competition CSVs: the training set, the test set and the sample submission.
df_train, df_test, df_sample = map(
    pd.read_csv,
    (
        "../input/south-african-language-identification-2021/train_set.csv",
        '../input/south-african-language-identification-2021/test_set.csv',
        '../input/south-african-language-identification-2021/sample_submission.csv',
    ),
)

# View Data

In [None]:
# Show text columns in full instead of truncating them.
# `None` is the supported "no limit" value; the legacy `-1` was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option("display.max_colwidth", None)

In [None]:
# (rows, columns) of the training frame as loaded, before de-duplication.
df_train.shape

# 1.Drop duplicates 

In [None]:
# Keep only the first row for each distinct `text` value, then re-check the shape.
df_train = df_train.drop_duplicates(subset='text')
df_train.shape

In [None]:
# Preview the first four rows of the de-duplicated training frame.
df_train.head(4)

# 2.Null Values 

In [None]:
# Number of missing values in each column of the training frame.
df_train.isnull().sum()

# 3.Clean Data

In [None]:
def clean(df):
    """Normalise the ``text`` column of ``df`` in place and return ``df``.

    Steps, in order:
      1. drop whitespace-separated words of three characters or fewer,
      2. lowercase the remaining text,
      3. replace every character other than ASCII letters and ``-`` with a space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column. The column is overwritten, so the
        caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # 1. Remove short words (length <= 3).
    df['text'] = df['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

    # 2. Convert all uppercase to lower.
    df['text'] = df['text'].str.lower()

    # 3. Punctuation, digits and special characters (everything except ASCII
    #    letters and the dash) become spaces. regex=True is required: since
    #    pandas 2.0, Series.str.replace treats the pattern as a literal string
    #    by default, which would silently leave the text untouched.
    df['text'] = df['text'].str.replace("[^a-zA-Z-]", " ", regex=True)

    return df

In [None]:
# clean() overwrites df_train['text'] in place; preview a few rows instead of
# rendering the whole frame as cell output.
clean(df_train).head()

# 4.Tokenization 

In [None]:
# Split each cleaned sentence on whitespace into a list of tokens.
tokenized_text = df_train['text'].map(lambda sentence: sentence.split())

tokenized_text.head()

# 5.Stop words

In [None]:
# Remove English stop words. Note: the filter below is applied to every row, not only to rows whose language is English.

In [None]:
# English stop words from NLTK, stored as a set so the per-token membership
# test in remove_stopwords is a hash lookup instead of a list scan.
# Requires the NLTK "stopwords" corpus to be available (nltk.download('stopwords')).
stopword = set(nltk.corpus.stopwords.words('english'))

In [None]:
def remove_stopwords(text):
    """Filter a token sequence against the module-level ``stopword`` collection.

    ``text`` is an iterable of tokens; the tokens that are not in ``stopword``
    are returned as a new list, in their original order.
    """
    kept = []
    for token in text:
        if token in stopword:
            continue
        kept.append(token)
    return kept
# Apply the stop-word filter to every token list and preview the result.
stop_words = tokenized_text.apply(remove_stopwords)
stop_words.head()

# 6.Lemmatisation

In [None]:
def train_lemma(words, lemmatizer):
    """Lemmatize every token in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens to process.
    lemmatizer : object
        Anything exposing a ``lemmatize(word)`` method.

    Returns
    -------
    list
        ``lemmatizer.lemmatize(word)`` for each word, in input order.
    """
    return list(map(lemmatizer.lemmatize, words))

# Lemmatize the stop-word-filtered token lists with NLTK's WordNet lemmatizer.
# NOTE(review): df_lemma is not used by the later cells visible here; the models
# are fitted on df_train['text'] directly.
lemmatizer = WordNetLemmatizer()

df_lemma = stop_words.apply(lambda tokens: train_lemma(tokens, lemmatizer))

# 7.Train Test Split 

In [None]:
# Features are the cleaned text column; the target is the language label column.
X, y = df_train['text'], df_train['lang_id']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# 8.1 Applying Logistic Regression

In [None]:
# Applying Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts -> TF-IDF weights -> one-vs-rest logistic regression.
# The one-vs-rest scheme is expressed with OneVsRestClassifier because the
# LogisticRegression(multi_class='ovr') argument is deprecated since
# scikit-learn 1.5 and scheduled for removal.
lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', OneVsRestClassifier(LogisticRegression())),
              ])
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Accuracy and per-class report for the predictions currently held in y_pred.
# Arguments follow the documented (y_true, y_pred) order; accuracy is unaffected by the order.
print(f"Accuracy is : {accuracy_score(y_test, y_pred)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

# 8.2 Multinomial naive bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Count -> TF-IDF features fed to a multinomial naive Bayes classifier.
mn = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clr', MultinomialNB()),
    ]
)
mn.fit(X_train, y_train)

In [None]:
# Score the naive Bayes pipeline on the held-out split.
y_pred = mn.predict(X_test)

# Arguments follow the documented (y_true, y_pred) order; accuracy is unaffected by the order.
print(f"Accuracy is : {accuracy_score(y_test, y_pred)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

# 8.3 Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Count -> TF-IDF features fed to a decision tree.
# random_state pins the tree's internal randomness so the reported scores are
# reproducible across runs (same seed value as the train/test split).
tree = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clr', DecisionTreeClassifier(random_state=42)),
                ])
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

In [None]:
# Accuracy and per-class report for the predictions currently held in y_pred.
# Arguments follow the documented (y_true, y_pred) order; accuracy is unaffected by the order.
print(f"Accuracy is : {accuracy_score(y_test, y_pred)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

# 8.4 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Count -> TF-IDF features fed to a random forest.
# random_state pins the forest's bootstrap / feature sampling so the reported
# scores are reproducible across runs (same seed value as the train/test split).
random_forest = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clr', RandomForestClassifier(random_state=42)),
                         ])
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

In [None]:
# Accuracy and per-class report for the predictions currently held in y_pred.
# Arguments follow the documented (y_true, y_pred) order; accuracy is unaffected by the order.
print(f"Accuracy is : {accuracy_score(y_test, y_pred)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

# 8.5 Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Count -> TF-IDF features fed to a support vector classifier.
# NOTE(review): gamma='auto' means 1 / n_features, which is very small for a
# large vocabulary; worth comparing against the default gamma='scale'.
svc = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clr', SVC(gamma='auto')),
    ]
)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)



In [None]:
# Accuracy and per-class report for the predictions currently held in y_pred.
# Arguments follow the documented (y_true, y_pred) order; accuracy is unaffected by the order.
print(f"Accuracy is : {accuracy_score(y_test, y_pred)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

# 8.6 KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Count -> TF-IDF features fed to a 3-nearest-neighbours classifier.
knn = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clr', KNeighborsClassifier(n_neighbors=3)),
    ]
)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# Accuracy and per-class report for the predictions currently held in y_pred.
# Arguments follow the documented (y_true, y_pred) order; accuracy is unaffected by the order.
print(f"Accuracy is : {accuracy_score(y_test, y_pred)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

# Unseen Data 

In [None]:
# Number of missing values in each column of the test frame.
df_test.isnull().sum()

In [None]:
# Clean the test text with the same clean() step used on the training data.
# clean() overwrites df_test['text'] in place; preview a few rows instead of
# rendering the whole frame as cell output.
clean(df_test).head()

In [None]:
# Preview the sample submission rather than rendering the whole frame.
df_sample.head()

In [None]:
# Cleaned text of the test set, used as model input for the submission.
X_unseen = df_test.loc[:, 'text']

In [None]:
# Choose the fitted pipeline used for the submission; the alternatives are kept commented out.
#y_predict = lr.predict(X_unseen) #logistic regression
y_predict = mn.predict(X_unseen) #multinomial naive bayes
#y_predict = tree.predict(X_unseen) #decision tree
#y_predict = random_forest.predict(X_unseen) #random forest
#y_predict = svc.predict(X_unseen) #svc
#y_predict = knn.predict(X_unseen) #knn


In [None]:
# 1-based row ids for the submission, sized from the test set itself instead of
# a hard-coded row count, so the cell keeps working if the test file changes.
count_list = range(1, len(df_test) + 1)

In [None]:
# Assemble the submission table: one row id and one predicted language per test row.
submission = pd.DataFrame({'index': count_list, 'lang_id': y_predict})

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame's own row index out of the CSV.
submission.to_csv("submission.csv", index=False)

# Final Model

In [None]:
from sklearn import model_selection
import pickle

In [None]:
# Persist the fitted multinomial naive Bayes pipeline to disk.
pkl_file = "model.pkl"

with open(pkl_file, mode='wb') as model_out:
    pickle.dump(mn, model_out)

In [None]:
# Reload the pipeline from the pickle file.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open(pkl_file, mode='rb') as model_in:
    pickle_model = pickle.load(model_in)

In [None]:
# Single example sentence used to sanity-check the reloaded model.
new_input = 'i just love south africa'

In [None]:
# Predict the language of the single example sentence with the reloaded pipeline.
# NOTE(review): new_input is passed as-is, without the clean() step that was applied to the training text.
predictions = pickle_model.predict([new_input])

In [None]:
# Display the predicted label(s) for the example input.
predictions