In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing

##Data Cleaning
import nltk #
import string #to remove all punctuation marks in the corpus/ document
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re

from sklearn.model_selection import train_test_split

ps = nltk.PorterStemmer()

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score
# set plot style
sns.set()

### __0.Loading Data__ 

In [5]:
df_train = pd.read_csv("train_set.csv")
df_test = pd.read_csv('test_set.csv')
df_sample = pd.read_csv('sample_submission.csv')

### __0.1.View Data__

In [6]:
pd.set_option("display.max_colwidth", -1)

  pd.set_option("display.max_colwidth", -1)


In [7]:
df_train.shape

(33000, 2)

### __1.Drop duplicates__ 

In [8]:
df_train.drop_duplicates(subset='text',inplace=True)
df_train.shape

(29948, 2)

In [9]:
df_train.head(4)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months
3,nso,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj


### __2.Null Values__

In [10]:
df_train.isnull().sum()

lang_id    0
text       0
dtype: int64

### __3.Clean Data__

In [11]:
def clean(df):
    
    #1.remove short words
    df['text'] = df['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
    
    #3.Convert all uppercase to lower
    df['text'] = df['text'].str.lower()
    
    #4.punctuations , numbers and special characters except for the dash -
    df['text'] = df['text'].str.replace("[^a-zA-Z-]", " ")
    
    return df

In [12]:
clean(df_train)

  df['text'] = df['text'].str.replace("[^a-zA-Z-]", " ")


Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,i-dha kuba nobulumko bokubeka umsebenzi naphi kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,province kwazulu-natal department transport invites tenders from established contractors experienced bridge construction construction kwajolwayo tugela river pedestrian bridge near tugela ferry duration project will months
3,nso,netefat a gore file dilo moka dumelelanego t ona mohlala maleri magolo omi wago fihlelela meagong metelele scaffolds bolokegilego lefelo maleba omela phela gabotse bjbj
4,ven,khomishini ndinganyiso mbeu maana mulayo khomishini ndinganyiso mbeu thetshelesa mbilaelo dzine tshimbilelana tshialula mbeu nahone ivhea foramu thungo mulayo ndinganyiso
...,...,...
32994,eng,manuel marin ill-fated debt sources very little from fiscu would like take this relief initiative marin countries opportunity wish bongi proposal write notwithstanding best luck
32995,tsn,popo dipolateforomo tlisa boeteledipele saenseng lefatse rona kgato kgolo pele popont hwa thekeniki biothekeniki motsamaisi thekenoloji biotekeniki lefapha saense thekenoloji durham
32997,eng,closing date submission completed tenders august late submissions will considered submissions must enclosed sealed envelope addressed chief executive officer above address
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwalo mthetho ujongene ujungene nesohlwayo sokudliwa imali okanye ukuvalelwa ixesha elingadluliyo kwiinyanga ezintandathu okanye kuzo zombini isohlwayo sokudliwa imali okanye ukuvalelwa


### __4.Tokenization__

In [13]:
tokenized_text = df_train['text'].apply(lambda x: x.split())

tokenized_text.head()

0    [umgaqo-siseko, wenza, amalungiselelo, kumaziko, axhasa, ulawulo, lwesininzi, kunye, nokuthath, inxaxheba, kwabafazi, ziquka, phakathi, kwezinye, zazo, ikomishoni, yokulingana, ngokwesini, ikomishoni, yamalungelo, oluntu, lomzantsi, afrika]                                   
1    [i-dha, kuba, nobulumko, bokubeka, umsebenzi, naphi, kwisebe, ngokusekwe, kwiimfuno, zokusebenza, zalo, emva, kokubonana, nomsebenzi, kunye, okanye, imanyano, yakhe, ukuba, ulandulo, lomntu, onjalo, alufanelekanga, i-dha, mayibize, uncedo, olufanelekileyo, elungelweni, layo]
2    [province, kwazulu-natal, department, transport, invites, tenders, from, established, contractors, experienced, bridge, construction, construction, kwajolwayo, tugela, river, pedestrian, bridge, near, tugela, ferry, duration, project, will, months]                           
3    [netefat, a, gore, file, dilo, moka, dumelelanego, t, ona, mohlala, maleri, magolo, omi, wago, fihlelela, meagong, metelele, scaffolds, bolokegilego, le

### __5.Stop words__

###### __REMOVE STOP WORDS WHEN THE LANGUAGE IS ENGLISH__

In [17]:
nltk.download()
stopword = nltk.corpus.stopwords.words('english')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [None]:
def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text
stop_words = tokenized_text.apply(lambda x: remove_stopwords(x))
stop_words.head()

### __6.Lemmatisation__

In [None]:
def train_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]  

lemmatizer = WordNetLemmatizer()

df_lemma = stop_words.apply(train_lemma, args=(lemmatizer, ))

### __7.Train Test Split__

In [None]:
X = df_train.text
y = df_train['lang_id']

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

### __8.1 Appplying Logistic Regression__

In [None]:
#Appplying Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LogisticRegression(multi_class='ovr')),
              ])
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [None]:
print(f"Accuracy is : {accuracy_score(y_pred,y_test)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

### __8.2 Multinomial naive bayes__

In [None]:
from sklearn.naive_bayes import MultinomialNB


mn = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clr', MultinomialNB()),
              ])
mn.fit(X_train,y_train)

In [None]:
y_pred = mn.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_pred,y_test)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

### __8.3 Decision tree__

In [None]:
from sklearn.tree import DecisionTreeClassifier


tree = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clr', DecisionTreeClassifier()),
              ])
tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)

In [None]:
print(f"Accuracy is : {accuracy_score(y_pred,y_test)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

### __8.3 Random Forest__

In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clr', RandomForestClassifier()),
              ])
random_forest.fit(X_train,y_train)
y_pred = random_forest.predict(X_test)

In [None]:
print(f"Accuracy is : {accuracy_score(y_pred,y_test)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

### __8.4 Support Vector Machine__

In [None]:
from sklearn.svm import SVC

svc = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clr', SVC(gamma='auto')),
              ])
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)



In [None]:
print(f"Accuracy is : {accuracy_score(y_pred,y_test)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

### __8.4 KNN__

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clr', KNeighborsClassifier(n_neighbors=3)),
              ])
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

In [None]:
print(f"Accuracy is : {accuracy_score(y_pred,y_test)}")
print("\n\nClassification Report:\n\n", classification_report(y_test, y_pred))

### __Unseen Data__ 

In [None]:
df_test.isnull().sum()

In [None]:
# Clean_test
clean(df_test)

In [None]:
df_sample

In [None]:
X_unseen = df_test['text']

In [None]:
#y_predict = lr.predict(X_unseen) #linear regression
y_predict = mn.predict(X_unseen) #multinomial naive bayes
#y_predict = tree.predict(X_unseen) #decision tree
#y_predict = random_forest.predict(X_unseen) #decision tree
#y_predict = svc.predict(X_unseen) #svc
#y_predict = knn.predict(X_unseen) #knn


In [None]:
#counting list
count_list  = range(1, 5683 )

In [None]:
submission = pd.DataFrame(
    {
     'index': count_list,
     'lang_id': y_predict
    })

In [None]:
submission.head()

In [None]:
submission.to_csv("submission.csv", index=False)

### __Final Model__

In [None]:
from sklearn import model_selection
import pickle

In [None]:
# storing best multi nomial naive bayes
pkl_file = "model.pkl"  

with open(pkl_file, 'wb') as file:  
    pickle.dump(mn, file)

In [None]:
# loading model
with open(pkl_file, 'rb') as file:  
    pickle_model = pickle.load(file)

In [None]:
new_input = 'i just love south africa'

In [None]:
# making predictions on unseen test dataset
predictions = pickle_model.predict([new_input])

In [None]:
predictions