### Importing Libraries

In [None]:
import pickle
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the CSV once and keep two independent frames: `df` is modified in place
# by the EDA/encoding cells, while `data` stays untouched for the Pipeline section.
data = pd.read_csv("./Dataset/Language Detection.csv")
df = data.copy()

### EDA

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

In [None]:
# Normalize the column names to lower case so later cells can use 'text' / 'language'.
df.columns = df.columns.map(str.lower)

In [None]:
# Fraction of all rows that belong to each language.
ser = df['language'].value_counts().div(len(df))

In [None]:
# Pie chart of the per-language share of samples (in percent), with rotated labels.
fig, ax = plt.subplots()
ax.pie(ser * 100, labels=ser.index, autopct="%.2f", rotatelabels=True)
plt.show()

In [None]:
# Look at the first ten raw texts to decide what needs to be cleaned.
df['text'].head(10)

### Data Cleaning and Transformation

In [None]:
def textclean(text):
    """
    Normalize one piece of text for language detection.

    The text is lower-cased, then a fixed set of punctuation characters,
    newlines and digits is deleted, then any remaining ``[...]`` span is
    deleted, and finally surrounding whitespace is trimmed.

    text: the string to clean
    returns: the cleaned string
    """
    cleaned = text.lower()
    # Order matters: digits are removed first, so "[12]" becomes "[]" and is
    # then dropped by the bracket pattern.
    for pattern in (r'[!@#$(),\n"%^*?:;~`0-9]', r'\[.*?\]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

In [None]:
# Replace each language name with an integer class id; `le` keeps the mapping
# so ids can be turned back into names with le.inverse_transform.
le = LabelEncoder()
df = df.assign(language=le.fit_transform(df['language']))

In [None]:
# Clean the text, then hold out 20% of the rows for evaluation.
X = df['text'].apply(textclean)
y = df['language']

# Fixed seed: without random_state every run draws a different split, so the
# metrics reported further down could not be reproduced.
X_train, X_test, ytrain, ytest = train_test_split(
    X, y, test_size=.20, random_state=42
)

# Token counts; the vocabulary is learned from the training split only and
# merely applied to the test split.
count = CountVectorizer()
X_train = count.fit_transform(X_train)
X_test = count.transform(X_test)

# TF-IDF re-weighting of the counts, again fitted on the training split only.
tf = TfidfTransformer()
X_train = tf.fit_transform(X_train)
X_test = tf.transform(X_test)


### Primary model training

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features and report
# per-class metrics on the held-out split.
model = MultinomialNB().fit(X_train, ytrain)  # fit() returns the estimator itself
y_pred = model.predict(X_test)
print(classification_report(y_true=ytest, y_pred=y_pred))

The model above shows a strong combination of precision and recall, with an overall accuracy of about 96%.

In [None]:
# Predict the first held-out sample. Slicing the sparse matrix keeps a
# 1 x n_features row; the previous X_test.toarray()[0] converted the entire
# test matrix to a dense array just to read one row.
model.predict(X_test[:1])

In [None]:
# True label of the first held-out sample (positional slice; the index is shuffled by the split).
ytest.iloc[:1]

### Pipeline Creation

Split the data again, this time starting from `data`, the copy of the original dataframe made right after loading.

In [None]:
# Same column-name normalization as for `df`, applied to the untouched copy.
data.columns = data.columns.map(str.lower)

In [None]:
# Keep the text raw here: the pipeline's own TextClean step does the cleaning,
# so the evaluation exercises the pipeline on the same kind of input it will
# receive once exported.
X = data['text']
y = data['language']

# encoding languages with label encoder
le = LabelEncoder()
y = le.fit_transform(y)

# Fixed seed so the split (and therefore the reported metrics) is reproducible.
X_train, X_test, ytrain, ytest = train_test_split(
    X, y, test_size=.20, random_state=42
)

#### Data Cleaning Function

In [None]:
# class to transform text for Pipeline
class TextClean(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalizes text documents.

    Every document is lower-cased, has a fixed set of punctuation characters,
    newlines and digits deleted, has any remaining ``[...]`` span deleted, and
    is trimmed of surrounding whitespace.
    """

    # Compiled once for the class instead of on every document.
    _SYMBOLS = re.compile(r'[!@#$(),\n"%^*?:;~`0-9]')
    _BRACKETED = re.compile(r'\[.*?\]')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step works inside a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : str or iterable of str
            A collection of documents (list, pandas Series, ...) or one bare
            string. A bare string is treated as a single document; the
            earlier ``type(X) != "list"`` comparison was always true, so a
            string was split into individual characters by ``list(X)``.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        list of str
            One cleaned string per input document, in input order.
        """
        if isinstance(X, str):
            # Wrap so downstream vectorizers receive an iterable of documents.
            X = [X]
        return [self._clean(text) for text in X]

    @classmethod
    def _clean(cls, text):
        """Return the cleaned form of a single document."""
        text = cls._SYMBOLS.sub('', text.lower())
        # Digits are already gone, so "[12]" has become "[]" and is removed here.
        text = cls._BRACKETED.sub('', text)
        return text.strip()

In [None]:
# One estimator that chains: text cleaning -> token counts -> TF-IDF weighting -> Naive Bayes
model_pipe = Pipeline(steps=[
    ('textclean', TextClean()),
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

In [None]:
# Fit on the training split only. Fitting on the full dataframe would include
# the X_test rows in training, so the classification report below would be
# measured on data the model has already seen.
model_pipe.fit(X_train, ytrain)

In [None]:
# Predict class ids for the held-out texts. X_test holds text here (not
# vectors), so the pipeline's cleaning and vectorizing steps run before the classifier.
pred = model_pipe.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the pipeline's predictions against the held-out labels.
print(classification_report(ytest,pred))

In [None]:
# Persist the fitted pipeline together with the label encoder `le`, which is
# needed to turn predicted class ids back into language names.
# NOTE: pickle stores the TextClean class by reference, so any process that
# loads model_pipe.pkl must have TextClean defined under the same module path.
Path("./resources").mkdir(parents=True, exist_ok=True)  # open() does not create missing directories

with open("./resources/model_pipe.pkl", 'wb') as f:
    pickle.dump(model_pipe, f)

with open("./resources/label_enc.pkl", 'wb') as f:
    pickle.dump(le, f)

In [None]:
# Read the pickled pipeline back from disk; this rebinds `model`, replacing
# the standalone Naive Bayes classifier trained earlier.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("./resources/model_pipe.pkl", mode='rb') as model_file:
    model = pickle.load(model_file)