In [78]:
#importing libraries
import string
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [79]:
#loading english raw data
eng_df = pd.read_csv("/content/train english.csv", encoding="utf-8", header=None, names=["English"])
eng_df.head()

Unnamed: 0,Unnamed: 1,English
text,label,source
okay i\u2019m sorry but TAYLOR SWIFT LOOKS NOTHING LIKE JACKIE O SO STOP COMPARING THE TWO. c\u2019mon America aren\u2019t you sick of her yet? (sorry),negative,sem_eval_2017
@user the DC comics site has Batman 44 releases on the 9th but its out now?,neutral,sem_eval_2017
"""Frank Gaffrey\u002c Cliff May\u002c Steve Emerson: Brilliant. \""""""""Looming Threats: Iran\u002c Hezbollah Hamas\"""""""" is the best #cufidc session I\u2019ve had thus far.""",positive,sem_eval_2017
The tragedy of only thinking up hilarious tweets for the Summer Olympics now is that in four years there may be no place for them.,negative,sem_eval_2017


In [80]:
#loading german raw data
ger_df=pd.read_csv("/content/train german.csv",encoding="utf-8",header=None, names=["German"])
ger_df.head()

Unnamed: 0,Unnamed: 1,German
text,label,source
Normal bin ich ja ned der mensch dwer sich beschwert wegen dem essen aber diese Pizza von Joeys...boah wie ekelhaft,negative,sb_10k
VfB Stuttgart gegen SV Hamburg: Das Warten auf den nächsten Schritt - Stuttgarter Zeitung: DIE WELTVfB Stuttga... http,neutral,sb_10k
@user @user @user @user @user @user @user @user fühlt sich schon so spät an 😚😚😚,positive,sb_10k
@user glaub ich! XD 30€ sind echt not bad T___T *grübel*,negative,sb_10k


In [81]:
#loading french raw data
fre_df=pd.read_csv("/content/train french.csv",encoding="utf-8",header=None, names=["French"])
fre_df.head()

Unnamed: 0,Unnamed: 1,French
text,label,source
"A Fukushima, la radioactivité met à mal la biodiversité et les oiseaux: http",negative,deft_2017
Biodiversité et développement durable : la petite histoire verte du campus http @user,neutral,deft_2017
Des aides du #gouvernement pour des #logements plus #écologiques. -- http,positive,deft_2017
"#fabius : ""nous sommes là 1e génération à prendre conscience du changement climatique et nous sommes la dernière à pouvoir agir""",negative,deft_2017


In [82]:
#loading spanish raw data
spa_df=pd.read_csv("/content/train spanish.csv",encoding="utf-8",header=None, names=["Spanish"])
spa_df.head()

Unnamed: 0,Unnamed: 1,Spanish
text,label,source
estoy hasta el ojete de que me digáis que tengo cara de mala leche,negative,intertass_2017
@user Por? Tenía pensado verla después de la segunda de Daredevil,neutral,intertass_2017
Esto de estar feliz mola,positive,intertass_2017
Ya no es tan divertido,negative,intertass_2017


In [83]:
# Loading Chinese raw data and skipping bad lines
chi_df = pd.read_csv("/content/train chinese.csv", encoding="utf-8", header=None, names=["Chinese"], on_bad_lines='skip')
chi_df.head()

Unnamed: 0,Unnamed: 1,Chinese
text,label,source
本人账号被盗，资金被江西（杨建）挪用，请亚马逊尽快查实，将本人的200元资金退回。本人已于2017年11月30日提交退货申请，为何到2018年了还是没解决？亚马逊是什么情况？请给本人一个合理解释。,negative,amazon_reviews_multi
这简直就是太差了！出版社怎么就能出版吗？我以为是百度摘录呢！这到底是哪个鱼目混珠的教授啊？！能给点干货吗？！总算应验了一句话，一本书哪怕只有一句花你感到有意义也算是本好书。哇为了找这本书哪怕一句不是废话的句子都费了我整整一天时间。。,negative,amazon_reviews_multi
购买页面显示1～2日发货，付款之后显示半个月后送达，实际收到商品距离下单日期已经一个多月。 无力吐槽，一星是给商品的。,negative,amazon_reviews_multi
音箱播放时断断续续的！质量完全不行，第一次在亚马逊买东西，晕！怎么是这样的呀？有客服和我联系吗？,negative,amazon_reviews_multi


In [84]:
# Data pre-processing
for char in string.punctuation:
    print(char, end="")

translate_table = {ord(char): None for char in string.punctuation}

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

In [85]:
# Initialize lists for storing pre-processed data
data_eng = []
lang_eng = []

data_ger = []
lang_ger = []

data_fre = []
lang_fre = []

data_spa = []
lang_spa = []

data_chi = []
lang_chi = []

In [86]:
# Process  data

for i, line in eng_df.iterrows():
  line=line['English']
  if len(line) !=0:
    line=line.lower()
    line=re.sub(r"\d+","",line)
    line=line.translate(translate_table)
    data_eng.append(line)
    lang_eng.append("English")

for i, line in ger_df.iterrows():
  line=line['German']
  if len(line) !=0:
    line=line.lower()
    line=re.sub(r"\d+","",line)
    line=line.translate(translate_table)
    data_ger.append(line)
    lang_ger.append("German")

for i, line in fre_df.iterrows():
  line=line['French']
  if len(line) !=0:
    line=line.lower()
    line=re.sub(r"\d+","",line)
    line=line.translate(translate_table)
    data_fre.append(line)
    lang_fre.append("French")

for i, line in spa_df.iterrows():
  line=line['Spanish']
  if len(line) !=0:
    line=line.lower()
    line=re.sub(r"\d+","",line)
    line=line.translate(translate_table)
    data_spa.append(line)
    lang_spa.append("Spanish")

for i, line in chi_df.iterrows():
  line=line['Chinese']
  if len(line) !=0:
    line=line.lower()
    line=re.sub(r"\d+","",line)
    line=re.sub(r"[a-zA-Z]+","",line)
    line=line.translate(translate_table)
    data_chi.append(line)
    lang_chi.append("Chinese(Traditional)")

In [87]:
# Combine data into a DataFrame
df = pd.DataFrame({
    "Text": data_eng + data_ger + data_fre + data_spa + data_chi,
    "Language": lang_eng + lang_ger + lang_fre + lang_spa + lang_chi
})

# Print the shape of the DataFrame
print(df.shape)

(127361, 2)


In [91]:
#Spliitng data into training and tests (80:20)
X,y=df.iloc[:,0],df.iloc[:,1]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(101888,) (25473,)
(101888,) (25473,)


In [92]:
# Define the TF-IDF vectorizer with character n-grams
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 3), analyzer="char")

# Create the pipeline with TF-IDF vectorizer and Logistic Regression classifier
pipe_lr_r13 = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('clf', linear_model.LogisticRegression())
])

In [93]:
# Fit the pipeline
pipe_lr_r13.fit(X_train, y_train)

In [98]:
#model prediction
y_predicted = pipe_lr_r13.predict(X_test)

In [99]:
#model evaluation
acc=metrics.accuracy_score(y_test,y_predicted)*100
print(acc,'%')

100.0 %


In [100]:
# Assuming y_test and y_predicted are defined
matrix = metrics.confusion_matrix(y_test, y_predicted)
print('Confusion matrix : \n', matrix)

Confusion matrix : 
 [[23989     0     0     0     0]
 [    0   359     0     0     0]
 [    0     0   368     0     0]
 [    0     0     0   379     0]
 [    0     0     0     0   378]]


In [107]:
#model saving
import pickle
#persist model so that it can be used by different consumers
lrFile=open('LRModel.pckl','wb')
pickle.dump(pipe_lr_r13,lrFile)
lrFile.close()

In [109]:
#model loading
global lrLangDetectModel
lrLangDetectFile=open('LRModel.pckl','rb')
lrLangDetectModel=pickle.load(lrLangDetectFile)
lrLangDetectFile.close()
