In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
# dataset = pd.read_csv("/content/Language Detection.csv")
dataset = pd.read_csv("/content/Lang_detect.csv")
dataset.head()

Unnamed: 0,Text,language,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian,,,,,,,,,,,,,,,,,,
1,sebes joseph pereira thomas på eng the jesuit...,Swedish,,,,,,,,,,,,,,,,,,
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai,,,,,,,,,,,,,,,,,,
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil,,,,,,,,,,,,,,,,,,
4,de spons behoort tot het geslacht haliclona en...,Dutch,,,,,,,,,,,,,,,,,,


In [4]:
dataset = dataset.loc[dataset['language'].notna()]
dataset = dataset.loc[dataset['Text'].notna()]

In [5]:
len(dataset["language"].unique())

32

In [9]:
dataset = dataset.dropna(axis=1)

In [10]:
dataset.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [11]:
dataset = dataset.values

In [12]:
dataset.shape

(32337, 2)

Preprocess the data

In [13]:
# Change the text to have the same casing
dataset[:,0] = [entry.lower() for entry in dataset[:,0]]

In [14]:
# Tokenize the data
dataset[:,0] = [word_tokenize(entry) for entry in dataset[:,0]]

In [15]:
tags = defaultdict(lambda : wn.NOUN)
tags['J'] = wn.ADJ
tags['V'] = wn.VERB
tags['R'] = wn.ADV

In [16]:
for index, entry in enumerate(dataset[:, 0]):
  words = []
  word_lemmatized = WordNetLemmatizer()
  for word, tag in pos_tag(entry):
    final_word = word_lemmatized.lemmatize(word, tags[tag[0]])
    words.append(final_word)
  dataset[index, 0] = str(words)

In [18]:
x_train, x_test, y_train, y_test = train_test_split(dataset[:,0], dataset[:,1], test_size = 0.3, random_state=45)

In [19]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), analyzer='char')
vectorizer.fit(dataset[:,0])

TfidfVectorizer(analyzer='char', ngram_range=(1, 3))

In [20]:
train_x_tfidf = vectorizer.transform(x_train)
test_x_tfidf = vectorizer.transform(x_test)

In [21]:
sup = svm.SVC()
sup.fit(train_x_tfidf, y_train)

SVC()

In [22]:
pred = sup.predict(test_x_tfidf)
svm_acc = (accuracy_score(pred, y_test))*100
print("SVM accuracy: ", svm_acc, '%')

SVM accuracy:  98.60853432282003 %


Functionality

In [23]:
# Preprocess input from user
def preprocess(data):
  data = [entry.lower() for entry in data]

  for index, entry in enumerate(data):
    words = []
    word_lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
      final_word = word_lemmatized.lemmatize(word, tags[tag[0]])
      words.append(final_word)
    data[index] = str(words)

  return data

In [24]:
def most_frequent(List):
    dict = {}
    count, word = 0, ''
    for item in reversed(List):
        dict[item] = dict.get(item, 0) + 1
        if dict[item] >= count :
            count, word = dict[item], item
    return(word)

In [25]:
def predict(text):
  words = text.split(" ");
  x = preprocess(words)
  x = vectorizer.transform(x)
  y = sup.predict(x)
  final = most_frequent(y)
  
  return final

In [26]:
predict("This is a bottom line test sentence used for the first time.")

'English'

Translate

In [27]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l[K     |▍                               | 10 kB 30.1 MB/s eta 0:00:01[K     |▊                               | 20 kB 36.2 MB/s eta 0:00:01[K     |█                               | 30 kB 21.9 MB/s eta 0:00:01[K     |█▍                              | 40 kB 17.9 MB/s eta 0:00:01[K     |█▊                              | 51 kB 12.7 MB/s eta 0:00:01[K     |██                              | 61 kB 14.9 MB/s eta 0:00:01[K     |██▍                             | 71 kB 13.6 MB/s eta 0:00:01[K     |██▊                             | 81 kB 14.9 MB/s eta 0:00:01[K     |███                             | 92 kB 16.4 MB/s eta 0:00:01[K     |███▍                            | 102 kB 13.3 MB/s eta 0:00:01[K     |███▊                            | 112 kB 13.3 MB/s eta 0:00:01[K     |████                            | 122 kB 13.3 MB/s eta 0:00:01[K     |████▍                           | 133 kB 13.3 MB/s eta 0:00:

In [28]:
from langdetect import detect

In [29]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.8.3-py3-none-any.whl (29 kB)
Collecting beautifulsoup4<5.0.0,>=4.9.1
  Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 21.9 MB/s 
Installing collected packages: beautifulsoup4, deep-translator
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed beautifulsoup4-4.11.1 deep-translator-1.8.3


In [30]:
from deep_translator import GoogleTranslator

In [31]:
def translate(text):
  if (predict(text) != "English") or (detect(text) != 'en'):
    translated = GoogleTranslator(target='en').translate(text)
    new_text = translated
  else:
    new_text = text
    
  return new_text

Summarize

In [32]:
stopWords = set(stopwords.words('english'))
import re

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def build_matrix(sentences):
  tfidf = TfidfVectorizer()
  vect_matrix = tfidf.fit_transform(sentences)

  similarity = cosine_similarity(vect_matrix)

  return similarity

In [34]:
import networkx as nx
import math

def summarize(matrix, sentences):
  summary = []

  graph = nx.from_numpy_array(matrix)
  scores = nx.pagerank(graph)

  ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

  for i in range(int(math.sqrt(len(sentences)))):
    summary.append(ranked[i][1])

  return summary

In [35]:
def prep_paragraph(text):
  sentences = sent_tokenize(text)
  
  matrix = build_matrix(sentences)

  summary = summarize(matrix, sentences)

  return summary

In [36]:
# Test translator and summarizer
test = 'Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small- and large-scale projects.[30] Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.[31][32] Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[33] Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 3.0, released in 2008, was a major revision that is not completely backward-compatible with earlier versions. Python 2 was discontinued with version 2.7.18 in 2020.[34] Python consistently ranks as one of the most popular programming languages'
test = translate(test)
for i in prep_paragraph(test):
  print(i)

[31][32] Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.
[33] Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support.
Python is a high-level, general-purpose programming language.


In [37]:
# Test translator and summarizer
test2 = 'Los niños caminan por el bosque. “Podemos construir una fortaleza,” dice Jorge. Los niños no lo saben, pero no están solos en el bosque. Un oso y unos conejos están escondidos detrás de los árboles. Los niños encuentran un árbol grande. Sofía dice “¡Es el lugar perfecto para construir nuestra fortaleza!”'
test2 = translate(test2)
for i in prep_paragraph(test2):
  print(i)

Sofia says "It's the perfect place to build our fortress!"
The children walk through the woods.
