In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
EN_THRESHOLD = 1.0
KN_THRESHOLD = 1.0

WITHOUT_KN_MODIFIERS = True

CLASSIFY_COL_NAME = 'script'+str(EN_THRESHOLD)+'-'+str(KN_THRESHOLD)

In [3]:
import pandas as pd
url = '/content/drive/Shareddrives/Lingua/Datasets/dravidian/Kannada_Edited.csv'
df = pd.read_csv(url)

In [4]:
row_count = df.shape[0]

In [5]:
df[CLASSIFY_COL_NAME] = ['' for i in range(row_count)]


In [6]:
df_classified = df.copy()
df_classified['class'] = ['' for i in range(row_count)]

In [7]:
def check_only_numbers(string):
    return string.isdigit()

In [8]:
import re

def remove_symbols_and_numbers(string):
    pattern = r"[-()\"#/@;:<>{}`+=~|_‚ñÅ.!?,1234567890]"
    clean_string = re.sub(pattern, '', string)
    return clean_string

In [9]:
def is_kannada(char):
  # https://unicode.org/charts/PDF/U0C80.pdf
  return ord(char) >= 0x0C80 and ord(char) <= 0x0CFF

In [10]:
def get_chars(word, without_kn_modifiers = True):
  mods = [0x0CBE,0x0CBF,0x0CC0,0x0CC1,0x0CC2,0x0CC3,0x0CC4,0x0CC5,0x0CC6,0x0CC7,0x0CC8,0x0CC9,0x0CCA,0x0CCB,0x0CCC]
  if without_kn_modifiers:
    return [char for char in list(word) if ord(char) not in mods]
  else:
    return list(word)

In [11]:
classes = []

In [12]:
for index, row in df.iterrows():
  tokens = row["Sentence"].split()
  total_chars = 0
  latin_char_count = 0
  kn_char_count = 0
  for t_i,t in enumerate(tokens):
    if check_only_numbers(t):
      continue
    token_list = get_chars(remove_symbols_and_numbers(t),without_kn_modifiers = WITHOUT_KN_MODIFIERS)
    token_len = len(token_list)
    total_chars += token_len
    for ch in token_list:
      if is_kannada(ch):
        kn_char_count += 1
      else:
        latin_char_count += 1
  if total_chars == 0:
     classes.append('Symbol')
  else:
    en_percentage = latin_char_count/total_chars
    kn_percentage = kn_char_count/total_chars
    if en_percentage >= EN_THRESHOLD:
      classes.append('Latin')
    elif kn_percentage >= KN_THRESHOLD:
      classes.append('Kannada')
    elif en_percentage < EN_THRESHOLD and kn_percentage < KN_THRESHOLD:
      classes.append('Mixed')

In [13]:
df_classified['class'] = classes
df[CLASSIFY_COL_NAME] = classes

In [14]:
df_classified.loc[df_classified['class'] == 'Symbol']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class


In [15]:
df_classified.loc[df_classified['class'] == 'Latin']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class
0,0,# 1 in trending,Positive,Not-Kannada,,Latin
1,1,#1 ON TRENDING,Positive,Not offensive,,Latin
2,2,#1 trending in India,Positive,Not-Kannada,,Latin
3,3,#1.46 Bgm music super,Not-Kannada,Not offensive,,Latin
4,4,#ASN chitrakke olle dagli song atu adbuta reet...,Positive,Not offensive,,Latin
...,...,...,...,...,...,...
7268,7268,ü§£ü§£ü§£brooo I'm big fan,Not-Kannada,Not offensive,,Latin
7269,7269,ü§£ü§£ü§£ü§£ü§£ anna super,Positive,Not offensive,,Latin
7270,7270,ü•∞ spr song ü§òü§òü§òüß°ü•∞ü§´ü§´,Positive,Not offensive,,Latin
7271,7271,ü•∞ü•∞ü•∞ü•∞Super song ü•∞ü•∞ü•∞ü•∞,Positive,Not offensive,,Latin


In [16]:
df_classified.loc[df_classified['class'] == 'Kannada']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class
20,20,#‡≤ï‡≤®‡≥ç‡≤®‡≤°‡≤ø‡≤ó‡≤∞‡≥Å ‡≤≤‡≥à‡≤ï‡≥ç ‡≤Æ‡≤æ‡≤°‡≤ø ‡≤™‡≥ç‡≤≤‡≥Ä‡≤∏‡≥ç,Positive,Not offensive,,Kannada
21,21,#‡≤¶‡≤ø‡≤Ø‡≤æ ‡≤∞‡≥Ä ‡≤∞‡≤ø‡≤≤‡≥Ä‡≤∏‡≥ç ‡≤Æ‡≤æ‡≤°‡≤ø,Positive,Not offensive,,Kannada
37,37,.‡≤Æ‡≥Ü‡≤∞‡≥Ü‡≤Ø‡≤≤‡≤ø ‡≤ó‡≤ó‡≤®‡≤¶‡≤≤‡≥ç‡≤≤‡≤ø ‡≤®‡≤ø‡≤Æ‡≥ç‡≤Æ‡≤¶‡≥á ‡≤≤‡≤æ‡≤Ç‡≤õ‡≤®‡≤æ...,Positive,Not offensive,,Kannada
49,49,10 ‡≤Æ‡≤ø‡≤≤‡≤ø‡≤Ø‡≤®‡≥ç ‡≤µ‡≤ø‡≤µ‡≥ç‡≤∏‡≥ç ‡≤Ü‡≤ó‡≥Å‡≤§‡≥ç‡≤§‡≥Ü ‡≤Ö‡≤®‡≥ç‡≤®‡≥ã‡≤∞‡≥Å ‡≤≤‡≥à‡≤ï‡≥ç ‡≤Æ‡≤æ‡≤°‡≤ø,Neutral,Not offensive,,Kannada
90,90,2019 ‡≤∞‡≤≤‡≥ç‡≤≤‡≤ø ‡≤à ‡≤ï‡≤•‡≥Ü ‡≤Ø‡≤æ‡≤∞‡≥ç ‡≤Ø‡≤æ‡≤∞‡≥ç ‡≤ï‡≥á‡≤≥‡≤ø‡≤∞‡≥Ü‡≤™‡≤æ,Positive,Not offensive,,Kannada
...,...,...,...,...,...,...
7245,7245,‡≤π‡≥å‡≤¶‡≥ã ‡≤π‡≥Å‡≤≤‡≤ø‡≤Ø‡≤æ ‡≤ó‡≤ø‡≤ö‡≥ç‡≤ö‡≤ø ‡≤ó‡≤ø‡≤ö‡≥ç‡≤ö‡≤ø ‡≤ó‡≤ø‡≤≤‡≤ø ‡≤ó‡≤ø‡≤≤‡≤ø ‡≤π‡≤æ‡≤°‡≥Å,Negative,Offensive-Other,,Kannada
7246,7246,‡≤π‡≥å‡≤¶‡≥ã ‡≤π‡≥Å‡≤≤‡≤ø‡≤Ø‡≤æ.....‡≤π‡≤æ‡≤°‡≥Å ‡≤∏‡≥Ç..............‡≤™‡≤∞‡≥ç.........,Positive,Not offensive,,Kannada
7248,7248,‡≤π‡≥ç‡≤Ø‡≤æ‡≤Ç‡≤°‡≥ç ‡≤∏‡≤™‡≥ç ‡≤Ö‡≤®‡≤µ‡≤∞‡≤§,Positive,Not offensive,,Kannada
7249,7249,‡≤π‡≥ç‡≤Ø‡≤æ‡≤Ç‡≤°‡≥ç‡≤∏‡≥ç ‡≤Ö‡≤™‡≥ç ‡≤á‡≤¶‡≥Å ‡≤Ö‡≤®‡≤µ‡≤∞‡≤§....,Positive,Not offensive,,Kannada


In [17]:
df_classified.loc[df_classified['class'] == 'Mixed']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class
7,7,#DBOSS ‡≤¶‡≤∞‡≥ç‡≤∂‡≤®‡≥ç ‡≤Ö‡≤£‡≥ç‡≤£‡≤® ‡≤Ö‡≤≠‡≤ø‡≤Æ‡≤æ‡≤®‡≤ø‡≤ó‡≤≥ ‡≤ï‡≤°‡≥Ü ‡≤á‡≤Ç‡≤¶ All the...,Positive,Not offensive,,Mixed
8,8,#Dia ‡≤∞‡≥Ä ‡≤∞‡≤ø‡≤≤‡≥Ä‡≤∏‡≥ç ‡≤Æ‡≤æ‡≤°‡≤ø,Positive,Not offensive,,Mixed
15,15,#handsup ‡≤á‡≤¶‡≥Å ‡≤ö‡≤∞‡≤ø‡≤§‡≥ç‡≤∞‡≥Ü ‡≤∏‡≥É‡≤∑‡≤ø‡≤∏‡≥Ç ‡≤Ö‡≤µ‡≤§‡≤æ‡≤∞,Positive,Not offensive,,Mixed
18,18,#win 12 ‡≤®‡≤ø‡≤Æ‡≥ç‡≤Æ ‡≤ï‡≤®‡≥ç‡≤®‡≤° ‡≤™‡≥ç‡≤∞‡≥á‡≤Æ‡≤ï‡≥ç‡≤ï‡≥Ü ‡≤ß‡≤®‡≥ç‡≤Ø‡≤µ‡≤æ‡≤¶,Positive,Not offensive,,Mixed
19,19,#‡≤á‡≤¶‡≥Å ‡≤ö‡≤∞‡≤ø‡≤§‡≥ç‡≤∞‡≥Ü ‡≤∏‡≥ç‡≤∞‡≥É‡≤∑‡≥ç‡≤ü‡≤ø‡≤∏‡≥Å‡≤µ ‡≤Ö‡≤µ‡≤§‡≤æ‡≤∞ nBaground music ü§∫ü§ò,Positive,Not offensive,,Mixed
...,...,...,...,...,...,...
7234,7234,‡≤π‡≥å‡≤¶‡≥Å ‡≤∏‡≤∞‡≥ç ‡≤´‡≤ø‡≤≤‡≥ç‡≤Æ‡≥ç ‡≤∏‡≥Ç‡≤™‡≤∞‡≥ç ‡≤°‡≥Å‡≤™‡≤∞‡≥ç n‡≤ú‡≤® ‡≤Ø‡≤æ‡≤ï‡≥á ‡≤¨‡≤∞‡≥Å‡≤§‡≥ç‡≤§‡≤ø‡≤≤‡≥ç...,Positive,Offensive-Other,,Mixed
7239,7239,‡≤π‡≥å‡≤¶‡≥Å ‡≤π‡≥Å‡≤≤‡≤ø‡≤Ø‡≤æ ü§£ü§£,Positive,Offensive-Other,,Mixed
7247,7247,‡≤π‡≥å‡≤¶‡≥ç ‡≤π‡≥Å‡≤≤‡≤ø‡≤Ø ...nHands up .. ‡≤Ö‡≤µ‡≤®‡≥Å ‡≤¨‡≤∞‡≥ç‡≤§‡≤ø‡≤¶‡≤æ‡≤®‡≥á ‡≤¨‡≤Ç‡≤¶ ...,Positive,Not offensive,,Mixed
7250,7250,‡≤π‡≥ç‡≤Ø‡≤æ‡≤™‡≤ø ‡≤®‡≥ç‡≤Ø‡≥Ç year ‡≤á‡≤®‡≥ç ‡≤Ö‡≤°‡≥ç‡≤µ‡≤æ‡≤®‡≥ç‡≤∏‡≥ç ‡≤Ö‡≤Ç‡≤°‡≥ç ‡≤á‡≤Æ‡≥ç ‡≤µ‡≥à‡≤ü‡≤ø‡≤Ç‡≤ó...,Neutral,Not offensive,,Mixed


In [18]:
df_classified['class'].value_counts()

Latin      5307
Kannada    1009
Mixed       957
Name: class, dtype: int64

In [19]:
df

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0
0,0,# 1 in trending,Positive,Not-Kannada,Latin
1,1,#1 ON TRENDING,Positive,Not offensive,Latin
2,2,#1 trending in India,Positive,Not-Kannada,Latin
3,3,#1.46 Bgm music super,Not-Kannada,Not offensive,Latin
4,4,#ASN chitrakke olle dagli song atu adbuta reet...,Positive,Not offensive,Latin
...,...,...,...,...,...
7268,7268,ü§£ü§£ü§£brooo I'm big fan,Not-Kannada,Not offensive,Latin
7269,7269,ü§£ü§£ü§£ü§£ü§£ anna super,Positive,Not offensive,Latin
7270,7270,ü•∞ spr song ü§òü§òü§òüß°ü•∞ü§´ü§´,Positive,Not offensive,Latin
7271,7271,ü•∞ü•∞ü•∞ü•∞Super song ü•∞ü•∞ü•∞ü•∞,Positive,Not offensive,Latin


In [20]:
# save_url = '/content/drive/Shareddrives/Lingua/Datasets/dravidian/kannada_all.csv'
# df.to_csv(save_url, index=False)