In [1]:
# Mount Google Drive at /content/drive (requires the google.colab package);
# the dataset path used when loading the CSV lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Minimum share of counted characters (0.0-1.0) that must be non-Kannada /
# Kannada for a sentence to be labelled 'Latin' / 'Kannada' respectively.
# At 1.0 a single character of the other kind makes the sentence 'Mixed'.
EN_THRESHOLD = 1.0
KN_THRESHOLD = 1.0

# When True, the Kannada modifier code points listed in get_chars are dropped
# before characters are counted.
WITHOUT_KN_MODIFIERS = True

# Name of the label column written to df, e.g. 'script1.0-1.0'.
CLASSIFY_COL_NAME = f'script{EN_THRESHOLD}-{KN_THRESHOLD}'

In [3]:
import pandas as pd
# Location of the Kannada dataset CSV on the mounted shared drive.
url = '/content/drive/Shareddrives/Lingua/Datasets/dravidian/Kannada_Edited.csv'
# Load the full CSV; later cells read its "Sentence" column.
df = pd.read_csv(url)

In [4]:
# Number of rows in the loaded dataset.
row_count = len(df.index)

In [5]:
# Pre-create the label column with one empty string per row.
df[CLASSIFY_COL_NAME] = [''] * row_count


In [6]:
# Work on an independent copy that carries an extra, initially empty 'class' column.
df_classified = df.copy()
df_classified['class'] = [''] * row_count

In [7]:
def check_only_numbers(string):
    """Return True when ``string`` is non-empty and every character is a digit.

    This is a thin wrapper over ``str.isdigit``, so the empty string and tokens
    containing separators such as ``"1.46"`` yield False.
    """
    is_numeric_token = string.isdigit()
    return is_numeric_token

In [8]:
import re

def remove_symbols_and_numbers(string):
    """Return ``string`` with the listed punctuation, symbols and ASCII digits removed.

    Only the characters inside the character class are deleted; anything else
    (letters of any script, apostrophes, emoji, ...) is left in place.
    """
    symbols_and_digits = re.compile(r"[-()\"#/@;:<>{}`+=~|_▁.!?,1234567890]")
    return symbols_and_digits.sub('', string)

In [9]:
def is_kannada(char):
  """Return True when ``char`` lies in the Kannada Unicode block U+0C80..U+0CFF.

  Block reference: https://unicode.org/charts/PDF/U0C80.pdf
  """
  code_point = ord(char)
  return 0x0C80 <= code_point <= 0x0CFF

In [10]:
def get_chars(word, without_kn_modifiers = True):
  """Split ``word`` into a list of its characters.

  When ``without_kn_modifiers`` is true, characters whose code point falls in
  U+0CBE..U+0CCC (inclusive) are left out of the result; otherwise every
  character is returned.
  """
  if not without_kn_modifiers:
    return list(word)

  # Contiguous run U+0CBE..U+0CCC; the upper bound of range() is exclusive.
  modifier_code_points = range(0x0CBE, 0x0CCD)
  kept = []
  for ch in word:
    if ord(ch) in modifier_code_points:
      continue
    kept.append(ch)
  return kept

In [11]:
# Collects one script label per sentence, in row order.
classes = list()

In [12]:
def _classify_script(sentence):
  """Label one sentence as 'Symbol', 'Latin', 'Kannada' or 'Mixed'.

  Tokens made only of digits are skipped. The remaining tokens are stripped
  with remove_symbols_and_numbers and split with get_chars; each surviving
  character is counted as Kannada (per is_kannada) or as non-Kannada.

  Returns:
    'Symbol'  -- no characters were left to count (or ``sentence`` is not a str).
    'Latin'   -- the non-Kannada share is >= EN_THRESHOLD.
    'Kannada' -- otherwise, the Kannada share is >= KN_THRESHOLD.
    'Mixed'   -- neither threshold is reached.
  """
  # A missing cell is read by pandas as float NaN, which has no .split();
  # treat it like a sentence with nothing to count instead of crashing.
  if not isinstance(sentence, str):
    return 'Symbol'

  total_chars = 0
  kn_char_count = 0
  for token in sentence.split():
    if check_only_numbers(token):
      continue
    chars = get_chars(remove_symbols_and_numbers(token), without_kn_modifiers = WITHOUT_KN_MODIFIERS)
    total_chars += len(chars)
    kn_char_count += sum(1 for ch in chars if is_kannada(ch))

  if total_chars == 0:
    return 'Symbol'

  # NOTE(review): every non-Kannada character that survives the symbol filter
  # (emoji, apostrophes, other scripts) is counted as Latin here, so a Kannada
  # sentence followed by emoji is labelled 'Mixed'. Confirm this is intended.
  latin_char_count = total_chars - kn_char_count
  en_percentage = latin_char_count/total_chars
  kn_percentage = kn_char_count/total_chars
  if en_percentage >= EN_THRESHOLD:
    return 'Latin'
  if kn_percentage >= KN_THRESHOLD:
    return 'Kannada'
  return 'Mixed'

# Rebuild the list from scratch so re-running this cell cannot append a second
# set of labels and break the length-matched column assignment that follows.
classes = [_classify_script(sentence) for sentence in df["Sentence"]]

In [13]:
# Store the per-sentence labels in both frames (the two assignments are independent).
df[CLASSIFY_COL_NAME] = classes
df_classified['class'] = classes

In [14]:
# Rows where no countable characters were left after filtering.
df_classified[df_classified['class'] == 'Symbol']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class


In [15]:
# Rows labelled as Latin script.
df_classified[df_classified['class'] == 'Latin']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class
0,0,# 1 in trending,Positive,Not-Kannada,,Latin
1,1,#1 ON TRENDING,Positive,Not offensive,,Latin
2,2,#1 trending in India,Positive,Not-Kannada,,Latin
3,3,#1.46 Bgm music super,Not-Kannada,Not offensive,,Latin
4,4,#ASN chitrakke olle dagli song atu adbuta reet...,Positive,Not offensive,,Latin
...,...,...,...,...,...,...
7268,7268,🤣🤣🤣brooo I'm big fan,Not-Kannada,Not offensive,,Latin
7269,7269,🤣🤣🤣🤣🤣 anna super,Positive,Not offensive,,Latin
7270,7270,🥰 spr song 🤘🤘🤘🧡🥰🤫🤫,Positive,Not offensive,,Latin
7271,7271,🥰🥰🥰🥰Super song 🥰🥰🥰🥰,Positive,Not offensive,,Latin


In [16]:
# Rows labelled as Kannada script.
df_classified[df_classified['class'] == 'Kannada']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class
20,20,#ಕನ್ನಡಿಗರು ಲೈಕ್ ಮಾಡಿ ಪ್ಲೀಸ್,Positive,Not offensive,,Kannada
21,21,#ದಿಯಾ ರೀ ರಿಲೀಸ್ ಮಾಡಿ,Positive,Not offensive,,Kannada
37,37,.ಮೆರೆಯಲಿ ಗಗನದಲ್ಲಿ ನಿಮ್ಮದೇ ಲಾಂಛನಾ...,Positive,Not offensive,,Kannada
49,49,10 ಮಿಲಿಯನ್ ವಿವ್ಸ್ ಆಗುತ್ತೆ ಅನ್ನೋರು ಲೈಕ್ ಮಾಡಿ,Neutral,Not offensive,,Kannada
90,90,2019 ರಲ್ಲಿ ಈ ಕಥೆ ಯಾರ್ ಯಾರ್ ಕೇಳಿರೆಪಾ,Positive,Not offensive,,Kannada
...,...,...,...,...,...,...
7245,7245,ಹೌದೋ ಹುಲಿಯಾ ಗಿಚ್ಚಿ ಗಿಚ್ಚಿ ಗಿಲಿ ಗಿಲಿ ಹಾಡು,Negative,Offensive-Other,,Kannada
7246,7246,ಹೌದೋ ಹುಲಿಯಾ.....ಹಾಡು ಸೂ..............ಪರ್.........,Positive,Not offensive,,Kannada
7248,7248,ಹ್ಯಾಂಡ್ ಸಪ್ ಅನವರತ,Positive,Not offensive,,Kannada
7249,7249,ಹ್ಯಾಂಡ್ಸ್ ಅಪ್ ಇದು ಅನವರತ....,Positive,Not offensive,,Kannada


In [17]:
# Rows that reached neither the Latin nor the Kannada threshold.
df_classified[df_classified['class'] == 'Mixed']

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0,class
7,7,#DBOSS ದರ್ಶನ್ ಅಣ್ಣನ ಅಭಿಮಾನಿಗಳ ಕಡೆ ಇಂದ All the...,Positive,Not offensive,,Mixed
8,8,#Dia ರೀ ರಿಲೀಸ್ ಮಾಡಿ,Positive,Not offensive,,Mixed
15,15,#handsup ಇದು ಚರಿತ್ರೆ ಸೃಷಿಸೂ ಅವತಾರ,Positive,Not offensive,,Mixed
18,18,#win 12 ನಿಮ್ಮ ಕನ್ನಡ ಪ್ರೇಮಕ್ಕೆ ಧನ್ಯವಾದ,Positive,Not offensive,,Mixed
19,19,#ಇದು ಚರಿತ್ರೆ ಸ್ರೃಷ್ಟಿಸುವ ಅವತಾರ nBaground music 🤺🤘,Positive,Not offensive,,Mixed
...,...,...,...,...,...,...
7234,7234,ಹೌದು ಸರ್ ಫಿಲ್ಮ್ ಸೂಪರ್ ಡುಪರ್ nಜನ ಯಾಕೇ ಬರುತ್ತಿಲ್...,Positive,Offensive-Other,,Mixed
7239,7239,ಹೌದು ಹುಲಿಯಾ 🤣🤣,Positive,Offensive-Other,,Mixed
7247,7247,ಹೌದ್ ಹುಲಿಯ ...nHands up .. ಅವನು ಬರ್ತಿದಾನೇ ಬಂದ ...,Positive,Not offensive,,Mixed
7250,7250,ಹ್ಯಾಪಿ ನ್ಯೂ year ಇನ್ ಅಡ್ವಾನ್ಸ್ ಅಂಡ್ ಇಮ್ ವೈಟಿಂಗ...,Neutral,Not offensive,,Mixed


In [18]:
# Number of sentences per script label, most frequent first.
df_classified['class'].value_counts()

Latin      5307
Kannada    1009
Mixed       957
Name: class, dtype: int64

In [19]:
# Display the original frame, which now carries the script label column.
df

Unnamed: 0,SID,Sentence,Sentiment,Hate-Speech,script1.0-1.0
0,0,# 1 in trending,Positive,Not-Kannada,Latin
1,1,#1 ON TRENDING,Positive,Not offensive,Latin
2,2,#1 trending in India,Positive,Not-Kannada,Latin
3,3,#1.46 Bgm music super,Not-Kannada,Not offensive,Latin
4,4,#ASN chitrakke olle dagli song atu adbuta reet...,Positive,Not offensive,Latin
...,...,...,...,...,...
7268,7268,🤣🤣🤣brooo I'm big fan,Not-Kannada,Not offensive,Latin
7269,7269,🤣🤣🤣🤣🤣 anna super,Positive,Not offensive,Latin
7270,7270,🥰 spr song 🤘🤘🤘🧡🥰🤫🤫,Positive,Not offensive,Latin
7271,7271,🥰🥰🥰🥰Super song 🥰🥰🥰🥰,Positive,Not offensive,Latin


In [20]:
# Disabled on purpose: uncomment both lines to write the labelled frame back to the shared drive.
# save_url = '/content/drive/Shareddrives/Lingua/Datasets/dravidian/kannada_all.csv'
# df.to_csv(save_url, index=False)