In [1]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import subprocess

# Stage the Tamil stop-word list and the saved artefacts from Drive into /content/data.
os.makedirs('/content/data/', exist_ok=True)

# check=True turns a failed copy into a CalledProcessError raised in this cell.
# With os.system the exit status was only a return value (and the first one was
# discarded), so a missing Drive file surfaced later as an unrelated error.
for cp_command in (
    ['cp', '/content/drive/MyDrive/fyp/TamilStopWords.txt', '/content/data/TamilStopWords.txt'],
    ['cp', '-rf', '/content/drive/MyDrive/fyp/saves/', '/content/data/'],
):
    subprocess.run(cp_command, check=True)

0

In [3]:
import pandas as pd

# The dataset file uses a backtick as its field separator instead of a comma.
df = pd.read_csv( '/content/data/saves/dataset.csv', sep='`' )

### Exploration and analysis

In [4]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,Fine Tag,Coarse Tag,Comment
0,Not_offensive,Offensive_Untargeted,VISWASAM Movie Pathutu Yarellaam inka Vantheen...
1,Not_offensive,Offensive_Untargeted,viswasam trailer ah pathutu enga vanthavanga y...
2,Not_offensive,Offensive_Targeted_Insult_Individual,Thooku duraiyin aattatai trailerla pathutu ya...
3,Not_offensive,Offensive_Untargeted,Yaarellam viswasam trailer pathathuku Apparam...
4,Not_offensive,Offensive_Targeted_Insult_Individual,“Adha vida koduma kaanama pona crush correct a...


In [5]:
# (rows, columns) of the loaded dataset.
df.shape

(9073, 3)

In [6]:
# Count how many rows carry each 'Fine Tag' value.
df['Fine Tag'].value_counts()

Fine Tag
Offensive_others         4512
Not_offensive            3745
Offensive_women           303
Offensive_caste           266
Offensive_race            119
Offensive_sexuality        59
Offensive_handicapped      37
Offensive_religion         32
Name: count, dtype: int64

### Pre-Processing and Helper functions

In [7]:
# create a list of stopwords (English from NLTK + Tamil from a text file)

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

# `stop` is rebuilt from the NLTK list on every run of this cell, so re-running
# the cell does not append the Tamil words a second time.
stop = stopwords.words('english')

tamil_stopwords = '/content/data/TamilStopWords.txt'
# sources :
#  https://gist.github.com/arulrajnet/e82a5a331f78a5cc9b6d372df13a919c

# The file holds Tamil text: decode it explicitly as UTF-8 instead of relying on
# the platform's default encoding, which is not UTF-8 everywhere.
with open(tamil_stopwords, 'r', encoding='utf-8') as file:
  content = file.read()

# The file is split on any whitespace; every resulting token is one stop word.
stop.extend( content.split() )

len(stop)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


306

In [8]:
!pip install demoji
!pip install clean-text

Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [9]:
import re
import string
import demoji

def custom_clean( text ):
  """Normalise one comment string before classification.

  The steps run in this fixed order:
    1. lower-case the text;
    2. delete every character listed in string.punctuation;
    3. replace emojis with the placeholder '[EMOJI]' (via demoji);
    4. replace each run of digits with the placeholder '[NUMBER]';
    5. drop whitespace-separated tokens found in the module-level `stop` list
       (this also collapses whitespace runs to single spaces);
    6. delete zero-width / directional Unicode formatting characters.

  Placeholders are inserted without surrounding spaces, so they stay attached to
  neighbouring text.

  text: the string to clean.
  Returns the cleaned string.
  """
  lowered = text.lower()

  # Step 2: string.punctuation is the ASCII punctuation set.
  without_punct = lowered.translate( str.maketrans('', '', string.punctuation) )

  # Step 3: emojis are substituted, not removed.
  with_emoji_tag = demoji.replace(without_punct, '[EMOJI]')

  # Step 4: one placeholder per run of digits.
  with_number_tag = re.sub( r'\d+', '[NUMBER]', with_emoji_tag )

  # Step 5: stop-word filtering happens before the invisible characters are
  # stripped, so a token that still carries one is compared as-is.
  kept_words = []
  for word in with_number_tag.split():
    if word not in stop:
      kept_words.append( word )
  filtered = " ".join( kept_words )

  # Step 6.
  invisible = (
      '\u200b\u200c\u200d\u200e\u200f'  # formatting characters
      '\u202a\u202b\u202c\u202d\u202e'  # directional characters
      '\u2060\u2063'                    # other formatting characters
  )
  return filtered.translate( str.maketrans('', '', invisible) )


In [10]:
  # 0:18 enna da aniyayam pandringa
# என்ன டா நீங்க பக பக பக பக னு சங்கு வேற ஊதிக்கிறீங்க🤣🤣🤣symbolic ah சொல்றீங்களோ🤔🤔🤔🤔🤔‍‍‍‍

example = "என்ன டா நீங்க பக பக பக பக னு சங்கு வேற ஊதிக்கிறீங்க🤣🤣🤣symbolic ah சொல்றீங்களோ🤔🤔🤔🤔🤔‍‍‍‍"

# \u200d is the zero-width joiner (ZWJ): an invisible Unicode formatting character used in Indic scripts and emoji sequences to control rendering.

custom_clean( example )

'டா நீங்க பக பக பக பக னு சங்கு வேற ஊதிக்கிறீங்க[EMOJI][EMOJI][EMOJI]symbolic ah சொல்றீங்களோ[EMOJI][EMOJI][EMOJI][EMOJI][EMOJI]'

In [11]:
def mapping(prediction):
    """Translate a numeric class label into a human-readable phrase.

    prediction: the class label; it is compared with ``==`` against 0..5.
    Returns the phrase for labels 0-5. Every other value (6 included) yields
    "offensive to women." because that phrase is the catch-all branch.
    """
    # Index in this tuple == class label.
    phrases = (
        "offensive to other castes.",
        "offensive to handicapped people.",
        "offensive to others.",
        "racially offensive.",
        "offensive to other religions.",
        "offensive to people of the LGBTQIA+ community.",
    )
    for label, phrase in enumerate(phrases):
        if prediction == label:
            return phrase
    return "offensive to women."

### Inference Pipeline

In [18]:
# Name read by load_dl_model / dl_inference (leave exactly one line uncommented).
DL_MODEL = "xlm-roberta"
# DL_MODEL = "bert-multilingual"
# DL_MODEL = "muril"
# DL_MODEL = "lstm"
# DL_MODEL = "bi-lstm"

# Name read by load_ml_model (leave exactly one line uncommented).
ML_MODEL = "svm/rbf_only_offensive"
# ML_MODEL = "svm/poly_only_offensive"
# ML_MODEL = "naive-bayes"

In [36]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

# Load model based on type
def load_dl_model():
  """Load the tokenizer (or text encoder) and model selected by DL_MODEL.

  Reads the module-level DL_MODEL name:
    - 'xlm-roberta', 'bert-multilingual', 'muril': loads
      <models>/<DL_MODEL>-finetuned with AutoTokenizer and
      AutoModelForSequenceClassification.
    - 'lstm', 'bi-lstm': unpickles an encoder file and a model file from
      <models>/pkl-files/<DL_MODEL>/.

  Returns:
    The tuple (dl_tokenizer, dl_model).

  Raises:
    ValueError: DL_MODEL is not one of the names above (previously this ended
      in an UnboundLocalError at the return statement).
  """
  models_root = "/content/drive/MyDrive/fyp/models"

  if DL_MODEL in ('xlm-roberta', 'bert-multilingual', 'muril'):
    model_checkpoint = f"{models_root}/{DL_MODEL}-finetuned"

    dl_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    dl_model = AutoModelForSequenceClassification.from_pretrained( model_checkpoint )
    return( dl_tokenizer, dl_model )

  # DL_MODEL -> (encoder file, model file) inside its pkl-files directory.
  pickle_files = {
      'lstm': ("lstm_one_hot.pkl", "lstm.pkl"),
      'bi-lstm': ("encoder.pkl", "bi-lstm.pkl"),
  }
  if DL_MODEL not in pickle_files:
    raise ValueError( f"Unknown DL_MODEL {DL_MODEL!r}" )

  model_path = f"{models_root}/pkl-files/{DL_MODEL}"
  encoder_file, model_file = pickle_files[DL_MODEL]

  # SECURITY: pickle.load can execute arbitrary code; only point this at files
  # you created yourself.
  # `with` closes each handle; the bare open() calls used before leaked them.
  with open( f"{model_path}/{encoder_file}", "rb" ) as handle:
    dl_tokenizer = pickle.load( handle )
  with open( f"{model_path}/{model_file}", "rb" ) as handle:
    dl_model = pickle.load( handle )

  return( dl_tokenizer, dl_model )

# Run inference based on type
def dl_inference( test_sentence, dl_tokenizer, dl_model ):
  """Classify one sentence with the model family selected by DL_MODEL.

  Args:
    test_sentence: the text to classify.
    dl_tokenizer: first item of the tuple returned by load_dl_model().
    dl_model: second item of the tuple returned by load_dl_model().

  Returns:
    Transformer models: the index of the largest logit.
    'lstm' / 'bi-lstm': 1 if the first prediction is above 0.5, otherwise 0.
    The raw scores and the chosen class are also printed.

  Raises:
    ValueError: DL_MODEL names no known model. Before, the function fell
      through and returned None, which a truthiness check reads as class 0.
  """
  if DL_MODEL in ('xlm-roberta', 'bert-multilingual', 'muril'):
    # truncation=True cuts inputs that exceed the tokenizer's maximum length
    # instead of handing the model a sequence longer than it supports.
    inputs = dl_tokenizer(test_sentence, return_tensors="pt", truncation=True)

    logits = dl_model(**inputs).logits
    # argmax() works on the flattened tensor, which equals the class index
    # only because a single sentence is passed in.
    prediction = logits.argmax().item()

    print( logits )
    print( "Classification : ", prediction )

    return prediction

  if DL_MODEL in ('lstm', 'bi-lstm'):
    # NOTE(review): 1e5 is a float and is passed through unchanged on purpose.
    # Presumably it is the vocabulary size the encoder was used with during
    # training; switching to an int could change the produced ids. Confirm.
    test_encoding = [ dl_tokenizer(test_sentence, 1e5) ]
    test_emb_doc = pad_sequences( test_encoding, padding='pre', maxlen = 100 )
    predictions = dl_model.predict( np.array(test_emb_doc) )

    result = 1 if predictions[0] > 0.5 else 0

    print( predictions[0] )
    print( "Classification : ", result )
    return result

  raise ValueError( f"Unknown DL_MODEL {DL_MODEL!r}" )

In [33]:
# Try the deep-learning classifier on a single sentence.
DL_MODEL = "xlm-roberta"
# DL_MODEL = "bert-multilingual"
# DL_MODEL = "muril"
# DL_MODEL = "lstm"
# DL_MODEL = "bi-lstm"

dl_tokenizer, dl_model = load_dl_model()
dl_inference( "nee oru loosu", dl_tokenizer, dl_model )

# Meaning of the value returned above:
# 1 -> offensive
# 0 -> not-offensive

tensor([[-0.0059,  0.1491]], grad_fn=<AddmmBackward0>)
Classification :  1


1

In [34]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def load_ml_model():
  """Load the pickled vectorizer and classifier selected by ML_MODEL.

  Reads the module-level ML_MODEL name:
    - 'svm/<variant>': vectorizer from pkl-files/svm/tfidf.pickle and the
      classifier from pkl-files/<ML_MODEL>.pkl.
    - 'naive-bayes': vectorizer from pkl-files/naive-bayes/bayes_tfidf.pickle
      and the classifier from pkl-files/naive-bayes/naive_bayes_model.pkl.

  Returns:
    The tuple (vec, ml_model).

  Raises:
    ValueError: ML_MODEL matches neither case (previously this ended in an
      UnboundLocalError at the return statement).
  """
  pkl_root = "/content/drive/MyDrive/fyp/models/pkl-files"

  if ML_MODEL.split("/")[0] == "svm":
    vec_path = f"{pkl_root}/svm/tfidf.pickle"
    model_path = f"{pkl_root}/{ML_MODEL}.pkl"
  elif ML_MODEL == "naive-bayes":
    vec_path = f"{pkl_root}/naive-bayes/bayes_tfidf.pickle"
    model_path = f"{pkl_root}/naive-bayes/naive_bayes_model.pkl"
  else:
    raise ValueError( f"Unknown ML_MODEL {ML_MODEL!r}" )

  # The freshly constructed TfidfVectorizer objects that used to be assigned
  # here were overwritten by the unpickled one straight away, so they are gone.
  # SECURITY: pickle.load can execute arbitrary code; only point this at files
  # you created yourself.
  with open( vec_path, "rb" ) as handle:
    vec = pickle.load( handle )
  with open( model_path, "rb" ) as handle:
    ml_model = pickle.load( handle )

  return vec, ml_model

def ml_inference( test_sentence, vec, ml_model ):
  """Predict the class label of one sentence and describe it in words.

  test_sentence: the text to classify.
  vec: object whose transform() turns a list of texts into model features.
  ml_model: object whose predict() returns one label per input row.

  Prints the predicted label and returns mapping(label).
  """
  features = vec.transform( [test_sentence] )
  predicted_labels = ml_model.predict( features )
  prediction = predicted_labels[0]  # only one sentence was submitted

  print( "offensive class label: ", prediction )

  return mapping( prediction )

In [35]:
# Try the classical classifier on a single sentence.
ML_MODEL = "svm/rbf_only_offensive"
# ML_MODEL = "svm/poly_only_offensive"
# ML_MODEL = "naive-bayes"

vec, ml_model = load_ml_model()
ml_inference( "nee oru loosu", vec, ml_model )

offensive class label:  2


'offensive to others.'

In [38]:
# inference pipeline
import torch

# example = "indha ponnu waste da"
# example = "matha naatu karangala india kulla vidave kudadhu"
# example = "என்ன டா நீங்க பக பக பக பக னு சங்கு வேற ஊதிக்கிறீங்க🤣🤣🤣symbolic ah சொல்றீங்களோ🤔🤔🤔🤔🤔‍‍‍‍"
# Sentence sent through the two-stage pipeline below.
example = "indha ponnu waste da"
# example = "nee lam uyir vaalave kudadhu"

# Module-level name read by load_dl_model / dl_inference.
DL_MODEL = "xlm-roberta"
# DL_MODEL = "bert-multilingual"
# DL_MODEL = "muril"
# DL_MODEL = "lstm"
# DL_MODEL = "bi-lstm"

# Module-level name read by load_ml_model.
ML_MODEL = "svm/rbf_only_offensive"
# ML_MODEL = "svm/poly_only_offensive"
# ML_MODEL = "naive-bayes"

# Stage 1 gets the cleaned sentence.
clean_text = custom_clean( example )

with torch.no_grad():
    dl_tokenizer, dl_model = load_dl_model()
    prediction = dl_inference( clean_text, dl_tokenizer, dl_model )

    # Stage 2 only runs when stage 1 returned a truthy class.
    if not prediction:
      fine_tag = "Not Offensive"
    else:
      vec, ml_model = load_ml_model()
      # NOTE(review): stage 2 receives the raw `example`, not `clean_text`.
      # Confirm this matches the text the vectorizer was fitted on.
      fine_tag = ml_inference( example, vec, ml_model )
    print( "This sentence is ", fine_tag )

tensor([[-0.0313,  0.1384]])
Classification :  1
offensive class label:  2
This sentence is  offensive to others.


In [None]:
# NOTE(review): this cell has no execution count, and neither `clean` nor
# `sample_inpuut` is defined or imported anywhere in the visible notebook, so it
# raises NameError on a fresh run. `clean` presumably refers to the clean-text
# package installed earlier (`from cleantext import clean`) — confirm, or delete
# the cell.
clean(sample_inpuut, fix_unicode=True, to_ascii=True,
        normalize_whitespace=True, lower=True, no_line_breaks=True, no_emoji=True,lang="en")