In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Let pandas display up to 300 characters per cell before truncating.
pd.options.display.max_colwidth=300
# Let pandas display up to 100 columns before eliding the middle ones.
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge

In [3]:
!ls data

multilingual-toxic-comment-classification
ruddit-jigsaw-dataset
temporary
toxic-comment-classification-challenge
toxic-severity-rating
unintended-bias-in-toxicity-classification


### Read Data

In [6]:
# Training frame: train.csv from the toxic-comment-classification-challenge folder.
df_train = pd.read_csv("data/toxic-comment-classification-challenge/train.csv")
# Submission frame: comments_to_score.csv from the toxic-severity-rating folder.
df_sub = pd.read_csv("data/toxic-severity-rating/comments_to_score.csv")

In [16]:
def diffList(new_list, base_list):
    """Return the distinct items of ``new_list`` that are not in ``base_list``.

    The result keeps the order in which items first appear in ``new_list``,
    so it is reproducible across kernel restarts (a plain set difference
    yields an arbitrary order that can change between sessions for strings).

    Parameters
    ----------
    new_list : iterable of hashable
        Candidate items.
    base_list : iterable of hashable
        Items to exclude.

    Returns
    -------
    list
        De-duplicated items of ``new_list`` absent from ``base_list``.
    """
    excluded = set(base_list)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [item for item in dict.fromkeys(new_list) if item not in excluded]

In [53]:
def weightedSum(data, weight_dict):
    """Return the per-row weighted sum of the columns named in ``weight_dict``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains every column named in ``weight_dict``.
    weight_dict : dict
        Maps column name -> numeric weight.

    Returns
    -------
    pandas.Series
        Indexed like ``data``; each value is
        ``sum(data[col] * weight for col, weight in weight_dict.items())``.

    Raises
    ------
    KeyError
        If a key of ``weight_dict`` is not a column of ``data``.
    """
    # A Series keyed by column name makes pandas pair each weight with its
    # column by label. np.array(weight_dict) would only wrap the dict in a
    # 0-d object array instead of producing a vector of weights.
    weights = pd.Series(weight_dict)
    selected = data[list(weight_dict)]
    return selected.mul(weights, axis=1).sum(axis=1)

In [54]:
# Weight per label column, keyed by column name; passed to weightedSum to build a weighted score.
# NOTE(review): the rationale for these specific values is not recorded in this notebook.
weights_target_dict = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,  'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

In [56]:
# Preview the weighted score for every row of df_train as a one-column frame named 'y'
# (displayed only; the result is not stored).
weightedSum(df_train, weights_target_dict).to_frame('y')

Unnamed: 0,y
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
159566,0.0
159567,0.0
159568,0.0
159569,0.0


In [62]:
class TextDataSet:
    """Wrapper around a DataFrame that records which column holds the target."""

    def __init__(self, data):
        """Keep a reference to ``data`` (not copied; ``set_target`` mutates it)."""
        self.data = data
        # Name of the target column; stays None until set_target is called.
        self.y_col = None

    def set_target(self, response_col = None, average_weights_dict = None, y_col = 'y'):
        """Define the target column of ``self.data`` and remember its name.

        Exactly one of ``response_col`` / ``average_weights_dict`` must be given.

        Parameters
        ----------
        response_col : str, optional
            Existing column to use as the target; it is renamed to ``y_col``.
        average_weights_dict : dict, optional
            Column name -> weight. The target is the per-row weighted sum of
            those columns (computed by ``weightedSum``) stored under ``y_col``.
        y_col : str, default 'y'
            Name of the resulting target column.

        Raises
        ------
        AssertionError
            If both or neither source is given, if an argument has the wrong
            type, or if ``response_col`` is not a column of ``self.data``.

        Notes
        -----
        ``self.data`` is modified in place, so the frame passed to the
        constructor changes as well.
        """
        assert (response_col is None) ^ (average_weights_dict is None), \
            "pass exactly one of response_col or average_weights_dict"
        if average_weights_dict is not None:
            assert isinstance(average_weights_dict, dict)
            # Use the weights passed in, not a notebook-level global.
            self.data[y_col] = weightedSum(self.data, average_weights_dict)
        else:
            assert isinstance(response_col, str)
            assert response_col in self.data.columns
            self.data.rename(columns = {response_col: y_col}, inplace = True)
        # Record the target name for both branches.
        self.y_col = y_col

In [None]:
def removeHTMLTags(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The string is parsed with BeautifulSoup using the 'lxml' parser and the
    text of the parsed document is returned.
    """
    return BeautifulSoup(text, 'lxml').get_text()
def removeURL(text):
    """Delete web links from ``text``.

    A link is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
def removeEmoji(text):
    """Delete emoji and pictographic symbols from ``text``.

    Only the code point ranges listed in the pattern are removed. The earlier
    single range U+24C2..U+1F251 is replaced by targeted ranges because it
    also covered CJK ideographs, Hangul, kana, fullwidth forms and other
    ordinary text, which was silently deleted.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every matched character removed (nothing is inserted
        in its place).
    """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
                               u"\U00002600-\U000026FF"  # miscellaneous symbols
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2"             # circled latin capital letter M
                               u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                               u"\U0000FE0F"             # variation selector-16 (emoji presentation)
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
def removeSpecialCharacters(text):
    """Replace every character that is not an ASCII letter or a digit with a space."""
    non_alnum = re.compile(r"[^a-zA-Z\d]")
    return non_alnum.sub(" ", text)
def removeExtraSpaces(text):
    """Collapse each run of consecutive space characters into a single space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', text)
def removeBegingEndSpace(text):
    """Return ``text`` without leading and trailing whitespace."""
    trimmed = text.strip()
    return trimmed

In [None]:
def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP by applying, in this order:
    1. Remove embedded URL links
    2. Remove HTML tags
    3. Remove emojis
    4. Replace special characters (anything that is not an ASCII letter or a digit) with a space
    5. Collapse runs of spaces into a single space
    6. Strip whitespace at the beginning and at the end

    text - Text piece to be cleaned.

    Returns the cleaned string.
    '''
    # Each step is the single-purpose helper defined in this notebook, so the
    # patterns live in one place instead of being copied here.
    # NOTE(review): URLs are removed before the HTML is parsed, so a link inside
    # a tag attribute (e.g. <a href="http://...">) is cut up to the next
    # whitespace and may leave a broken tag behind; confirm this order is intended.
    cleaning_steps = (
        removeURL,
        removeHTMLTags,
        removeEmoji,
        removeSpecialCharacters,
        removeExtraSpaces,
        removeBegingEndSpace,
    )
    for step in cleaning_steps:
        text = step(text)
    return text