In [1]:
import pandas as pd
import os
import torch
import numpy as np
import random

In [2]:
from Config import *
import TextDataFrame as tdf
import utils

# Re-import the local helper modules so edits to TextDataFrame.py / utils.py are
# picked up without restarting the kernel.
# `imp` is deprecated since Python 3.4 and removed in 3.12; importlib.reload is
# the supported equivalent and likewise returns the reloaded module.
import importlib
importlib.reload(tdf)
importlib.reload(utils)

<module 'utils' from '/home/adrian/Projects/Competition/kaggle-toxic-comments-2021/utils.py'>

In [29]:
def set_seed(seed = 42):
    '''Seed every random number generator the notebook relies on.

    Args:
        seed: Integer seed handed to NumPy, Python's `random` module and
            PyTorch (CPU generator and the CUDA generator).

    Side effects:
        Puts cuDNN into deterministic mode, turns off its benchmark mode, and
        writes the seed to the PYTHONHASHSEED environment variable.
    '''
    # Seeded in this order: NumPy, stdlib random, torch CPU, torch CUDA.
    for seed_rng in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_rng(seed)

    # cuDNN needs both switches: deterministic kernels on, benchmark mode off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # PYTHONHASHSEED is read when an interpreter starts, so this reaches processes
    # launched from here on rather than string hashing in the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [28]:
# Apply the default seed (42) to all random number generators.
set_seed()

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

# Dataset processor configured with the project's default cleaning procedure.
df_processor = tdf.DatasetProcessor(DEFAULT_CLEAN_PROCEDURE)

# Character n-gram TF-IDF (3- to 5-grams built inside word boundaries); a term is
# kept only if it occurs in at least 3 documents and in at most 50% of them.
tfidf_tokenizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)

# Ridge regressor with regularisation strength alpha=0.5.
model_ridge = Ridge(alpha=0.5)

### Read Data

In [4]:
# Per-label weights handed to the target builder; 'threat', 'severe_toxic' and
# 'identity_hate' carry the largest weight (1.5).
weights_target_dict = dict(
    obscene=0.16,
    toxic=0.32,
    threat=1.5,
    insult=0.64,
    severe_toxic=1.5,
    identity_hate=1.5,
)

# Name of the DataFrame column that holds the comment text.
text_col = 'comment_text'

In [6]:
# Load train.csv of the toxic-comment-classification-challenge dataset from DATA_PATH.
df_train = pd.read_csv(os.path.join(DATA_PATH,"toxic-comment-classification-challenge/train.csv"))

In [7]:
# Build the regression target column 'y' with the processor's set_target, passing
# the per-label weights (presumably a weighted combination of the label columns —
# TODO confirm in TextDataFrame).
df_train['y'] = df_processor.set_target(df_train, average_weights_dict = weights_target_dict)
# NOTE: this overwrites the text column in place with its cleaned version, so
# re-running the cell feeds already-cleaned text through clean_text again.
df_train[text_col] = df_processor.clean_text(df_train[text_col])

  0%|          | 0/159571 [00:00<?, ?it/s]



In [8]:
# Fit the TF-IDF vectorizer on the cleaned text of the whole training frame;
# fit() returns the vectorizer itself, so the name keeps pointing at the same object.
tfidf_tokenizer = tfidf_tokenizer.fit(df_train[text_col])

In [9]:
# Draw the training sample based on column 'y' (presumably balances zero vs.
# non-zero targets — TODO confirm in utils.sample_binary).
df_train_new = utils.sample_binary(df_train, 'y')

# Vectorise the sampled text with the already-fitted TF-IDF vectorizer.
X = tfidf_tokenizer.transform(df_train_new[text_col])
y = df_train_new['y']

# Fit the ridge regressor on the TF-IDF features; as the last expression of the
# cell, the fitted estimator is also displayed.
model_ridge.fit(X, y)

Ridge(alpha=0.5)

In [10]:
## Pipeline (planned components)
    # sampling class/method
    # tokenizer class
    # model class

In [23]:
def predict(text_series, text_cleaner_func, tokenizer_transform_func, predict_func):
    """Run raw text through cleaning, vectorisation and the model in sequence.

    Args:
        text_series: Raw texts; passed unchanged to `text_cleaner_func`.
        text_cleaner_func: Callable applied first, to `text_series`.
        tokenizer_transform_func: Callable applied to the cleaned text to
            produce the model's input features.
        predict_func: Callable applied to those features.

    Returns:
        Whatever `predict_func` returns for the transformed input.
    """
    stages = (text_cleaner_func, tokenizer_transform_func, predict_func)
    result = text_series
    # Each stage consumes the previous stage's output.
    for stage in stages:
        result = stage(result)
    return result
def evaluate():
    """Score the fitted ridge model on the pairwise validation set.

    Reads `toxic-severity-rating/validation_data.csv` below `DATA_PATH`, scores
    the `less_toxic` and `more_toxic` columns with the notebook-level cleaner,
    TF-IDF vectorizer and ridge model, and compares the two score vectors
    element-wise.

    Returns:
        dict with two entries:
        "% Correct" — share of pairs whose `less_toxic` score is strictly lower
        than the `more_toxic` score;
        "% Equal" — share of pairs with exactly equal scores.
        Both are means of boolean comparisons, i.e. fractions rather than
        percentages.
    """
    val_file = os.path.join(DATA_PATH,"toxic-severity-rating/validation_data.csv")
    df_val = pd.read_csv(val_file)

    def score(column):
        # Both sides of every pair go through the same cleaner, vectorizer and model.
        return predict(
            df_val[column],
            df_processor.clean_text,
            tfidf_tokenizer.transform,
            model_ridge.predict,
        )

    less_scores = score('less_toxic')
    more_scores = score('more_toxic')

    return {
        "% Correct": (less_scores < more_scores).mean(),
        "% Equal": (less_scores == more_scores).mean(),
    }

In [24]:
evaluate()

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

{'% Correct': 0.6691576989504451, '% Equal': 0.0}

In [30]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from scipy import sparse

class LengthTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer emitting a single feature: the normalised length of each item.

    Every input item `x` is mapped to `(len(x) - center) / scale`. The defaults
    (360 and 550) are the constants that used to be hard-coded in `transform`,
    so `LengthTransformer()` behaves as before.

    Args:
        center: Value subtracted from each length.
        scale: Divisor applied after centering.
    """

    def __init__(self, center=360, scale=550):
        # Stored under the argument names so BaseEstimator.get_params()/clone() work.
        self.center = center
        self.scale = scale

    def fit(self, X, y=None):
        """Nothing is learned; returns self so the transformer chains in pipelines."""
        return self

    def transform(self, X):
        """Return an (n_samples, 1) CSR matrix with the normalised length of each item.

        Args:
            X: Iterable of sized items (e.g. strings).
        """
        lengths = np.fromiter((len(x) for x in X), dtype=float)
        normalised = (lengths - self.center) / self.scale
        # Reshaping to a column keeps the shape (0, 1) for empty input; building the
        # matrix from an empty list would give (1, 0) and break horizontal stacking
        # with the other feature blocks.
        return sparse.csr_matrix(normalised.reshape(-1, 1))

    def get_feature_names(self):
        """Feature-name hook of older scikit-learn releases (removed in 1.2)."""
        return ["lngth"]

    def get_feature_names_out(self, input_features=None):
        """Feature-name hook of scikit-learn >= 1.0.

        FeatureUnion.get_feature_names_out() requires it on every transformer.
        `input_features` is accepted for API compatibility and ignored.
        """
        return np.asarray(self.get_feature_names(), dtype=object)
    
# Feature extraction: the length feature side by side with character n-gram TF-IDF.
# Disabled alternatives are kept as comments.
features = FeatureUnion(
    transformer_list=[
        ('vect1', LengthTransformer()),
        #('vect2', LengthUpperTransformer()),
        (
            "vect3",
            TfidfVectorizer(
                analyzer='char_wb',
                ngram_range=(3, 5),
                min_df=3,
                max_df=0.5,
            ),
        ),
        #("vect4", TfidfVectorizer(min_df= 5, max_df=0.5, analyzer = 'word', token_pattern=r'(?u)\b\w{8,}\b')),
    ]
)

# Full model: the feature union followed by a ridge regressor with default settings.
pipeline = Pipeline(
    steps=[
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
        #("clf",LinearRegression())
    ]
)

## Weights of features
# The regressor only exposes `coef_` after fitting; reading it from the freshly
# built pipeline raises AttributeError. Fit on the same sampled training text and
# target that the standalone ridge model was trained on.
pipeline.fit(df_train_new[text_col], y)

# get_feature_names() was removed in scikit-learn 1.2. Prefer it where it still
# exists, because get_feature_names_out() requires every transformer in the union
# to implement that method; fall back to get_feature_names_out() otherwise.
_union = pipeline['features']
_feature_names = (
    _union.get_feature_names()
    if hasattr(_union, 'get_feature_names')
    else _union.get_feature_names_out()
)

# (feature name, coefficient rounded to 2 decimals) pairs, largest weight first.
feature_wts = sorted(
    zip(_feature_names, np.round(pipeline['clf'].coef_, 2)),
    key=lambda pair: pair[1],
    reverse=True,
)