In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
# import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# import preprocessing libraries
import re
import string
import spacy

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input mount and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading training and test data

In [None]:
%cd /kaggle/input

In [None]:
# Read the four zipped competition CSVs in one pass. The paths are relative
# to the current working directory.
data_train, data_test, test_label, sample_submission = (
    pd.read_csv(archive)
    for archive in (
        'jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        'jigsaw-toxic-comment-classification-challenge/test.csv.zip',
        'jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip',
        'jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip',
    )
)

In [None]:
# Both resources are read with an explicit UTF-8 encoding so the result does
# not depend on the platform's default locale encoding.

# JSON lookup table (presumably word -> lemma; verify against the data file).
with open("text-preprocessing-tools-light/lemmatization-en.json", encoding="utf-8") as lemma_file:
    lemmatizer = json.load(lemma_file)

# One stopword per line; kept as a list.
with open("text-preprocessing-tools-light/stopwords.txt", encoding="utf-8") as stopword_file:
    stopwords = stopword_file.read().splitlines()

In [None]:
# Display the sample submission frame; its `id` column is reused when the final submission is written.
sample_submission

## Examine the data (EDA)

In [None]:
# Display the training frame as loaded from train.csv.zip.
data_train

In [None]:
# Summary statistics (count, mean, std, quantiles) of the training frame's numeric columns.
data_train.describe()

In [None]:
# Print column dtypes and non-null counts of the training frame.
data_train.info()

In [None]:
# Group the comment texts by label: data_classify['<label>_cmt'] holds the
# comment_text of every training row whose <label> column equals 1.
data_classify = {}
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for col in classes:
    # A single .loc call selects rows and column together (no chained indexing).
    data_classify[col + '_cmt'] = data_train.loc[data_train[col] == 1, 'comment_text'].values
#     data_classify[col + '_cmt_len'] = [len(x) for x in data_classify[col + '_cmt']]
#     data_classify[col + '_cmt_word_count'] = [len(x.split()) for x in data_classify[col + '_cmt']]
#     data_classify[col + '_cmt_unique_word_count'] = [len(set(x.split())) for x in data_classify[col + '_cmt']]
#     data_classify[col + '_cmt_stopword_count'] = [len([w for w in x.split() if w in STOPWORDS]) for x in data_classify[col + '_cmt']]

In [None]:
# Display the first comment stored for the identity_hate label.
data_classify['identity_hate_cmt'][0]

In [None]:
# data_classify.head()

In [None]:
# Report how many training comments were collected for each label.
for col in classes:
    n_comments = len(data_classify[f'{col}_cmt'])
    print(col, n_comments)

## Visualize meta features of each class

In [None]:
# clean_cmt = data_train[data_train[classes].sum(axis = 1) == 0]
# clean_cmt
# Work on a copy so the engineered columns below never touch data_train.
df = data_train.copy()
# A comment is "clean" when none of its label columns is set. Selecting the
# labels by name (rather than "every column from position 2 onward") keeps the
# flag correct if the column order changes or extra columns are added.
df['clean'] = df[classes].sum(axis=1) == 0

In [None]:
## Indirect (meta) features derived from the comment text

# Names of every engineered column created in this cell.
META_FEATURES  = ['count_sent', 'count_word', 'count_unique_word', 'count_letters', 'count_punctuations', 'count_words_upper', 'count_words_title', 'count_stopwords', 'mean_word_len', 'word_unique_percent', 'punct_percent']

# Shared intermediates, computed once instead of re-splitting every comment
# for each feature.
_comments = df["comment_text"].astype(str)
_tokens = _comments.str.split()
# Sets give constant-time membership tests; `stopwords` itself is left untouched.
_stopword_set = set(stopwords)
_punctuation = set(string.punctuation)

# Sentence count: approximated by the number of newline characters + 1
df['count_sent'] = _comments.str.count("\n") + 1
# Word count (whitespace-separated tokens)
df['count_word'] = _tokens.str.len()
# Unique word count
df['count_unique_word'] = _tokens.apply(lambda words: len(set(words)))
# Character count (every character, not only letters)
df['count_letters'] = _comments.str.len()
# Punctuation character count
df["count_punctuations"] = _comments.apply(lambda text: sum(ch in _punctuation for ch in text))
# Upper-case word count
df["count_words_upper"] = _tokens.apply(lambda words: sum(w.isupper() for w in words))
# Title-case word count
df["count_words_title"] = _tokens.apply(lambda words: sum(w.istitle() for w in words))
# Stopword count (case-insensitive)
df["count_stopwords"] = _comments.str.lower().str.split().apply(
    lambda words: sum(w in _stopword_set for w in words)
)
# Average word length; comments without any word yield NaN (the same value
# np.mean([]) produced before, but without the RuntimeWarning).
df["mean_word_len"] = _tokens.apply(
    lambda words: np.mean([len(w) for w in words]) if words else np.nan
)
# Unique words as a percentage of all words in the comment
df['word_unique_percent'] = df['count_unique_word'] * 100 / df['count_word']
# Punctuation characters per 100 words
df['punct_percent'] = df['count_punctuations'] * 100 / df['count_word']

#### Clip extreme values

In [None]:
# Upper bound applied to each heavy-tailed feature; any value above its bound
# is replaced by the bound itself.
FEATURE_CAPS = {
    'count_sent': 10,
    'count_word': 200,
    'count_unique_word': 200,
    'count_letters': 1000,
    'count_punctuations': 50,
    'count_words_upper': 30,
    'count_words_title': 30,
    'count_stopwords': 100,
    'mean_word_len': 10,
}

for feature, cap in FEATURE_CAPS.items():
    above_cap = df[feature] > cap
    df.loc[above_cap, feature] = cap

In [None]:
# 2x2 overview of the meta features, split by the `clean` flag.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# The three violin panels differ only in title and plotted columns.
violin_panels = [
    (axes[0, 0], 'Word count', ['count_word', 'count_unique_word']),
    (axes[0, 1], 'Letter count', ['count_letters']),
    (axes[1, 0], 'Sentence count', ['count_sent']),
]
for ax, title, features in violin_panels:
    # Long format: one row per (comment, feature) pair, keeping the clean flag.
    melted = pd.melt(df, value_vars=features, id_vars='clean')
    sns.violinplot(data=melted, y='value', x='variable', hue='clean',
                   split=True, inner='quart', ax=ax)
    ax.set_title(title)
# As before, only the sentence-count panel gets an explicit legend() call.
axes[1, 0].legend()

# Density of the unique-word percentage for flagged ("Bad") vs clean comments.
ax = axes[1, 1]
ax.set_title('Unique word percent')
sns.kdeplot(df.loc[~df['clean'], 'word_unique_percent'], label="Bad", ax=ax)
sns.kdeplot(df.loc[df['clean'], 'word_unique_percent'], label="Clean", ax=ax)
ax.legend()


### Length

In [None]:
# plt.figure(figsize = (20, 15))
# feat = '_cmt_len'
# for i, col in enumerate(classes):
#     plt.subplot(2, 3, i + 1)
#     plt.title(col + feat)
#     for i, x in enumerate(data_classify[col + feat]):
#         if x > 1000:
#             data_classify[col + feat][i] = 1000
#     sns.violinplot(data_classify[col + feat])
# # plt.show()

### Word Count

In [None]:
# plt.figure(figsize = (20, 15))
# feat = '_cmt_word_count'
# for i, col in enumerate(classes):
#     plt.subplot(2, 3, i + 1)
#     plt.title(col + feat)
#     for i, x in enumerate(data_classify[col + feat]):
#         if x > 200:
#             data_classify[col + feat][i] = 200
#     sns.violinplot(data_classify[col + feat])
# # plt.show()

### Unique Word Count

In [None]:
# plt.figure(figsize = (20, 15))
# feat = '_cmt_unique_word_count'
# for i, col in enumerate(classes):
#     plt.subplot(2, 3, i + 1)
#     plt.title(col + feat)
#     for i, x in enumerate(data_classify[col + feat]):
#         if x > 100:
#             data_classify[col + feat][i] = 100
#     sns.violinplot(data_classify[col + feat])
# # plt.show()

## Correlation of features and targets

In [None]:
# Pearson correlation between the label columns, which treats the labels as
# numeric values; the following cell computes Cramér's V as a categorical
# association measure for comparison.
label_corr = data_train[classes].astype(float).corr()

plt.figure(figsize=(7, 7))
plt.title('Correlation of features & targets', y=1.05, size=14)
sns.heatmap(
    label_corr,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=plt.cm.plasma,
    linecolor='white',
    annot=True,
)

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Contingency table of the two labels (rows: toxic, columns: severe_toxic)
cross_tab = pd.crosstab(df['toxic'], df['severe_toxic'])

# Chi-square statistic WITHOUT Yates' continuity correction. SciPy applies the
# correction by default whenever the table has one degree of freedom (2x2),
# which shrinks chi2 and therefore biases Cramér's V downward.
chi2, p, _, _ = chi2_contingency(cross_tab, correction=False)

# Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
n = cross_tab.sum().sum()
v = np.sqrt(chi2 / (n * (min(cross_tab.shape) - 1)))

# Print the association strength
print("Cramér's V:", v)


## Clean the text

In [None]:
# Lower-case contraction -> expanded form.
# Fixes relative to the previous table:
#   * "i'd" was listed twice ("I would", then "I had"); a dict keeps only the
#     last entry, so one value was silently dropped. A single entry remains,
#     using "would" like every other "'d" contraction in the table.
#   * "didn't" was listed twice with the same value; the duplicate is removed.
#   * "we'll" expanded to " will", dropping the subject; it is now "we will".
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll": "we will",
"tryin'":"trying"
}

In [None]:
# nlp = spacy.load("en_core_web_sm")

def split_numbers_from_characters(text):
    """Put a space on each side of every run of digits in ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with each digit run surrounded by single spaces (e.g.
        ``"abc123def"`` -> ``"abc 123 def"``), or ``text`` unchanged when it
        contains no digits.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    # The capturing group makes re.split keep the digit runs in the result.
    parts = re.split(r'(\d+)', text)

    # More than one part means at least one digit run was found.
    if len(parts) > 1:
        return ' '.join(parts)

    # No digits: return the original text
    return text

def clean_text(text, contractions=None):
    """Normalise one comment into lower-case, space-separated word tokens.

    Steps, in order: lower-case; drop non-ASCII characters; replace real and
    escaped line breaks with spaces; expand contractions; strip punctuation;
    strip digits; collapse whitespace.

    Contractions are expanded BEFORE punctuation is stripped. Previously the
    apostrophe was removed first, so no token could ever match a key of the
    contraction table and the expansion step never fired.

    Optional steps the author left disabled and that are still not applied:
    IP/username removal, emoji removal, lemmatization, stopword removal.

    Args:
        text: Raw comment. ``None``/NaN are treated as an empty comment; any
            other non-string value is converted with ``str()``.
        contractions: Optional mapping of lower-case contraction -> expansion.
            Defaults to the module-level ``APPO`` table.

    Returns:
        The cleaned comment as a single string.
    """
    if contractions is None:
        contractions = APPO
    if not isinstance(text, str):
        # `text != text` is only true for NaN.
        text = '' if text is None or text != text else str(text)

    # Convert to lowercase
    text = text.lower()
    # Keep typographic apostrophes as ASCII ones; otherwise the non-ASCII
    # filter below would split contractions such as "don’t" into "don t".
    text = text.replace('\u2019', "'")
    # Remove non ascii characters
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Replace real line breaks and literal "\n" / "\r" sequences with a space.
    # (The previous pattern carried stray "/" delimiters, so a bare "\r" or a
    # literal "\n" was only matched when adjacent to a slash.)
    text = re.sub(r'(?:\\n|\\r|\n|\r)+', ' ', text)

    # Expand contractions token by token. A token is looked up as-is first
    # (keys such as "tryin'" or "'re" start or end with an apostrophe), then
    # with surrounding punctuation stripped so that "don't," still matches.
    expanded = []
    for token in text.split():
        if token in contractions:
            replacement = contractions[token]
        else:
            replacement = contractions.get(token.strip(string.punctuation), token)
        # Expansions such as "I would" are lower-cased to match the rest.
        expanded.append(replacement.lower())
    text = ' '.join(expanded)

    # Remove punctuation (NOTE: '<' and '>' are not part of this class)
    text = re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', ' ', text)

    # Remove numbers. Replacing every digit with a space also separates digits
    # from adjacent letters, so no separate splitting pass is needed.
    text = re.sub(r'\d', ' ', text)

    # Collapse runs of whitespace into single spaces
    return ' '.join(text.split())

In [None]:
# Display the array of comments stored for the identity_hate label.
data_classify['identity_hate_cmt']

In [None]:
# Spot-check clean_text on the first identity_hate comment.
clean_text(data_classify['identity_hate_cmt'][0])

In [None]:
# Clean the comment column of both frames in place.
for frame in (data_train, data_test):
    frame['comment_text'] = frame['comment_text'].map(clean_text)

# Everything except the id and the text is a target column.
labels = data_train.drop(columns=['id', 'comment_text'])
data_train.head()

In [None]:
# Display the training comments (after the cleaning cell above) as a NumPy array.
data_train['comment_text'].values

In [None]:
# combined = pd.concat([data_train['comment_text'], data_test['comment_text']], axis=0)
# combined.describe()

In [None]:
# %cd d:/python/Toxic-comment-classification

In [None]:
# data_train.to_json('clean_data/data_train_cleaned_ver2.json')
# data_test.to_json('clean_data/data_test_cleaned_ver2.json')
# # labels.to_json('clean_data/labels.json')

In [None]:
# try:
#     data_train = pd.read_json('clean_data/data_train_cleaned.json')
#     data_test = pd.read_json('clean_data/data_test_cleaned.json')
#     labels = pd.read_json('clean_data/labels.json')
# except:
#     print('No such file')


## Split the data

In [None]:
# from skmultilearn.model_selection import iterative_train_test_split
# X_train, y_train, X_test, y_test = iterative_train_test_split(data_train, labels, test_size = 0.1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    data_train,
    labels,
    test_size=0.2,
    random_state=42,
)

## Apply TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word unigrams and bigrams. The vocabulary is fitted on the
# training split only; validation and test comments are transformed with it.
tfidf_options = dict(
    max_features=100000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
vec = TfidfVectorizer(**tfidf_options)
train_term_doc = vec.fit_transform(X_train['comment_text'])
val_term_doc, test_term_doc = (
    vec.transform(part['comment_text']) for part in (X_val, data_test)
)
vec.get_feature_names_out()

In [None]:
# Shape of the feature-name array, i.e. the number of terms the vectorizer kept.
vec.get_feature_names_out().shape

In [None]:
# Display the train and test term-document matrices produced by the vectorizer.
train_term_doc, test_term_doc

In [None]:
# from scipy.sparse import csr_matrix, hstack
# stack meta features with term-doc matrix
# x = hstack(trn_term_doc, df[META_FEATURES]).tocsr()
# From here on X_train / X_val refer to the TF-IDF matrices instead of the
# split DataFrames; the temporary names are dropped afterwards.
X_train, X_val = train_term_doc, val_term_doc
del train_term_doc, val_term_doc

## Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: logistic regression on log-count-ratio scaled features.

    ``fit`` computes, for every feature, the log ratio ``r`` between its
    smoothed average value in the positive rows and in the negative rows,
    multiplies each feature column by ``r`` and trains a
    ``LogisticRegression`` on the scaled matrix. Despite the class name, the
    underlying linear model is logistic regression.

    Parameters:
        C: Inverse regularisation strength passed to ``LogisticRegression``.
        solver: Solver name passed to ``LogisticRegression``.
        dual: ``dual`` flag passed to ``LogisticRegression``.
        n_jobs: ``n_jobs`` passed to ``LogisticRegression``.

    Attributes set by ``fit``:
        _r: 1 x n_features sparse matrix of log-count ratios.
        _clf: The fitted ``LogisticRegression``.
        classes_: Class labels seen during ``fit`` (copied from ``_clf``).
    """

    def __init__(self, C=1.0, solver = 'lbfgs', dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.solver = solver
        self.n_jobs = n_jobs

    def _scale(self, x):
        """Multiply every feature column of ``x`` by its fitted log-count ratio."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        # Converting to CSR lets dense input work too (ndarray has no .multiply).
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Return the predicted class label for every row of ``x``."""
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Return class probabilities for every row of ``x`` (columns follow ``classes_``)."""
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Fit the ratio vector and the logistic regression.

        Args:
            x: Feature matrix (sparse or dense), shape (n_samples, n_features).
            y: Binary targets with values 0/1; a pandas Series, NumPy array or
               list are all accepted.

        Returns:
            self
        """
        # check_X_y validates shapes and converts y to a 1-D ndarray, so no
        # pandas-specific `.values` access is needed (it broke ndarray/list y).
        x, y = check_X_y(x, y, accept_sparse=True)
        x = sparse.csr_matrix(x)

        def pr(y_i):
            # Add-one smoothed per-feature sum over the rows with label y_i,
            # divided by the smoothed number of such rows.
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(1) / pr(0)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(solver = self.solver, C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        # Conventional fitted attribute for scikit-learn classifiers.
        self.classes_ = self._clf.classes_
        return self


In [None]:
# def pr(y_i, y):
#     p = x[y==y_i].sum(0)
#     return (p+1) / ((y==y_i).sum()+1)

# def get_mdl(y):
#     y = y.values
#     r = np.log(pr(1,y) / pr(0,y))
#     model = LogisticRegression()
#     x_nb = x.multiply(r)
#     return model.fit(x_nb, y), r

# def get_model(y):
#     y = y.values
#     model = MultinomialNB()
#     return model.fit(x, y)

In [None]:
# preds = np.zeros((len(test), len(classes)))
# Train one binary model per label and report its validation ROC AUC.
models = []
for col in classes:
    print(col)
    model = NbSvmClassifier(solver = 'liblinear', C=4, n_jobs=-1).fit(X_train, y_train[col])
    models.append(model)
    # Probability of the positive class for every validation row.
    preds = model.predict_proba(X_val)[:, 1]
    # Score against THIS label's column. Passing the whole six-column y_val
    # frame (as before) does not produce the AUC of the label being trained.
    print(roc_auc_score(y_val[col], preds))

In [None]:
# preds = np.zeros((len(test), len(clas)))
# model = []
# for i, col in enumerate(classes):
#     print(col)
#     model.append(get_model(train_labels[col]))
#     preds= model[i].predict_proba(val_x)[:,1].reshape(-1, 1)
#     print(roc_auc_score(valid_labels, preds))

In [None]:
# One column of positive-class probabilities per label, in `classes` order.
preds = np.zeros((data_test.shape[0], len(classes)))

for column_idx, (col, model) in enumerate(zip(classes, models)):
    print(col)
    positive_proba = model.predict_proba(test_term_doc)[:, 1]
    preds[:, column_idx] = positive_proba

In [None]:
%cd /kaggle/working

In [None]:
# Assemble the submission: ids from the sample submission followed by one
# probability column per label, then write it to the working directory.
submid = sample_submission[['id']].copy()
label_probs = pd.DataFrame(preds, columns=classes)
submission = pd.concat([submid, label_probs], axis=1)
submission.to_csv('submission.csv', index=False)