In [15]:
import pandas as pd
import numpy as np
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from nltk.corpus import stopwords
from sklearn.metrics import *
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.regexp import RegexpTokenizer
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression

In [2]:
df='/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
df2=pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip")

In [3]:
df2

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
stop_words=stopwords.words("english")

In [5]:
df2["comment_text"]=df2.comment_text.apply(
    lambda row:re.sub(r'[^a-z0-9]+',"",row.lower()).strip()
)

In [6]:
stemmer=SnowballStemmer('english')
tokenizer=RegexpTokenizer(r'\w{2,}')
def preprocessing(text):
    list_words=[]
    word=tokenizer.tokenize(text)
    for w in word:
        if w not in stop_words:
            x=stemmer.stem(w)
            list_words.append(x)
    new_list=''.join(list_words)
    return new_list
            

In [7]:
corpus=[preprocessing(text) for text in tqdm(df2.comment_text)]

  0%|          | 0/159571 [00:00<?, ?it/s]

In [8]:
df2["corpus"]=corpus
df2

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,corpus
0,0000997932d777bf,explanationwhytheeditsmadeundermyusernamehardc...,0,0,0,0,0,0,explanationwhytheeditsmadeundermyusernamehardc...
1,000103f0d9cfb60f,dawwhematchesthisbackgroundcolourimseeminglyst...,0,0,0,0,0,0,dawwhematchesthisbackgroundcolourimseeminglyst...
2,000113f07ec002fd,heymanimreallynottryingtoeditwaritsjustthatthi...,0,0,0,0,0,0,heymanimreallynottryingtoeditwaritsjustthatthi...
3,0001b41b1c6bb37e,moreicantmakeanyrealsuggestionsonimprovementiw...,0,0,0,0,0,0,moreicantmakeanyrealsuggestionsonimprovementiw...
4,0001d958c54c6e35,yousiraremyheroanychanceyourememberwhatpagetha...,0,0,0,0,0,0,yousiraremyheroanychanceyourememberwhatpagetha...
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,andforthesecondtimeofaskingwhenyourviewcomplet...,0,0,0,0,0,0,andforthesecondtimeofaskingwhenyourviewcomplet...
159567,ffea4adeee384e90,youshouldbeashamedofyourselfthatisahorriblethi...,0,0,0,0,0,0,youshouldbeashamedofyourselfthatisahorriblethi...
159568,ffee36eab5c267c9,spitzerummtheresnoactualarticleforprostitution...,0,0,0,0,0,0,spitzerummtheresnoactualarticleforprostitution...
159569,fff125370e4aaaf3,anditlookslikeitwasactuallyyouwhoputonthespeed...,0,0,0,0,0,0,anditlookslikeitwasactuallyyouwhoputonthespeed...


In [9]:
# new_df['comment_text']=new_df["comment_text"].str.lower()

In [10]:
X_train, X_test = train_test_split(
    df2['corpus'], test_size = 0.4, random_state = 2024
) 

X_valid, X_test = train_test_split(
    X_test, test_size = 0.5, random_state = 2024
) 

In [11]:
vec = TfidfVectorizer(stop_words=stop_words)
features_train = vec.fit_transform(X_train)
features_test = vec.transform(X_test)
features_valid = vec.transform(X_valid)

In [12]:
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [13]:
models_list = []
for col in tqdm(target_cols):
    clf = LogisticRegression(class_weight='balanced', n_jobs=-1)
    clf.fit(features_train, df2.loc[X_train.index, col])
    
    models_list += [(col, clf)]

  0%|          | 0/6 [00:00<?, ?it/s]

In [17]:
for item in models_list:
    col, clf = item
    print(col)
    print(f'train f1: {f1_score(df2.loc[X_train.index, col], clf.predict(features_train))}')
    print(f'test f1: {f1_score(df2.loc[X_test.index, col], clf.predict(features_test))}')
    print(f'valid f1: {f1_score(df2.loc[X_valid.index, col], clf.predict(features_valid))}' + '\n'*2)

toxic
train f1: 0.9995079547318353
test f1: 0.01420271142672692
valid f1: 0.012861736334405146


severe_toxic
train f1: 0.9955022488755623
test f1: 0.006944444444444445
valid f1: 0.0


obscene
train f1: 0.9992166079122601
test f1: 0.0085995085995086
valid f1: 0.012621916236374068


threat
train f1: 0.9911190053285969
test f1: 0.03669724770642201
valid f1: 0.0


insult
train f1: 0.9983111674055309
test f1: 0.009120521172638436
valid f1: 0.010989010989010988


identity_hate
train f1: 0.998282770463652
test f1: 0.016
valid f1: 0.01384083044982699


