In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.multioutput import MultiOutputClassifier
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier




In [2]:
df = pd.read_csv('data/train.csv/train.csv')
total = df.shape[0]
print('rows:', df.shape[0])
# toxic,severe_toxic,obscene,threat,insult,identity_hate
print('# toxic:', df[(df['toxic'] == 1) | (df['severe_toxic'] == 1) | (df['obscene'] == 1) | (df['threat'] == 1) | (df['insult'] == 1) | (df['identity_hate'] == 1)].shape[0])

print('# toxic:', 1-df[(df['toxic'] == 1)].shape[0]/total)
print('# severe_toxic:', 1-df[(df['severe_toxic'] == 1)].shape[0]/total)
print('# obscene:', 1-df[(df['obscene'] == 1)].shape[0]/total)
print('# threat:', 1-df[(df['threat'] == 1)].shape[0]/total)
print('# insult:', 1-df[(df['insult'] == 1)].shape[0]/total)
print('# identity_hate:', 1-df[(df['identity_hate'] == 1)].shape[0]/total)

rows: 159571
# toxic: 16225
# toxic: 0.9041555169799024
# severe_toxic: 0.9900044494300343
# obscene: 0.947051782592075
# threat: 0.9970044682304429
# insult: 0.9506363938309592
# identity_hate: 0.9911951419744189


In [3]:
''' 
resampling - undersampling
'''

# Count the number of labels per row (you might already have this)
df['label_count'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)

# Separate the minority and majority instances
minority_df = df[df['label_count'] > 0]     # s = 16k
majority_df = df[df['label_count'] == 0]

# Under-sample the majority dataframe
sampled_majority_df = majority_df.sample(n=len(minority_df))

# Combine back the minority and downsampled majority instances
balanced_df = pd.concat([minority_df, sampled_majority_df])

# Now balanced_df is the under-sampled DataFrame
df = balanced_df
print(df.shape[0])

32450


In [4]:
total = df.shape[0]
print('rows:', df.shape[0])
# toxic,severe_toxic,obscene,threat,insult,identity_hate
print('# toxicity:', df[(df['toxic'] == 1) | (df['severe_toxic'] == 1) | (df['obscene'] == 1) | (df['threat'] == 1) | (df['insult'] == 1) | (df['identity_hate'] == 1)].shape[0])

print('# toxic:', 1-df[(df['toxic'] == 1)].shape[0]/total)
print('# severe_toxic:', 1-df[(df['severe_toxic'] == 1)].shape[0]/total)
print('# obscene:', 1-df[(df['obscene'] == 1)].shape[0]/total)
print('# threat:', 1-df[(df['threat'] == 1)].shape[0]/total)
print('# insult:', 1-df[(df['insult'] == 1)].shape[0]/total)
print('# identity_hate:', 1-df[(df['identity_hate'] == 1)].shape[0]/total)

rows: 32450
# toxicity: 16225
# toxic: 0.5286902927580894
# severe_toxic: 0.9508474576271186
# obscene: 0.7396302003081664
# threat: 0.9852696456086286
# insult: 0.7572573189522342
# identity_hate: 0.9567026194144839


In [5]:
'''
Basic Data Cleaning and Preprocessing
NOTE:
run time: 30 to 40 sec
Tokenization: turn text into tokens
Lemmatization: extracting a word's base form (ex: running -> run)
'''


# remove line breaks and special characters
df['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'\n', ' ', x))
df['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
print(df['comment_text'][6])

# Convert to lowercase
df['comment_text'] = df['comment_text'].apply(lambda x: x.lower())
print(df['comment_text'][6])

# Tokenization
df['comment_text'] = df['comment_text'].apply(lambda x: nltk.word_tokenize(x))
print(df['comment_text'][6])

# Removing Stop Words
stop_words = set(stopwords.words('english'))
df['comment_text'] = df['comment_text'].apply(lambda x: [word for word in x if word not in stop_words])
print(df['comment_text'][6])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['comment_text'] = df['comment_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df['comment_text'][6])

# Rejoin tokens into strings (for context for tfidf)
df['comment_text'] = df['comment_text'].apply(lambda x: ' '.join(x))
print(df['comment_text'][6])

# Create Vectorizer and transform data
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorized_data = tfidf_vectorizer.fit_transform(df['comment_text'])
print(tfidf_vectorized_data)

# Example output
df['comment_text'].head()

COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
cocksucker before you piss around on my work
['cocksucker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work']
['cocksucker', 'piss', 'around', 'work']
['cocksucker', 'piss', 'around', 'work']
cocksucker piss around work
  (0, 70116)	0.36841989661122454
  (0, 7999)	0.4185522667627553
  (0, 48639)	0.5495493060800604
  (0, 15327)	0.6221545949003817
  (1, 68179)	0.13181942649756834
  (1, 44648)	0.2143077426641946
  (1, 34730)	0.11974909565530292
  (1, 10210)	0.15737065292629002
  (1, 15029)	0.1731964399550588
  (1, 57549)	0.26720643074482986
  (1, 8313)	0.1263563402492379
  (1, 44444)	0.2777365316772076
  (1, 19537)	0.2154621738029848
  (1, 7343)	0.24173387879126715
  (1, 4859)	0.2597352052342374
  (1, 51309)	0.11509816620524554
  (1, 8322)	0.21214228689383513
  (1, 45864)	0.08340080268346349
  (1, 27117)	0.18940599224571808
  (1, 51004)	0.24920510430185966
  (1, 55958)	0.23346350024595497
  (1, 19533)	0.2019607874960612
  (1, 28120)	0.1013

6                           cocksucker piss around work
12    hey talk exclusive group wp talibanswho good d...
16         bye dont look come think comming back tosser
42    gay antisemmitian archangel white tiger meow g...
43                            fuck filthy mother as dry
Name: comment_text, dtype: object

In [6]:
''' 
Train and Val split
'''

# Prepare the target variable
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = df[label_columns]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectorized_data, y, test_size=0.2)

In [7]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)



# Train a logistic regression model for each label
param_grid = {
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'max_iter': [1000]
}


models = {}
for column in y_train.columns:
    # Initialize the grid search model
    grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, n_jobs=-3)  
    grid_search.fit(X_train, y_train[column])
    models[column] = grid_search
    print("Best parameters:", grid_search.best_params_)

# Make predictions and evaluate each model
for label, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test[label], y_pred)
    print(f"Accuracy for {label}: {accuracy:.4f}")

NameError: name 'ConvergenceWarning' is not defined

In [None]:
for label, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test[label], y_pred)
    print(f"Accuracy for {label}: {accuracy:.4f}")
    print(f'Best model params {model.best_params_}')
    
''' 
Accuracy for toxic: 0.8749
Best model params {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Accuracy for severe_toxic: 0.9536
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy for obscene: 0.8992
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy for threat: 0.9875
Best model params {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy for insult: 0.8613
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy for identity_hate: 0.9635
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}'''

Accuracy for toxic: 0.8749
Best model params {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Accuracy for severe_toxic: 0.9536
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy for obscene: 0.8992
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy for threat: 0.9875
Best model params {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy for insult: 0.8613
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy for identity_hate: 0.9635
Best model params {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
