In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import zipfile
import os
import plotly.express as px
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import umap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from joblib import dump


In [2]:
# Download the WordNet corpus archive into a writable directory and make that
# directory visible to NLTK's resource lookup.
nltk.download('wordnet', '/kaggle/output')
wordnet_zip_path = '/kaggle/output/corpora/wordnet.zip'

wordnet_extract_path = '/kaggle/output/corpora/'

# Register the download directory only once, so re-running this cell does not
# keep appending duplicate entries to NLTK's search path.
if '/kaggle/output' not in nltk.data.path:
    nltk.data.path.append('/kaggle/output')

# extractall() unpacks the archive directly under wordnet_extract_path, which
# already ends in 'corpora/'. The guard therefore has to look for
# <wordnet_extract_path>/wordnet; joining an additional 'corpora' component
# tested a directory that is never created, so the archive was re-extracted on
# every run.
if not os.path.exists(os.path.join(wordnet_extract_path, 'wordnet')):
    with zipfile.ZipFile(wordnet_zip_path, 'r') as zip_ref:
        zip_ref.extractall(wordnet_extract_path)

[nltk_data] Downloading package wordnet to /kaggle/output...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Read the training CSV and keep the raw comment strings as an array.
train_dataset = pd.read_csv('/kaggle/input/kma-ml2/kmaml223/train.csv')
train_comments = train_dataset['comment_text'].values
print(f'Train dataset number of samples: {train_comments.shape[0]}')

Train dataset number of samples: 159571


In [4]:
# Read the test CSV and keep the raw comment strings as an array.
test_dataset = pd.read_csv('/kaggle/input/kma-ml2/kmaml223/test.csv')
test_comments = test_dataset['comment_text'].values
print(f'Test dataset number of samples: {test_comments.shape[0]}')

Test dataset number of samples: 63978


### Selecting Dataset Sample

In [5]:
# Train on a fixed-size random subset of the training frame; random_state pins
# the draw so the same rows are selected on every run.
dataset_size = 100000
# NOTE: this rebinds train_dataset to the sample, so the full frame is only
# recoverable by re-running the cell that reads train.csv.
train_dataset = train_dataset.sample(n=dataset_size, random_state=42)
# Re-derive the comments from the sampled frame so they stay row-aligned with
# the labels extracted on the next line.
train_comments = train_dataset['comment_text'].values
# Every column from position 2 onward is taken as a label column.
# NOTE(review): presumably columns 0 and 1 are 'id' and 'comment_text' — confirm against train.csv.
train_labels = train_dataset.iloc[:, 2:].to_numpy()

### Cleaning

In [6]:
def lemmatize_text(text):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    The text is split with ``word_tokenize`` and each token is passed to
    ``WordNetLemmatizer.lemmatize`` using its default arguments.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(map(wordnet.lemmatize, word_tokenize(text)))

def clean_comments(comments):
    """Return a list of the comments with ASCII punctuation removed and lower-cased.

    Every character in ``string.punctuation`` is deleted outright (it is not
    replaced by a space); the remaining text is then lower-cased.
    """
    # Mapping each punctuation character to None makes translate() drop it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = []
    for raw in comments:
        cleaned.append(raw.translate(drop_punctuation).lower())
    return cleaned

def preprocess_comments(comments):
    """Clean the comments with ``clean_comments``, then lemmatize each one.

    Returns a list holding one processed string per input comment, in the
    same order as the input.
    """
    return list(map(lemmatize_text, clean_comments(comments)))


In [7]:
# Run both splits through the same cleaning + lemmatization pipeline so the
# vectorizer sees identically normalized text for training and test comments.
train_comments_preproc = preprocess_comments(train_comments)
test_comments_preproc = preprocess_comments(test_comments)

### Vectorizing: TfidfVectorizer

In [8]:
# TF-IDF features capped at a 5000-term vocabulary, with accents stripped and
# the built-in English stop-word list removed.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    strip_accents='unicode',
    stop_words='english',
)

# The vocabulary is fitted on the training comments only; the matrix is
# converted to a dense array, which is what the PCA cells consume.
train_feature_vectors_tfidf = tfidf_vectorizer.fit_transform(train_comments_preproc).toarray()
print(train_feature_vectors_tfidf.shape[0])

# The test comments are only transformed with the already-fitted vectorizer.
test_feature_vectors_tfidf = tfidf_vectorizer.transform(test_comments_preproc).toarray()

100000


### Classification

In [9]:
def get_classifier(data, train_labels):
    """Fit a multi-output logistic-regression classifier and print its accuracy.

    The inputs are split 80/20 with a fixed seed. A ``MultiOutputClassifier``
    wrapping ``LogisticRegression(C=10, max_iter=100000)`` is fitted on the
    80% part, and ``accuracy_score`` is printed for both parts.

    Parameters:
        data: feature matrix, one row per sample.
        train_labels: label matrix row-aligned with ``data``.

    Returns:
        The fitted classifier. It has been trained on the 80% part only, not
        on all of ``data``.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        data, train_labels, test_size=0.2, random_state=42
    )

    base_estimator = LogisticRegression(C=10, max_iter=100000)
    classifier = MultiOutputClassifier(base_estimator)
    classifier.fit(X_fit, y_fit)

    holdout_accuracy = accuracy_score(y_holdout, classifier.predict(X_holdout))
    fit_accuracy = accuracy_score(y_fit, classifier.predict(X_fit))

    print(f"Accuracy train: {fit_accuracy}")
    print(f"Accuracy test: {holdout_accuracy}\n")
    return classifier

### Dimensionality reduction: PCA explained variance 0.8

In [10]:
# A fractional n_components asks PCA to choose the component count from the
# explained-variance target (0.8 here) instead of a fixed number.
pca2 = PCA(n_components=0.8)
reduced_train2_pca = pca2.fit_transform(train_feature_vectors_tfidf)
# The test features are only projected onto the components fitted on train.
reduced_test2_pca = pca2.transform(test_feature_vectors_tfidf)
print(reduced_train2_pca.shape, reduced_test2_pca.shape, sep='\n')

classifier2 = get_classifier(reduced_train2_pca, train_labels)

(100000, 2036)
(63978, 2036)
Accuracy train: 0.927125
Accuracy test: 0.9144



### Dimensionality reduction: PCA explained variance 0.9

In [11]:
# A fractional n_components asks PCA to choose the component count from the
# explained-variance target (0.9 here) instead of a fixed number.
pca3 = PCA(n_components=0.9)
reduced_train3_pca = pca3.fit_transform(train_feature_vectors_tfidf)
# The test features are only projected onto the components fitted on train.
reduced_test3_pca = pca3.transform(test_feature_vectors_tfidf)
print(reduced_train3_pca.shape, reduced_test3_pca.shape, sep='\n')

classifier3 = get_classifier(reduced_train3_pca, train_labels)

(100000, 3023)
(63978, 3023)
Accuracy train: 0.9306875
Accuracy test: 0.91365



### Dimensionality reduction: PCA explained variance 0.95

In [12]:
# A fractional n_components asks PCA to choose the component count from the
# explained-variance target (0.95 here) instead of a fixed number.
pca4 = PCA(n_components=0.95)
reduced_train4_pca = pca4.fit_transform(train_feature_vectors_tfidf)
# The test features are only projected onto the components fitted on train.
reduced_test4_pca = pca4.transform(test_feature_vectors_tfidf)
print(reduced_train4_pca.shape, reduced_test4_pca.shape, sep='\n')

classifier4 = get_classifier(reduced_train4_pca, train_labels)

(100000, 3752)
(63978, 3752)
Accuracy train: 0.9325625
Accuracy test: 0.9143



In [10]:
# Dimensionality reduction: PCA explained variance 0.99.
# A fractional n_components asks PCA to choose the component count from the
# explained-variance target instead of a fixed number.
pca5 = PCA(n_components=0.99)
reduced_train5_pca = pca5.fit_transform(train_feature_vectors_tfidf)
# The test features are only projected onto the components fitted on train.
reduced_test5_pca = pca5.transform(test_feature_vectors_tfidf)
print(reduced_train5_pca.shape, reduced_test5_pca.shape, sep='\n')

classifier5 = get_classifier(reduced_train5_pca, train_labels)

(100000, 4591)
(63978, 4591)
Accuracy train: 0.9342125
Accuracy test: 0.9144



### Classification of test data

In [11]:
# Use the model trained on the 0.99-explained-variance PCA features (classifier5)
# for the final predictions.
classifier = classifier5

In [12]:
# Predict on the test features projected with pca5, the same projection that
# produced the features classifier5 was fitted on.
test_pred = classifier.predict(reduced_test5_pca)

In [13]:
# Column order of the submission file: the sample id followed by six label columns.
columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Test-set ids, taken from the same frame (and in the same row order) as test_comments.
ids = test_dataset['id'].values

In [14]:
# Put the id column in front of the prediction matrix and convert it to a list
# of rows, one [id, label, ...] row per test sample.
result = np.column_stack((ids, test_pred)).tolist()

In [15]:
# Build the submission table from the prepared rows, using the column order defined in `columns`.
df = pd.DataFrame(result, columns=columns)

# index=False keeps the pandas row index out of the written CSV.
df.to_csv('sample_submission.csv', index=False)

print(df)

                     id  toxic  severe_toxic  obscene  threat  insult  \
0      0001ea8717f6de06      0             0        0       0       0   
1      000247e83dcc1211      1             0        0       0       0   
2      0002f87b16116a7f      0             0        0       0       0   
3      0003e1cccfd5a40a      0             0        0       0       0   
4      00059ace3e3e9a53      0             0        0       0       0   
...                 ...    ...           ...      ...     ...     ...   
63973  fff8f64043129fa2      0             0        0       0       0   
63974  fff9d70fe0722906      1             0        1       0       1   
63975  fffa8a11c4378854      1             0        0       0       0   
63976  fffac2a094c8e0e2      1             1        1       0       1   
63977  fffb5451268fb5ba      0             0        0       0       0   

       identity_hate  
0                  0  
1                  0  
2                  0  
3                  0  
4       

In [16]:
# Persist the three fitted objects that make up the prediction chain:
# tfidf_vectorizer -> pca5 -> classifier. The vectorizer was fitted on text that
# had already gone through preprocess_comments, so new text needs that step first.
dump(classifier, "classifier.joblib")
dump(tfidf_vectorizer, "vectorizer.joblib")
dump(pca5, 'pca_model.joblib')

['pca_model.joblib']