<a href="https://colab.research.google.com/github/GaoangLiu/ipynb/blob/master/Toxic_Comment_Classification_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is this ?
This notebook is an entry for the Kaggle contest [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge).

The contestants are required to build a multi-headed model that’s capable of **detecting different types of toxicity like threats, obscenity, insults, and identity-based hate better than Perspective’s current models**. You’ll be using a dataset of comments from Wikipedia’s talk page edits. Improvements to the current model will hopefully help online discussion become more productive and respectful.

Reference kernel: [NB-SVM strong linear baseline](https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline)



In [0]:
# Download data
! rm *
! wget -O data.zip bwg.140714.xyz:8000/toxic.zip 
! unzip data.zip 
! unzip train.csv.zip 
! unzip test.csv.zip 
! unzip test_labels.csv.zip 
! unzip sample_submission.csv.zip 
! ls 

Load necessary packages 

In [0]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


Read and process data

In [0]:
# train = pd.read_csv('train.csv')
# test  = pd.read_csv('test.csv')
# sumb  = pd.read_csv('sample_submission.csv')
# train

In [0]:
# label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# train['good'] = 1-train[label_cols].max(axis=1)
# train[train['comment_text'].str.len() < 15]

# re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
# def tokenize(s): 
#   return re_tok.sub(r' \1 ', s).split()

# # for ct in train['comment_text']:
# #   if len(ct) < 15:
# #     print(ct, tokenize(ct))

# # s = '“”¨«»®´·º½¾¿¡§£₤‘’'
# # print(s, tokenize(s))
# n = train.shape[0]
# COMMENT = 'comment_text'
# vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
#                min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
#                smooth_idf=1, sublinear_tf=1 )
# trn_term_doc = vec.fit_transform(train[COMMENT])
# test_term_doc = vec.transform(test[COMMENT])


In [0]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

# Target columns; one binary classifier is trained per column further down.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Missing values are replaced with a single space so every comment is a string
# by the time it reaches the vectorizers.
train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
# Both vectorizers are fitted on the train and test comments together.
all_text = pd.concat([train_text, test_text])

# Word-level TF-IDF: unigrams only, English stop words removed, capped at
# 1000 features.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=1000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Character-level TF-IDF: 2- to 6-character n-grams, capped at 5000 features.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=5000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# Place the character and word features side by side. hstack produces a COO
# matrix unless told otherwise, and COO does not support row slicing, so ask
# for CSR here once instead of leaving the conversion to downstream consumers.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

# Fit one logistic-regression model per label, report its 3-fold
# cross-validated ROC AUC, and collect the positive-class probabilities for
# the test set into the submission frame.

# The 'sag' solver shuffles the data, so pin the seed to make runs repeatable.
RANDOM_STATE = 42

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=RANDOM_STATE)

    # Without an explicit `scoring`, cross_val_score falls back to the
    # estimator's default scorer (accuracy for classifiers). The competition
    # is evaluated on mean column-wise ROC AUC, so score with that instead.
    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set before predicting; column 1 of
    # predict_proba is the probability of the positive class.
    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission.csv', index=False)

In [0]:
# Offer submission.csv as a browser download when running on Google Colab.
# The google.colab package is not importable elsewhere, so guard the import
# to keep "Run All" from failing on a local kernel.
try:
    from google.colab import files
except ImportError:
    print('google.colab is not available; submission.csv was left in the working directory.')
else:
    files.download('submission.csv')