*Baseline model using Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
def preprocess(data):
    '''
    Replace punctuation and special symbols in a Series with spaces.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Parameters
    ----------
    data : pandas.Series
        Values to clean. Every element is cast to ``str`` first.

    Returns
    -------
    pandas.Series
        A Series in which every character listed in ``punct`` has been
        replaced by one space. Runs of spaces are not collapsed and the
        text is not stripped or lower-cased.
    '''
    # The backslash in front of the multiplication sign is doubled on purpose:
    # a single backslash there is an invalid escape sequence, which newer
    # Python versions warn about. The resulting characters are unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # One translation table lets each string be cleaned in a single pass
    # instead of one str.replace scan per symbol.
    space_table = str.maketrans({symbol: ' ' for symbol in punct})

    return data.astype(str).apply(lambda text: text.translate(space_table))

In [3]:
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# Only the free-text column is cleaned. The numeric `target` column must not
# go through preprocess(): it would be cast to str and have "." replaced by a
# space. y_train is built directly from train['target'] in a later cell.
x_train = preprocess(train["comment_text"])


In [4]:
# Word-level TF-IDF over unigrams. max_features is left unset, so the
# vocabulary size is not capped.
word_vectorizer = TfidfVectorizer(
    # Tokenization: runs of one or more word characters, unigrams only.
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    # Normalization and filtering applied to the text.
    strip_accents='unicode',
    stop_words='english',
    # Sublinear term-frequency scaling.
    sublinear_tf=True,
)

In [5]:
# Clean the test comments with the same routine used for the training text.
x_test = preprocess(test["comment_text"])

# The vectorizer is fitted on the train and test comments stacked together.
text = pd.concat(objs=[x_train, x_test], axis=0)
word_vectorizer.fit(text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [6]:
# Turn the cleaned comments into TF-IDF features with the fitted vectorizer.
# Both names are rebound from text to feature matrices here, so the earlier
# text-producing cells must be re-run before this cell is executed again.
x_train, x_test = map(word_vectorizer.transform, (x_train, x_test))

In [7]:
# Binarize the target: 1 where target >= 0.5, otherwise 0.
y_train = np.where(train['target'].ge(0.5), 1, 0)

In [8]:
# C is the inverse of the regularization strength, so C=0.2 regularizes more
# strongly than scikit-learn's default of C=1.0.
# The 'sag' solver shuffles the data while optimizing; random_state is pinned
# so the cross-validation score and the fitted model are reproducible.
clf = LogisticRegression(C=0.2, solver='sag', max_iter=200, random_state=42)

# Mean ROC AUC over 5 cross-validation folds.
cv_score = np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc'))

In [9]:
# Fit on the full training set, then keep the second probability column
# (index 1) of the test-set predictions.
predictions = clf.fit(x_train, y_train).predict_proba(x_test)[:, 1]

In [10]:
# Score of the fitted model on the data it was trained on (for a scikit-learn
# classifier this is mean accuracy); it is not a held-out estimate.
clf.score(x_train, y_train)

0.9456671213613803

In [11]:
# Report cv_score, the mean of the 5-fold cross-validated ROC AUC values.
print('CV score is {}'.format(cv_score))

CV score is 0.9443779727528769


*We can see that removing the vocabulary size limit improves the accuracy.

In [12]:
# Two-column submission file: the test row id and the predicted probability.
submission = pd.DataFrame({'id': test['id'], 'prediction': predictions})
submission.to_csv('submission.csv', index=False)

*The sample code is from this kernel [https://www.kaggle.com/tyagit3/logistic-regression-with-tfidf-word-level]