In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the labelled training set and the unlabelled set to score.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (malformed rows are dropped
# and a warning is emitted for each one). Requires pandas >= 1.3.
train = pd.read_csv('train_data.csv', on_bad_lines='warn')
test = pd.read_csv('df.csv', on_bad_lines='warn')

In [3]:
# Relative frequency of each value in the `label` column.
label_share = train['label'].value_counts(normalize=True)
label_share

0    0.52987
1    0.47013
Name: label, dtype: float64

In [4]:
# Relative frequency of each value in the `language` column.
language_share = train['language'].value_counts(normalize=True)
language_share

Hindi         0.461896
Telugu        0.145873
Marathi       0.108330
Tamil         0.104500
Malayalam     0.061598
Bengali       0.034336
Kannada       0.020966
Odia          0.016501
Gujarati      0.013274
Haryanvi      0.013250
Bhojpuri      0.008727
Rajasthani    0.006568
Assamese      0.004180
Name: language, dtype: float64

In [5]:
# Character length of every comment, then its mean / standard deviation / max.
lens = train['commentText'].str.len()
(lens.mean(), lens.std(), lens.max())

(70.200833932293, 92.01143523853266, 13829)

In [6]:
# Target column(s) to model; the training loop fits one model per entry.
label_cols = ['label']

In [7]:
# Row counts of the training and test frames.
train.shape[0], test.shape[0]

(665042, 128337)

In [8]:
# Replace missing comments with a placeholder token so the vectoriser only
# ever sees strings.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# selected column: that chained in-place form raises a FutureWarning in
# pandas 2.x and silently does nothing once Copy-on-Write is enabled.
train['commentText'] = train['commentText'].fillna("unknown")
test['commentText'] = test['commentText'].fillna("unknown")

In [9]:
import re, string
from indicnlp.tokenize import indic_tokenize
def tokenize(s):
    """Split the string ``s`` into tokens.

    Thin wrapper that delegates to ``indic_tokenize.trivial_tokenize`` and
    returns whatever that call returns.
    """
    tokens = indic_tokenize.trivial_tokenize(s)
    return tokens

In [10]:
n = train.shape[0]  # number of training rows (not referenced in the cells visible here)

# Unigram + bigram TF-IDF features using the Indic tokenizer.
# `use_idf`, `smooth_idf` and `sublinear_tf` are boolean parameters; pass real
# booleans rather than the integer 1, which recent scikit-learn parameter
# validation rejects.
# NOTE(review): strip_accents='unicode' removes Unicode combining characters;
# confirm this is intended for Indic-script text.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                      strip_accents='unicode', use_idf=True,
                      smooth_idf=True, sublinear_tf=True)

# Learn the vocabulary / IDF weights on the training comments only, then apply
# the same fitted transform to the test comments.
trn_term_doc = vec.fit_transform(train['commentText'])
test_term_doc = vec.transform(test['commentText'])

In [11]:
# Display the train / test document-term matrices (shape and stored-element summary).
trn_term_doc, test_term_doc

(<665042x4978822 sparse matrix of type '<class 'numpy.float64'>'
 	with 15402533 stored elements in Compressed Sparse Row format>,
 <128337x4978822 sparse matrix of type '<class 'numpy.float64'>'
 	with 1404991 stored elements in Compressed Sparse Row format>)

In [12]:
def pr(y_i, y, feats=None):
    """Smoothed per-feature total for the rows whose label equals ``y_i``.

    Parameters
    ----------
    y_i : label value to select.
    y : array of labels, one per row of the feature matrix.
    feats : optional feature matrix (rows = samples). When omitted, the
        notebook-level global ``x`` is used, which matches the original
        behaviour; passing it explicitly removes the hidden dependency on
        cell execution order.

    Returns
    -------
    ``(column-wise sum over the selected rows + 1) / (number of selected rows + 1)``.
    The +1 terms keep the result non-zero for features absent from the class.
    """
    if feats is None:
        feats = x  # fall back to the global training matrix
    mask = y == y_i
    p = feats[mask].sum(0)
    return (p + 1) / (mask.sum() + 1)

In [13]:
# Short aliases for the train / test feature matrices; `pr` and `get_mdl`
# read `x` as a global.
x, test_x = trn_term_doc, test_term_doc

In [14]:
def get_mdl(y):
    """Fit a logistic regression on ratio-scaled features for one label.

    ``y`` must expose ``.values`` (e.g. a pandas Series of labels). The global
    feature matrix ``x`` is multiplied element-wise by the log of
    ``pr(1, y) / pr(0, y)`` before fitting.

    Returns a tuple ``(fitted_model, log_ratio)``; the same ``log_ratio`` must
    be applied to any matrix passed to the model at prediction time.
    """
    targets = y.values
    # Per-feature log-ratio of the smoothed class-1 vs class-0 statistics.
    log_ratio = np.log(pr(1, targets) / pr(0, targets))
    scaled_features = x.multiply(log_ratio)
    clf = LogisticRegression(C=2, dual=False, solver='liblinear')
    clf.fit(scaled_features, targets)
    return clf, log_ratio

In [15]:
# One prediction column per label. The dtype is taken from the label columns
# so the predicted class labels are stored unchanged (e.g. 0/1 stay integers).
preds = np.zeros((len(test), len(label_cols)), dtype=train[label_cols].values.dtype)

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train[j])
    # Write into column i rather than rebinding `preds`: rebinding discarded
    # the preallocated array and would keep only the last label's predictions.
    preds[:, i] = m.predict(test_x.multiply(r))

fit label


In [16]:
# Assemble the submission: test ids alongside the predicted labels, aligned on
# the row index, then write it out without the index column.
submid = test["CommentId"].to_frame(name='CommentId')
expected = pd.DataFrame(preds, columns=['Expected'])
submission = pd.concat([submid, expected], axis=1)

submission.to_csv('submission.csv', index=False)