In [1]:
from __future__ import division

import base64
import csv
import gzip
import zlib
import logging
import re
import urllib
import matplotlib.pyplot as plt
import numpy as np

from HTMLParser import HTMLParser
from collections import namedtuple, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import stats

%matplotlib inline

In [2]:
TRACE_NUM = 1000
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

def trace(items_num, trace_num=TRACE_NUM):
    if items_num % trace_num == 0: logging.info("Complete items %05d" % items_num)

In [3]:
def to_utf8(text):
    if isinstance(text, unicode): text = text.encode('utf8')
    return text

def convert2unicode(f):
    def tmp(text):
        if not isinstance(text, unicode): text = text.decode('utf8')
        return f(text)
    return tmp

def convert2lower(f):
    def tmp(text):        
        return f(text.lower())
    return tmp

In [4]:
class TextHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self._text = []
        self._title = ""
        self._in_title = False
        self._title_text = []
        self._anchor = False
        self._anchor_text = []
        self._head = False
        self._head_text = []
        self._table = False
        self._table_text = []
        self._img_count = 0
        self._anchor_count = 0
        self._meta = []
        self._link_count = 0
        self._li_count = 0
        self._style_count = 0
        self._script_count = 0
        self._div_count = 0
        self._iframe_count = 0
        self._tag_count = 0
        self._anchor_data = []
        self._h1_count = 0
        self._h2_count = 0
        self._h3_count = 0
        

    def handle_data(self, data):
        text = data.strip()
        if len(text) > 0:
            text = re.sub('[ \t\r\n]+', ' ', text)
            self._text.append(text + ' ')
            if self._in_title:
                self._in_title = False
                self._title_text.append(text + ' ')
            elif self._anchor:
                self._anchor = False
                self._anchor_text.append(text + ' ')
            elif self._head:
                self._head = False
                self._head_text.append(text + ' ')
            elif self._table:
                self._table = False
                self._table_text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        self._tag_count += 1
        if tag == 'p':
            self._text.append('\n\n')
        elif tag == 'br':
            self._text.append('\n')
        elif tag == 'title':
            self._in_title = True
        elif tag == 'a':
            self._anchor = True
            self._anchor_count += 1
            self._anchor_data.append(dict(attrs))
        elif tag == 'head':
            self._head = True
        elif tag == 'table':
            self._table = True
        elif tag == 'img':
            self._img_count += 1
        elif tag == 'meta':
            self._meta.append(dict(attrs))
        elif tag == 'link':
            self._link_count += 1
        elif tag == 'li':
            self._li_count += 1
        elif tag == 'style':
            self._style_count += 1
        elif tag == 'div':
            self._div_count += 1
        elif tag == 'script':
            self._script_count += 1
        elif tag == 'iframe':
            self._iframe_count += 1
        elif tag == 'h1':
            self._h1_count += 1
        elif tag == 'h2':
            self._h2_count += 1
        elif tag == 'h3':
            self._h3_count += 1
            
    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self._text.append('\n\n')

    def text(self):
        return ''.join(self._text).strip()
    
    def title_text(self):
        return ''.join(self._title_text).strip()
    
    def anchor_text(self):
        return ''.join(self._anchor_text).strip()
    
    def head_text(self):
        return ''.join(self._head_text).strip()
    
    def table_text(self):
        return ''.join(self._table_text).strip()
    
    def meta(self):
        return [self._meta, self._anchor_data]
    
    def tag_counts(self):
        return [self._anchor_count, self._img_count, self._link_count, self._li_count,
                self._style_count, self._iframe_count, self._div_count, self._script_count,
                self._h1_count, self._h2_count, self._h3_count, self._tag_count]
    
    

@convert2unicode
def html2text_parser(text):
    parser = TextHTMLParser()
    parser.feed(text)
    return [parser.text(), parser.title_text(), parser.anchor_text(), parser.head_text(), 
            parser.table_text()] + parser.meta() + parser.tag_counts()

In [5]:
html2text = html2text_parser

In [6]:
@convert2lower
@convert2unicode
def easy_tokenizer(text):
    word = unicode()
    for symbol in text:
        if symbol.isalnum(): word += symbol
        elif word:
            yield word
            word = unicode()
    if word: yield word


In [7]:
def html2word(raw_html, to_text=html2text, tokenizer=easy_tokenizer):
    text, title_text, anchor_text, head_text, table_text, meta, anchor_data, anchor, img, \
    link, li, style, iframe, div, script, h1, h2, h3, tag = to_text(raw_html)
    return [list(tokenizer(text.lower())), list(tokenizer(title_text.lower())), list(tokenizer(anchor_text.lower())), 
            list(tokenizer(head_text.lower())), list(tokenizer(table_text.lower())), 
            meta, anchor_data, anchor, img, link, li, style, iframe, div, script, h1, h2, h3, tag]

In [8]:
spam_dict = Counter()
not_spam_dict = Counter()
spam_url = Counter()
not_spam_url = Counter()
Docs_train = []
Docs_test = []

In [9]:
import nltk
from nltk.corpus import stopwords
stemmer = nltk.stem.snowball.RussianStemmer()
stop_words = stopwords.words('russian')

In [10]:
def load_docs_train(html_data, mark):
    packed = html2word(html_data)
    words = list(packed[0])
    anchor_data = packed[6]
    
    text = ' '.join(words)
    text = text.encode('utf-8')
    Docs_train.append(text)
    link_titles = []
    for anchor_dict in anchor_data:
        if 'href' in anchor_dict:
            url = anchor_dict['href']
        if 'title' in anchor_dict and anchor_dict['title'] is not None:
            link_titles.extend(filter(len, anchor_dict['title'].split()))
    
    if mark:
        for word in words:
            if word not in stop_words:
                spam_dict[word] += 1
        for word in link_titles:
            spam_url[word] += 1
    else:
        for word in words:
            if word not in stop_words:
                not_spam_dict[word] += 1
        for word in link_titles:
            not_spam_url[word] += 1
    return

In [11]:
def load_docs_test(html_data, mark=None):
    packed = html2word(html_data)
    words = list(packed[0])
    text = ' '.join(words)
    text = text.encode('utf-8')
    Docs_test.append(text)
    return

In [12]:
def csv_to_dicts(input_file_name, func):    
    with gzip.open(input_file_name) if input_file_name.endswith('gz') else open(input_file_name)  as input_file:            
        headers = input_file.readline()
        
        for i, line in enumerate(input_file):
            trace(i)
            parts = line.strip().split('\t')
            url_id = int(parts[0])                                        
            if int(parts[1]) == -1:
                mark = None
            else:
                mark = bool(int(parts[1]))                    
            url = parts[2]
            pageInb64 = parts[3]
            html_data = base64.b64decode(pageInb64)
            func(html_data, mark)          
                
        trace(i, 1)        

In [15]:
%%time
TRAIN_DATA_FILE  = 'kaggle_train_data_tab.csv.gz'
csv_to_dicts(TRAIN_DATA_FILE, load_docs_train)

15:28:07 INFO:Complete items 00000
15:28:40 INFO:Complete items 01000
15:29:07 INFO:Complete items 02000
15:29:36 INFO:Complete items 03000
15:30:04 INFO:Complete items 04000
15:30:34 INFO:Complete items 05000
15:31:02 INFO:Complete items 06000
15:31:33 INFO:Complete items 07000
15:31:34 INFO:Complete items 07043


CPU times: user 3min 26s, sys: 640 ms, total: 3min 27s
Wall time: 3min 26s


In [20]:
%%time
TEST_DATA_FILE  = 'kaggle_test_data_tab.csv.gz'
csv_to_dicts(TEST_DATA_FILE, load_docs_test)

15:35:58 INFO:Complete items 00000
15:36:28 INFO:Complete items 01000
15:36:55 INFO:Complete items 02000
15:37:19 INFO:Complete items 03000
15:37:42 INFO:Complete items 04000
15:38:03 INFO:Complete items 05000
15:38:29 INFO:Complete items 06000
15:38:51 INFO:Complete items 07000
15:39:13 INFO:Complete items 08000
15:39:37 INFO:Complete items 09000
15:40:08 INFO:Complete items 10000
15:40:30 INFO:Complete items 11000
15:40:52 INFO:Complete items 12000
15:41:14 INFO:Complete items 13000
15:41:38 INFO:Complete items 14000
15:42:02 INFO:Complete items 15000
15:42:29 INFO:Complete items 16000
15:42:30 INFO:Complete items 16038


CPU times: user 6min 32s, sys: 1.23 s, total: 6min 33s
Wall time: 6min 31s


In [13]:
print len(spam_dict)
print len(not_spam_dict)
print len(spam_url)
print len(Docs_train)
print len(Docs_test)

0
0
0
0
0


In [14]:
import pickle

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [23]:
save_obj(spam_dict, "spam_dict")
save_obj(not_spam_dict, "not_spam_dict")
save_obj(spam_url, "spam_url")
save_obj(not_spam_url, "not_spam_url")
save_obj(Docs_train, "Docs_train")
save_obj(Docs_test, "Docs_test")

In [15]:
spam_dict = load_obj("spam_dict")
not_spam_dict = load_obj("not_spam_dict")
spam_url = load_obj("spam_url")
not_spam_url = load_obj("not_spam_url")
Docs_train = load_obj("Docs_train")
Docs_test = load_obj("Docs_test")

In [16]:
print len(spam_dict)
print len(not_spam_dict)
print len(spam_url)
print len(Docs_train)
print len(Docs_test)

350855
448424
9302
7044
16039


In [17]:
avg_l = np.mean(map(len, Docs_train + Docs_test))
print avg_l

21450.5082961


In [18]:
most_common_spam_200 = dict(spam_dict.most_common(200))
most_common_not_spam_200 = dict(not_spam_dict.most_common(200))
most_common_spam = dict()
for key in most_common_spam_200:
    if key not in stop_words and key not in most_common_not_spam_200:
        most_common_spam[stemmer.stem(key)] = most_common_spam_200[key] / len(Docs_train)
print len(most_common_spam)

79


In [20]:
vectorizer = TfidfVectorizer(stop_words=stop_words)
vectorizer.fit(Docs_train + Docs_test)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=[u'\u0438', u'\u0432', u'\u0432\u043e', u'\u043d\u0435', u'\u0447\u0442\u043e', u'\u043e\u043d', u'\u043d\u0430', u'\u044f', u'\u0441', u'\u0441\u043e', u'\u043a\u0430\u043a', u'\u0430', u'\u0442\u043e', u'\u0432\u0441\u0435', u'\u043e\u043d\u0430', u'\u0442\u0430\u043a', u'\u0435\u0433\u...043a\u043e\u043d\u0435\u0447\u043d\u043e', u'\u0432\u0441\u044e', u'\u043c\u0435\u0436\u0434\u0443'],
        strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
def safe_divide(a, b):
    if a == 0: return 0.0
    elif b == 0: return 0.0
    else: return a/b

In [22]:
def url_features(url):
    size = 6
    Features = [0] * size
    if not isinstance(url, str):
        return Features
    url = re.sub(r".+://", r"", url)
    url = re.sub(r"/$", r"", url)
    url = re.sub(r"\n", r"", url)
    lst_split = re.split(r"\?" , urllib.unquote(unicode(url)))
    #убираем имя хоста
    path = re.split(r"/" , lst_split[0], maxsplit=1)
    params = []
    if len(lst_split) == 2:
        params = re.split(r"&" , lst_split[1])
    idx = 0
    Features[idx] = len(params)
    idx += 1
    if len(path) == 2:
        path = path[1]
        Features[idx] = len(re.findall(r'/', path)) + 1
        idx += 1
        segments = re.split(r"/", path)
        for segment in segments:
            Features[idx] = len(segment)
            idx += 1
            ext = re.search(r"(\.)([^\.]+$)" , segment)
            Features[idx] = int(bool(ext))
            idx += 1
            if idx == len(Features):
                break
    
    return Features

In [23]:
#здесь может получиться много нулевых фичей, но случайный лес позаботится об отборе нужных фичей
def statistics(sample):
    size = len(sample)
    if size == 0:
        return [0.0] * 17
    mean = np.mean(sample)
    maximum = max(sample)
    minimum = min(sample)
    _range = maximum - minimum
    deviation = np.std(sample)
    median = np.median(sample)
    iqr = stats.iqr(sample)
    mode = stats.mode(sample)[0][0]
    mode_amplitude = safe_divide(sample.count(mode), size * 100.0)
    volt_index = safe_divide(mode_amplitude, 2.0 * mode * _range)
    veg_bal_ind = safe_divide(mode_amplitude, _range)
    veg_rithm = safe_divide(1.0, mode * _range)
    adequacy = safe_divide(mode_amplitude, mode)
    hmean = stats.hmean(filter(lambda x: x > 0.0, sample))
    gmean = stats.gmean(sample)
    kurtosis = stats.kurtosis(sample)
    skew = stats.skew(sample)
    
    return [mean, maximum, minimum, _range, deviation, median, iqr, mode, mode_amplitude,
            volt_index, veg_bal_ind, veg_rithm, adequacy, hmean, gmean, kurtosis, skew]

In [24]:
def calc_features(url, html_data, mark=None):
    words, title_words, anchor_words, head_words, table_words, meta, anchor_data, anchor_count, \
    img_count, link_count, li_count, style_count, iframe_count, div_count, script_count, \
    h1_count, h2_count, h3_count, tag_count = html2word(html_data)
    
    words_num = len(words)
    title_words_num = len(title_words)
    anchor_words_num = len(anchor_words)
    head_words_num = len(head_words)
    table_words_num = len(table_words)
    meta_count = len(meta)
    
    words_len = map(len, words)
    title_words_len = map(len, title_words)
    anchor_words_len = map(len, anchor_words)
    head_words_len = map(len, head_words)
    table_words_len = map(len, table_words)
    
    stats = (statistics(words_len) + statistics(title_words_len) + statistics(anchor_words_len)
             + statistics(head_words_len) + statistics(table_words_len))
    
    compression_level = 0
    text = ' '.join(words)
    text = text.encode('utf-8')
    compressed = zlib.compress(text)
    compression_level = 1.0 - safe_divide(len(compressed), len(text))
    
    keywords = []
    for meta_dict in meta:
        if u'name' in meta_dict and meta_dict[u'name'] == 'keywords':
            if u'content' in meta_dict:
                keywords = filter(len, re.split(r",| ", meta_dict[u'content'].lower()))
        elif u'name' in meta_dict and meta_dict[u'name'] == 'description':
            if u'content' in meta_dict:
                keywords = filter(len, re.split(r",| ", meta_dict[u'content'].lower()))
    
    keywords_freq = [0.0]* len(keywords)
    keywords_ratio = [0.0]* len(keywords)
    keywords_w = [0.0]* len(keywords)
    k1 = 0.0
    b = 0.0
    for idx, keyword in enumerate(keywords):
        keywords_freq[idx] = words.count(keyword)
        if keyword in vectorizer.vocabulary_:
            keywords_w[idx] = (k1 + 1) * keywords_freq[idx] / (k1 + ((1 - b) + b * (len(text)/avg_l))) \
             * vectorizer.idf_[vectorizer.vocabulary_[keyword]]
            keywords_ratio[idx] = keywords_w[idx] / words_num
       
    stats.extend(statistics(keywords_freq))
    stats.extend(statistics(keywords_ratio)) 
    stats.extend(statistics(keywords_w))
    
    words_freq = Counter()
    for word in words:
        words_freq[word] += 1
    stats.extend(statistics(words_freq.values()))
    
    link_features = []
    link_titles = []
    for anchor_dict in anchor_data:
        if 'href' in anchor_dict:
            url = anchor_dict['href']
            link_features.append(url_features(url))
        if 'title' in anchor_dict and anchor_dict['title'] is not None:
            link_titles.extend(filter(len, anchor_dict['title'].split()))
    if link_features:
        stats.extend(statistics(np.mean(np.array(link_features), axis=0).tolist()))
    else:
        stats.extend(statistics([]))
    
    bayes = 0
    bayes_url = 0
    most_common_counter = {key:0 for key in most_common_spam}
    
    for word in words:
        stemmed = stemmer.stem(word)
        if stemmed in most_common_counter:
            most_common_counter[stemmed] += 1
        
        if word in spam_dict and word in not_spam_dict:
            bayes += np.log(spam_dict[word] / not_spam_dict[word])
        elif word in spam_dict and word not in not_spam_dict:
            bayes += np.log(spam_dict[word])
        elif word not in spam_dict and word in not_spam_dict:
            bayes -= np.log(not_spam_dict[word])
    bayes /= words_num
    
    for word in link_titles:
        if word in spam_url and word in not_spam_url:
            bayes_url += np.log(spam_url[word] / not_spam_url[word])
        elif word in spam_url and word not in not_spam_url:
            bayes_url += np.log(spam_url[word])
        elif word not in spam_url and word in not_spam_url:
            bayes_url -= np.log(not_spam_url[word])
    bayes_url = safe_divide(bayes_url, len(link_titles))
    
    for key in most_common_counter:
        most_common_counter[key] *= most_common_spam[key]
    
    most_common = most_common_counter.values()
    stats.extend(statistics(most_common))
    
    return [words_num, title_words_num, anchor_words_num, head_words_num, table_words_num,
            img_count, anchor_count, meta_count, link_count, li_count, style_count, iframe_count, div_count, script_count,
            compression_level, tag_count, h1_count, h2_count, h3_count,
            safe_divide(words_num, tag_count), safe_divide(tag_count, anchor_count + link_count), 
            safe_divide(words_num, script_count), safe_divide(tag_count, script_count), bayes, bayes_url] \
            + url_features(url) + stats + [sum(most_common)]

In [25]:
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'features'])

def load_csv(input_file_name, calc_features_f):    
    """
    Загружаем данные и извлекаем на лету признаки
    Сам контент не сохраняется, чтобы уменьшить потребление памяти - чтобы
    можно было запускать даже на ноутбуках в классе
    """
    
    with gzip.open(input_file_name) if input_file_name.endswith('gz') else open(input_file_name)  as input_file:            
        headers = input_file.readline()
        
        for i, line in enumerate(input_file):
            trace(i)
            parts = line.strip().split('\t')
            url_id = int(parts[0])                                        
            if int(parts[1]) == -1:
                mark = None
            else:
                mark = bool(int(parts[1]))                    
            url = parts[2]
            pageInb64 = parts[3]
            html_data = base64.b64decode(pageInb64)
            features = calc_features_f(url, html_data, mark) 
            yield DocItem(url_id, mark, url, features)            
                
        trace(i, 1)        

In [26]:
%%time
TRAIN_DATA_FILE  = 'kaggle_train_data_tab.csv.gz'
train_docs = list(load_csv(TRAIN_DATA_FILE, calc_features))
print len(train_docs)

21:03:08 INFO:Complete items 00000
  return size / np.sum(1.0 / a, axis=axis, dtype=dtype)
  log_a = np.log(np.array(a, dtype=dtype))
21:07:26 INFO:Complete items 01000
21:10:47 INFO:Complete items 02000
21:14:34 INFO:Complete items 03000
21:18:25 INFO:Complete items 04000
21:22:27 INFO:Complete items 05000
21:25:30 INFO:Complete items 06000
21:29:25 INFO:Complete items 07000
21:29:32 INFO:Complete items 07043


7044
CPU times: user 26min 25s, sys: 4.59 s, total: 26min 29s
Wall time: 26min 24s


In [27]:
%%time
TEST_DATA_FILE  = 'kaggle_test_data_tab.csv.gz'
test_docs = list(load_csv(TEST_DATA_FILE, calc_features))
print len(test_docs)

21:29:32 INFO:Complete items 00000
21:33:55 INFO:Complete items 01000
21:37:34 INFO:Complete items 02000
21:41:20 INFO:Complete items 03000
21:44:48 INFO:Complete items 04000
21:48:01 INFO:Complete items 05000
21:51:40 INFO:Complete items 06000
21:55:05 INFO:Complete items 07000
21:58:51 INFO:Complete items 08000
22:02:14 INFO:Complete items 09000
22:06:23 INFO:Complete items 10000
22:09:42 INFO:Complete items 11000
22:13:02 INFO:Complete items 12000
22:16:00 INFO:Complete items 13000
22:19:35 INFO:Complete items 14000
22:23:16 INFO:Complete items 15000
22:26:50 INFO:Complete items 16000
22:27:03 INFO:Complete items 16038


16039
CPU times: user 57min 41s, sys: 10.2 s, total: 57min 51s
Wall time: 57min 31s


In [28]:
save_obj(train_docs, "train_docs")
save_obj(test_docs, "test_docs")

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale, normalize
from sklearn.feature_selection import SelectFromModel, SelectKBest
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator
import xgboost as xgb
import lightgbm as lgbm

In [30]:
X_train = []
y_train = []
for doc in train_docs:
    X_train.append(doc.features)
    y_train.append(doc[1])
X_train = np.array(X_train)
y_train = np.array(y_train).astype(int)
print X_train.shape
print y_train.shape

(7044, 219)
(7044,)


In [31]:
X_train = np.nan_to_num(X_train)
X_norm = scale(X_train)

In [79]:
clf1 = RandomForestClassifier(400, n_jobs=-1)
clf2 = xgb.XGBClassifier(n_estimators = 500, n_jobs=-1)
clf3 = lgbm.LGBMClassifier(n_estimators=100, n_jobs=-1)

In [75]:
print np.mean(cross_val_score(clf1, X_train, y_train, cv=7, n_jobs=-1, scoring='f1_weighted'))
print np.mean(cross_val_score(clf2, X_train, y_train, cv=7, n_jobs=-1, scoring='f1_weighted'))
print np.mean(cross_val_score(clf3, X_train, y_train, cv=7, n_jobs=-1, scoring='f1_weighted'))

0.98453069177
0.987508881302


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


0.986087533686


In [80]:
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [81]:
X_test = []
for doc in test_docs:
    X_test.append(doc.features)
X_test = np.array(X_test)
print X_test.shape

(16039, 219)


In [82]:
X_test = np.nan_to_num(X_test)

In [89]:
result1 = clf1.predict(X_test)
result2 = clf2.predict(X_test)
result3 = clf3.predict(X_test)

In [94]:
result = np.zeros(result1.shape[0]).astype(int)

In [95]:
for i in xrange(result1.shape[0]):
    if result1[i] + result2[i] == 2 or result1[i] + result3[i] == 2 or result2[i] + result3[i] == 2:
        result[i] = 1

In [96]:
with open('my_submission.csv' , 'wb') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id','Prediction'])
    for idx, doc in enumerate(test_docs):
        writer.writerow([doc.doc_id, result[idx]])