# ** Описание **
Построить графики распределения в спам и не спам множествах следующих признаков:

1	Количество слов на странице

2	Средняя длинна слова

3	Количество слов в заголовке страниц (слова в теге <html><head><title> Some text </title>)

4	Количество слов в анкорах ссылок (<html><body><a> Some text </a>)

5	Коэффициент сжатия

Нужно посчитать статистику минимум по трем признакам и обязательно сделать для 1-го и 2-го признаков

И отправить первое решение в соревнование https://kaggle.com/join/antispam_infopoisk
На основании одного из указанных выше признаков попытаться разделить мн-во, так чтобы score в соревновании был больше 0.55

При выполнении всех этих условия в течении семинара +1 балл к ДЗ

Описание ДЗ и правил выставления за него баллов в https://inclass.kaggle.com/c/antispam-infopoisk  
Срок для ИТМО - 1 неделя  
Срок для Техносферы - 3 недели  

In [1]:
import base64
import zlib
import csv
from collections import namedtuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
TRACE_NUM = 1000
import logging
import importlib
importlib.reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

def trace(items_num, trace_num=TRACE_NUM):
    if items_num % trace_num == 0: logging.info("Complete items %05d" % items_num)

In [3]:
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

ParsingResult = namedtuple('ParsingResult', ['title_text', 'links_text', 'text'])
def parse_html(raw_html):
    soup = BeautifulSoup(raw_html, "html.parser")
    title = "" if soup.title is None or soup.title.string is None else soup.title.string
    links = [link.string for link in soup.findAll('a') if link.string is not None]
    [s.extract() for s in soup(['script', 'style'])]
    full_text = soup.get_text()
    return ParsingResult(title, links, full_text)

In [4]:
def easy_tokenizer(text):
    text = text.lower()
    word = ""
    for symbol in text:
        if symbol.isalnum(): word += symbol
        elif word:
            yield word
            word = ""
    if word: yield word
        
def text2words(text, tokenizer=easy_tokenizer):
    return tokenizer(text.lower())

In [5]:
WordsFeaturesResult = namedtuple('WordsFeaturesResult', ['title_text', 'links_text', 'text_words'])
def get_words_features(url, html_data):
    pr = parse_html(html_data)
    return WordsFeaturesResult(pr.title_text, pr.links_text, list(text2words(pr.text)))

In [6]:
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'features'])

def load_raw_csv(input_file_name, calc_features_f):    
    with open(input_file_name)  as input_file:
        headers = input_file.readline()
        for i, line in enumerate(input_file):
            trace(i)
            parts = line.strip().split('\t')
            url_id = int(parts[0])                                        
            mark = bool(int(parts[1]))                    
            url = parts[2]
            pageInb64 = parts[3]
            html_data = base64.b64decode(pageInb64)
            features = calc_features_f(url, html_data)            
            yield DocItem(url_id, mark, url, features)            
                
        trace(i, 1)        

In [7]:
def raw_csv2text_csv(name):
    raw_csv_file = '/home/kirill/ssd-pool/kaggle_{}_data_tab.csv'.format(name)
    with open('text_features_{}.csv'.format(name) , 'w') as fout:
        writer = csv.writer(fout)
        writer.writerow(['doc_id','is_spam', 'url', 'title_text', 'links_text', 'text_words'])
        for doc in load_raw_csv(raw_csv_file, get_words_features):
            writer.writerow([doc.doc_id, doc.is_spam, doc.url, 
                             doc.features.title_text, doc.features.links_text, doc.features.text_words])

In [8]:
#raw_csv2text_csv('train')
#raw_csv2text_csv('test')

In [8]:
def split_text_to_list(raw):
    rv = [i[1:-1].strip() for i in raw[1:-1].split(", ")]
    return rv

# change input df
def extract_features_from_texts(texts):
    texts.links_text = texts.links_text.apply(split_text_to_list)
    texts.text_words = texts.text_words.apply(split_text_to_list)
    texts['words_num'] = texts.text_words.apply(lambda x: len(x))
    texts['avg_word_len'] = texts.apply(lambda row: sum([len(w) for w in row['text_words']])/float(row['words_num']), 
                                        axis=1)
    texts['title_words_num'] = texts.title_text.apply(lambda x: len(list(text2words(str(x)))))
    texts['anchor_words_num'] = texts.links_text.apply(lambda x: sum([len(i) for i in x]))
    texts['compression'] = texts.apply(lambda row: row['words_num'] / float(len(set(row['text_words']))), 
                                       axis=1)
    texts['links_num'] = texts.links_text.apply(lambda x: len(x))
    texts['domain_length'] = texts.url.apply(lambda x: len(x.split('//')[1].split('/')[0]))
    return texts

In [9]:
train_texts = extract_features_from_texts(pd.read_csv("text_features_train.csv"))

In [10]:
train_texts[:1]

Unnamed: 0,doc_id,is_spam,url,title_text,links_text,text_words,words_num,avg_word_len,title_words_num,anchor_words_num,compression,links_num,domain_length
0,-9222401963271173253,False,http://lawleader.ru/docs/32/,"Договор займа, договоры, договора","[Главная, О компании, Услуги, Бизнес-договоры,...","[договор, займа, договоры, договора, главная, ...",2914,6.580645,4,1303,2.63472,51,12


In [11]:
def plot_distribution(feature, bins = range(0,3000,10)):
    spam_data = train_texts[train_texts['is_spam']][feature]
    not_spam_data = train_texts[train_texts['is_spam'] == False][feature]
    plt.hist(spam_data, bins=bins, color='red', linewidth=0, normed=True, alpha=0.7, label='spam')
    plt.hist(not_spam_data, bins=bins, color='blue', linewidth=0, normed=True, alpha=0.7, label='not_spam')
    plt.title(feature)
    plt.legend()
    plt.show()

In [12]:
# plot_distribution('words_num')
# plot_distribution('title_words_num', bins = range(0, 30, 1))
# plot_distribution('avg_word_len', bins = 100)

In [13]:
from sklearn.cross_validation import KFold, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import xgboost as xgb



In [14]:
folds = KFold(train_texts.shape[0], n_folds=5, shuffle=True)
Y = train_texts['is_spam'].as_matrix()

In [15]:
def text_words2_strs(words_lists):
    return [" ".join([w for w in i if len(w) > 2]) for i in list(words_lists)]
X_text_str = text_words2_strs(train_texts.text_words)

In [16]:
# try to free memory
train_texts.text_words.apply(lambda x: "")
del train_texts['text_words']
import gc
gc.collect()

139

In [65]:
nb_text_clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', MultinomialNB())
                       ])
nb_text_pred = cross_val_predict(nb_text_clf, X_text_str, Y, folds, n_jobs=-1, verbose=True)
nb_score = f1_score(Y, nb_text_pred, average='weighted')
nb_score
# CountVectorizer: 0.88261277536786598
# CountVectorizer, TfidfTransformer: 0.8833642446256309

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   31.0s remaining:   46.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   45.4s finished


0.8833642446256309

In [21]:
svm_text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', 
                                           penalty='l2',alpha=1e-4, n_iter=5))
                    ])

In [22]:
svm_text_pred = cross_val_predict(svm_text_clf, X_text_str, Y, folds, n_jobs=-1, verbose=True)
svm_score = f1_score(Y, svm_text_pred, average='weighted')
svm_score
# 0.95552005331114609

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   18.0s remaining:   27.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   18.9s finished


0.95552005331114609

In [24]:
svm_text_clf.fit(X_text_str, Y)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [52]:
train_texts['svm_pred'] = svm_text_clf.predict(X_text_str)

In [70]:
features = ['words_num', 'avg_word_len', 'title_words_num', 'anchor_words_num', 
            'compression', 'links_num', 'domain_length']#, 'svm_pred']
X = train_texts[features].as_matrix()
xgb_clf = xgb.XGBClassifier(max_depth = 7, n_estimators=300, silent=False)
xgb_pred = cross_val_predict(xgb_clf, X, Y, folds)
score = f1_score(Y, xgb_pred, average='weighted')
score

0.89822281114989433

In [71]:
xgb_clf.fit(X, Y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [49]:
# nb_text_clf.fit(X_text_str, Y)

In [107]:
#process test file line by line
reader = pd.read_csv('text_features_test.csv', chunksize=500)
chunk_predictions = []
chunk_ids = []
for i, chunk in enumerate(reader):
    if i % 5 == 0:
        print("chunk: #{}".format(i))
    data = extract_features_from_texts(chunk)
    chunk_ids.append(data['doc_id'])
    # svm_pred = svm_text_clf.predict(text_words2_strs(data.text_words))
    # X = data[features].as_matrix()
    # chunk_predictions.append(xgb_clf.predict(X))
    X = text_words2_strs(data.text_words)
    chunk_predictions.append(svm_text_clf.predict(X))

chunk: #0
chunk: #5
chunk: #10
chunk: #15
chunk: #20
chunk: #25
chunk: #30


In [74]:
def flatten(inp):
    return sum([list(i) for i in inp], [])
predictions = flatten(chunk_predictions)
ids = flatten(chunk_ids)
len(predictions), len(ids)

(0, 16039)

In [105]:
result = pd.DataFrame({'Id': ids, 'Prediction': [int(i) for i in predictions]})
result.to_csv('my_submission.csv', index=False)