In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import base64
import pandas
from sklearn.model_selection import GridSearchCV

In [2]:
def get_data(data_file_path):
    data = pandas.read_csv(data_file_path, sep='\t')
    data['Html'] = data['PageBase64'].apply(base64.b64decode)
    return data.drop(['PageBase64', 'Url'], axis=1)

In [3]:
train_data = get_data('data/kaggle_train_data_tab.csv')
vectorizer = TfidfVectorizer(decode_error='ignore')

In [4]:
train_data.head()

Unnamed: 0,Id,Prediction,Html
0,-9222401963271173253,0,"b'<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0..."
1,-9221399504663603656,0,"b'<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0..."
2,-9219733223042265364,0,"b'<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01..."
3,-9218561774155397294,0,"b'<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 ..."
4,-9216163564171210203,1,"b'<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0..."


In [5]:
tf_idf_train = vectorizer.fit_transform(train_data.Html)

In [6]:
parameters = {
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'alpha': [10 ** n for n in range(-4, 5)]
}
estimator = SGDClassifier()
grid_search = GridSearchCV(estimator=estimator, param_grid=parameters, scoring='f1', n_jobs=-1)
grid_search.fit(tf_idf_train, train_data.Prediction)

GridSearchCV(cv=None, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                                   10000],
                         'loss': ['hinge', 'log', 'modified_huber',
                  

In [7]:
grid_search.best_score_

0.982385110667856

In [8]:
grid_search.best_params_

{'alpha': 0.0001, 'loss': 'modified_huber'}

In [9]:
best_estimator = SGDClassifier(alpha=1e-4, loss='hinge')
test_data = get_data('data/kaggle_test_data_tab.csv')
tf_idf_test = vectorizer.transform(test_data.Html)
best_estimator.fit(tf_idf_train, train_data.Prediction)
test_data.Prediction = best_estimator.predict(tf_idf_test)

In [10]:
with open('data/result.csv', 'w') as file:
    file.write('Id,Prediction\n')
    for cur_object in test_data.itertuples():
        file.write('{0},{1}\n'.format(cur_object.Id, cur_object.Prediction))