In [3]:
import pandas as pd

df = pd.read_csv('train.csv')


In [5]:
import src.proc_data as procd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

[nltk_data] Downloading package stopwords to /home/atsu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
import re
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

tfidf = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=procd.parse_html,
                        tokenizer=tokenizer_stem_nostop,
                        max_features=512)
tfidf = tfidf.fit(df['Page content'])


In [8]:
top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()
for i in range(top):
    print('%s: %.2f' %(tfidf.get_feature_names()[i], idf[sorted_idx[i]]))


[vocabularies with smallest idf scores]
abl: 1.12
access: 1.16
accord: 1.25
account: 1.51
across: 1.53
action: 1.61
activ: 1.63
actual: 1.65
ad: 1.74
add: 1.77


In [9]:
def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk

In [15]:
batch_size = 2000

classes = np.array([-1, 1])
iters = int((27643+batch_size-1)/(batch_size))
Cs = [0.01, 0.1, 1.0, 3.0]

best_score = 0
candidates = []



for C in Cs:
    alpha = 1 / (C * batch_size)
    clf = SGDClassifier(loss='hinge', alpha=alpha)
    stream = get_stream(path='train.csv', size=batch_size)
    for i in range(iters):
        batch = next(stream)
        X_train, y_train = batch['Page content'], batch['Popularity']
        if X_train is None:
            break
        X_train = tfidf.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
    
        score = clf.score(X_train, y_train)
        print('[{}/{}] {}'.format((i+1)*(batch_size), 27643, score))
    
    if score > best_score:
        print('bestscore(C=%f): %f' %(C, score))
        best_score = score
        candidates.append({'clf' : clf, 'C' : C, 'score' : score})

[2000/27643] 0.513
[4000/27643] 0.553
[6000/27643] 0.5155
[8000/27643] 0.5135
[10000/27643] 0.525
[12000/27643] 0.522
[14000/27643] 0.5255
[16000/27643] 0.5145
[18000/27643] 0.502
[20000/27643] 0.502
[22000/27643] 0.516
[24000/27643] 0.5075
[26000/27643] 0.499
[28000/27643] 0.49421789409616557
bestscore(C=0.010000): 0.494218
[2000/27643] 0.616
[4000/27643] 0.5335
[6000/27643] 0.535
[8000/27643] 0.534
[10000/27643] 0.5255
[12000/27643] 0.5165
[14000/27643] 0.5095
[16000/27643] 0.5115
[18000/27643] 0.512
[20000/27643] 0.5365
[22000/27643] 0.5015
[24000/27643] 0.5165
[26000/27643] 0.5115
[28000/27643] 0.516737674984784
bestscore(C=0.100000): 0.516738
[2000/27643] 0.6495
[4000/27643] 0.6525
[6000/27643] 0.6255
[8000/27643] 0.6325
[10000/27643] 0.625
[12000/27643] 0.595
[14000/27643] 0.6105
[16000/27643] 0.598
[18000/27643] 0.5825
[20000/27643] 0.5915
[22000/27643] 0.5825
[24000/27643] 0.607
[26000/27643] 0.569
[28000/27643] 0.5727328058429701
bestscore(C=1.000000): 0.572733
[2000/27643] 0.

In [16]:
df_test = pd.read_csv('test.csv')
X_test = tfidf.transform(df_test['Page content'])

In [20]:
for cand in candidates:
    predict = cand['clf'].predict(X_test)
    output = np.zeros((X_test.shape[0], 2), dtype=int)
    output[:, 0] = df_test['Id']
    output[:, 1] = predict
    df_output = pd.DataFrame(data=output, columns=['Id', 'Popularity'])
    df_output.to_csv('test%f.csv' % (cand['C']), index=False)