In [64]:
import pandas as pd

df = pd.read_csv('train.csv')
not_fit = True

In [65]:
import src.proc_data as procd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

[nltk_data] Downloading package stopwords to /Users/AtSu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
p_data = df[df['Popularity'] > 0] 
n_data = df[df['Popularity'] < 0]

tokenizer_stem_nostop('amazon microsoft women hahahaha')

['amazon', 'microsoft', 'women', 'hahahaha']

In [81]:
import itertools, operator

def extract_term(top, tfidf):
    feat = tfidf.get_feature_names()
    idf = tfidf.idf_
    idx = idf.argsort()
    sorted_score = []
    
    for i in idx: sorted_score.append(idf[i])
    sorted_smallest = list(zip(feat, sorted_score))
    sorted_smallest = sorted(sorted_smallest, key=operator.itemgetter(1))
    sorted_highest = sorted_smallest[::-1]
    term = []
    print('[vocabularies with smallest idf scores]')
    for i in range(top):
        #print('%s : %f' %(sorted_smallest[i][0], sorted_smallest[i][1]))
        term.append(sorted_smallest[i][0])
        
    print('[vocabularies with height idf scores]')
    for i in range(top):
        #print('%s : %f' %(sorted_highest[i][0], sorted_highest[i][1]))
        term.append(sorted_highest[i][0])
    return term

In [82]:
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import _pickle as pkl

stop = stopwords.words('english')

p = p_data['Page content']
n = n_data['Page content']

if not_fit:
    tfidf = TfidfVectorizer(ngram_range=(1,3),
                        preprocessor=procd.parse_html,
                        tokenizer=tokenizer_stem_nostop,
                        max_features=512,
                        )
    not_fit = False
    tfidf_p = tfidf.fit(p)
    pkl.dump(tfidf_p, open('tfidf_p.pkl', 'wb'))
    tfidf_n = tfidf.fit(n)
    pkl.dump(tfidf_n, open('tfidf_n.pkl', 'wb'))

tfidf_p = pkl.load(open('tfidf_p.pkl', 'rb'))
tfidf_n = pkl.load(open('tfidf_n.pkl', 'rb'))

t_p = extract_term(120, tfidf_p)
t_n = extract_term(120, tfidf_n)
   
extra_stop = []
for t in t_p:
    if t in t_n: extra_stop.append(t)
stop += extra_stop

print(stop)

[vocabularies with smallest idf scores]
[vocabularies with height idf scores]
[vocabularies with smallest idf scores]
[vocabularies with height idf scores]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'm

In [91]:
tfidf = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=procd.parse_html,
                        tokenizer=tokenizer_stem_nostop,
                        max_features=128,
                        )
tfidf.fit(df['Page content'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=128, min_df=1,
        ngram_range=(1, 2), norm='l2',
        preprocessor=<function parse_html at 0x10c5bad90>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenizer_stem_nostop at 0x14eb48510>,
        use_idf=True, vocabulary=None)

In [92]:
def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk

In [96]:
batch_size = 2000

classes = np.array([-1, 1])
iters = int((27643+batch_size-1)/(batch_size))
Cs = [1.0]

best_score = 0
candidates = []



for C in Cs:
    clf = SGDClassifier(loss='hinge', alpha=1e-3)
    stream = get_stream(path='train.csv', size=batch_size)
    for i in range(iters):
        batch = next(stream)
        X_train, y_train = batch['Page content'], batch['Popularity']
        if X_train is None:
            break
        X_train = tfidf.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
    
        score = clf.score(X_train, y_train)
        print('[{}/{}] {}'.format((i+1)*(batch_size), 27643, score))
    
    if score > best_score:
        print('bestscore(C=%f): %f' %(C, score))
        best_score = score
        candidates.append({'clf' : clf, 'C' : C, 'score' : score})

[2000/27643] 0.5885
[4000/27643] 0.5975
[6000/27643] 0.547
[8000/27643] 0.5735
[10000/27643] 0.542
[12000/27643] 0.5585
[14000/27643] 0.562
[16000/27643] 0.549
[18000/27643] 0.5375
[20000/27643] 0.565
[22000/27643] 0.549
[24000/27643] 0.558
[26000/27643] 0.5395
[28000/27643] 0.5459525258673159
bestscore(C=1.000000): 0.545953


In [94]:
df_test = pd.read_csv('test.csv')
X_test = tfidf.transform(df_test['Page content'])

In [95]:
for cand in candidates:
    predict = cand['clf'].predict(X_test)
    output = np.zeros((X_test.shape[0], 2), dtype=int)
    output[:, 0] = df_test['Id']
    output[:, 1] = predict
    df_output = pd.DataFrame(data=output, columns=['Id', 'Popularity'])
    df_output.to_csv('test%f.csv' % (cand['C']), index=False)