In [1]:
from model.db import DB_ENGINE
import pandas as pd
import numpy as np
import logging
import jieba
import jieba.analyse
from math import sqrt
import os
from pprint import pprint

In [2]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

In [3]:
raw_contents = pd.read_sql('SELECT rid, content, tag, assure FROM rawcontents', DB_ENGINE, index_col='rid')
raw_contents.head()

Unnamed: 0_level_0,content,tag,assure
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。,1.0,1
2,破5000是大概率事件,1.0,1
3,估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。,0.0,1
4,出天涯钻，5毛一个,1.0,1
5,,1.0,1


In [4]:
tagged_data = raw_contents[raw_contents['assure'] > 0].copy()
print(tagged_data.describe())
tagged_data.head()

              tag  assure
count  2025.00000  2025.0
mean      0.28642     1.0
std       0.45220     0.0
min       0.00000     1.0
25%       0.00000     1.0
50%       0.00000     1.0
75%       1.00000     1.0
max       1.00000     1.0


Unnamed: 0_level_0,content,tag,assure
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。,1.0,1
2,破5000是大概率事件,1.0,1
3,估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。,0.0,1
4,出天涯钻，5毛一个,1.0,1
5,,1.0,1


### Fit

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

In [6]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42, max_iter=64 ,tol=None)) # sublinear_tf 
])

In [7]:
scale = 0.6
mask = np.random.random((len(tagged_data)))
train_data = tagged_data[mask <= scale]
test_data = tagged_data[mask > scale]

In [8]:
text_clf.fit(train_data['content'].values, train_data['tag'].values)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [9]:
predicted = text_clf.predict(test_data['content'].values)
np.mean(predicted == test_data['tag'].values)

0.6964705882352941

#### Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [53]:
def tok2(sentence):
    return jieba.lcut(sentence)


text_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=tok2)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=ord(os.urandom(1)), max_iter=512, tol=1e-3)) # sublinear_tf 
])

record = []

def Search(parameters):
    for i in range(10):
        gs_clf =  RandomizedSearchCV(text_clf, parameters, n_jobs=-1, cv=5, iid=False)
        gs_clf_result = gs_clf.fit(train_data['content'].values, train_data['tag'].values)
        print(i, gs_clf_result.best_score_)
        record.append( dict([('score', gs_clf_result.best_score_), *gs_clf_result.best_params_.items()]) )
    
    df = pd.DataFrame(record)
    return df.sort_values('score', ascending=False)

In [26]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__norm': (None, 'l2', 'l1'),
    'tfidf__use_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'clf__penalty': ('none', 'l2', 'l1', 'elasticnet'),
    'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
    'clf__fit_intercept': (True, False),
    'clf__alpha': (1, 1e-1, 1e-2, 1e-3)
}

Search(parameters).head(10)

0.7395979238135746 {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'tfidf__sublinear_tf': False, 'tfidf__smooth_idf': True, 'tfidf__norm': 'l1', 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__fit_intercept': False, 'clf__alpha': 1}
0.7370230026414664 {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'tfidf__sublinear_tf': True, 'tfidf__smooth_idf': True, 'tfidf__norm': None, 'clf__penalty': 'elasticnet', 'clf__loss': 'hinge', 'clf__fit_intercept': False, 'clf__alpha': 1}
0.737044701501968 {'vect__ngram_range': (1, 3), 'tfidf__use_idf': True, 'tfidf__sublinear_tf': True, 'tfidf__smooth_idf': True, 'tfidf__norm': 'l1', 'clf__penalty': 'l2', 'clf__loss': 'log', 'clf__fit_intercept': False, 'clf__alpha': 0.01}
0.7370230026414664 {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'tfidf__sublinear_tf': True, 'tfidf__smooth_idf': False, 'tfidf__norm': 'l1', 'clf__penalty': 'l1', 'clf__loss': 'modified_huber', 'clf__fit_intercept': False, 'clf__alpha': 1}
0.737026608



0.7438751267564827 {'vect__ngram_range': (1, 3), 'tfidf__use_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__smooth_idf': False, 'tfidf__norm': None, 'clf__penalty': 'l1', 'clf__loss': 'perceptron', 'clf__fit_intercept': False, 'clf__alpha': 0.1}
0.7395979238135746 {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'tfidf__sublinear_tf': False, 'tfidf__smooth_idf': True, 'tfidf__norm': 'l1', 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__fit_intercept': False, 'clf__alpha': 0.1}
0.7378957653317553 {'vect__ngram_range': (1, 3), 'tfidf__use_idf': True, 'tfidf__sublinear_tf': True, 'tfidf__smooth_idf': True, 'tfidf__norm': 'l1', 'clf__penalty': 'l2', 'clf__loss': 'modified_huber', 'clf__fit_intercept': False, 'clf__alpha': 0.01}
0.7370230026414664 {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'tfidf__sublinear_tf': True, 'tfidf__smooth_idf': True, 'tfidf__norm': 'l2', 'clf__penalty': 'elasticnet', 'clf__loss': 'modified_huber', 'clf__fit_intercept': False, 'clf__

In [40]:
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3), (1, 4)],#
    'tfidf__norm': (None, 'l1'),#
    'tfidf__use_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'clf__penalty': ('l2', 'l1', 'elasticnet'),#
    'clf__loss': ('log', 'modified_huber', 'squared_hinge', 'perceptron'),
    'clf__fit_intercept': (True, False),
    'clf__alpha': (2, 1, 1e-1, 1e-2, 1e-3)#
}

Search(parameters).head(10)

0.7370230026414664
0.7370230026414664
0.7370230026414664
0.7370230026414664
0.7404960223892788
0.7370302458690485
0.7395979238135746
0.7370266396663799
0.7370230026414664
0.7395979238135746


Unnamed: 0,clf__alpha,clf__fit_intercept,clf__loss,clf__penalty,score,tfidf__norm,tfidf__smooth_idf,tfidf__sublinear_tf,tfidf__use_idf,vect__ngram_range
12,1.0,True,modified_huber,elasticnet,0.752386,,False,False,True,"(1, 3)"
25,1.0,False,modified_huber,elasticnet,0.750727,,False,False,True,"(1, 3)"
16,0.001,False,perceptron,l1,0.744715,l2,True,False,True,"(1, 2)"
29,2.0,True,modified_huber,elasticnet,0.744708,,True,False,True,"(1, 2)"
2,0.1,False,modified_huber,elasticnet,0.743958,,False,False,True,"(1, 3)"
5,0.01,False,perceptron,elasticnet,0.742129,l2,True,True,True,"(1, 2)"
34,0.1,True,modified_huber,l2,0.740496,,True,False,False,"(1, 2)"
3,0.001,False,perceptron,elasticnet,0.740485,l1,True,False,False,"(1, 4)"
26,2.0,False,modified_huber,elasticnet,0.740456,,True,False,True,"(1, 4)"
36,2.0,False,squared_hinge,l2,0.739598,l1,False,False,False,"(1, 4)"


In [37]:
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5)], #
    'tfidf__norm': (None, 'l2', 'l1'),
    'tfidf__use_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'clf__penalty': ('none', 'l2', 'l1', 'elasticnet'),
    'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
    'clf__fit_intercept': (True, False),
    'clf__alpha': (1, 1e-1, 1e-2, 1e-3, 1e-4)#
}

Search(parameters).head(10)

0.7371567403626546
0.7395979238135746
0.7439581618845953
0.7404850496700479
0.7370447323242131
0.7421293856201898
0.7387468599837875
0.7370230026414664
0.7370230026414664
0.7370230026414664


Unnamed: 0,clf__alpha,clf__fit_intercept,clf__loss,clf__penalty,score,tfidf__norm,tfidf__smooth_idf,tfidf__sublinear_tf,tfidf__use_idf,vect__ngram_range
2,0.1,False,modified_huber,elasticnet,0.743958,,False,False,True,"(1, 3)"
5,0.01,False,perceptron,elasticnet,0.742129,l2,True,True,True,"(1, 2)"
3,0.001,False,perceptron,elasticnet,0.740485,l1,True,False,False,"(1, 4)"
1,0.1,False,modified_huber,l2,0.739598,l1,True,False,False,"(1, 3)"
6,0.1,False,modified_huber,l2,0.738747,l1,True,True,False,"(1, 3)"
0,0.001,True,squared_hinge,l1,0.737157,l2,False,True,True,"(1, 3)"
4,0.1,False,hinge,none,0.737045,l1,True,False,False,"(1, 4)"
7,0.1,False,perceptron,l1,0.737023,l1,True,False,False,"(1, 3)"
8,0.1,True,hinge,l1,0.737023,l1,False,True,True,"(1, 2)"
9,1.0,True,modified_huber,l1,0.737023,,True,True,False,"(1, 5)"


In [38]:
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5)],
    'tfidf__norm': (None, 'l2', 'l1'),
    'tfidf__use_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__smooth_idf': [True, False],
    'clf__penalty': ('l1', 'elasticnet'), #
    'clf__loss': ('modified_huber', 'perceptron'), #
    'clf__fit_intercept': [True, False],
    'clf__alpha': (1, 1e-1, 1e-2, 1e-3, 1e-4)#
}

Search(parameters).head(10)

0 0.7378740664712536
1 0.7378740664712536
2 0.752385703409865
3 0.7395871052055689
4 0.7370230026414664
5 0.7370230026414664
6 0.7447151254003039
7 0.7370230026414664
8 0.7370230026414664
9 0.7370230026414664


Unnamed: 0,clf__alpha,clf__fit_intercept,clf__loss,clf__penalty,score,tfidf__norm,tfidf__smooth_idf,tfidf__sublinear_tf,tfidf__use_idf,vect__ngram_range
12,1.0,True,modified_huber,elasticnet,0.752386,,False,False,True,"(1, 3)"
16,0.001,False,perceptron,l1,0.744715,l2,True,False,True,"(1, 2)"
2,0.1,False,modified_huber,elasticnet,0.743958,,False,False,True,"(1, 3)"
5,0.01,False,perceptron,elasticnet,0.742129,l2,True,True,True,"(1, 2)"
3,0.001,False,perceptron,elasticnet,0.740485,l1,True,False,False,"(1, 4)"
1,0.1,False,modified_huber,l2,0.739598,l1,True,False,False,"(1, 3)"
13,0.1,False,perceptron,elasticnet,0.739587,,True,True,False,"(1, 3)"
6,0.1,False,modified_huber,l2,0.738747,l1,True,True,False,"(1, 3)"
10,0.01,False,modified_huber,l1,0.737874,l1,True,False,True,"(1, 3)"
11,0.01,False,modified_huber,l1,0.737874,l1,True,True,True,"(1, 3)"


In [39]:
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3), (1, 4)],
    'tfidf__norm': (None, 'l2', 'l1'),
    'tfidf__use_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__smooth_idf': [True, False],
    'clf__penalty': ('l1', 'elasticnet'),
    'clf__loss': ('modified_huber', 'perceptron'),
    'clf__fit_intercept': [True, False],
    'clf__alpha': (2, 1, 1e-1, 1e-2, 1e-3, 1e-4)#
}

Search(parameters).head(10)

0 0.7370230026414664
1 0.7370230026414664
2 0.7370230026414664
3 0.7370230026414664
4 0.7370230026414664
5 0.7507269734712937
6 0.7404563233376793
7 0.738761469727932
8 0.7370230026414664
9 0.7447080054617018


Unnamed: 0,clf__alpha,clf__fit_intercept,clf__loss,clf__penalty,score,tfidf__norm,tfidf__smooth_idf,tfidf__sublinear_tf,tfidf__use_idf,vect__ngram_range
12,1.0,True,modified_huber,elasticnet,0.752386,,False,False,True,"(1, 3)"
25,1.0,False,modified_huber,elasticnet,0.750727,,False,False,True,"(1, 3)"
16,0.001,False,perceptron,l1,0.744715,l2,True,False,True,"(1, 2)"
29,2.0,True,modified_huber,elasticnet,0.744708,,True,False,True,"(1, 2)"
2,0.1,False,modified_huber,elasticnet,0.743958,,False,False,True,"(1, 3)"
5,0.01,False,perceptron,elasticnet,0.742129,l2,True,True,True,"(1, 2)"
3,0.001,False,perceptron,elasticnet,0.740485,l1,True,False,False,"(1, 4)"
26,2.0,False,modified_huber,elasticnet,0.740456,,True,False,True,"(1, 4)"
1,0.1,False,modified_huber,l2,0.739598,l1,True,False,False,"(1, 3)"
13,0.1,False,perceptron,elasticnet,0.739587,,True,True,False,"(1, 3)"


In [52]:
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3), (1, 4)],
    'tfidf__norm': (None, 'l2', 'l1'),
    'tfidf__use_idf': [True],
    'tfidf__sublinear_tf': [False],
    'tfidf__smooth_idf': [True, False],
    'clf__penalty': ('l1', 'elasticnet'),
    'clf__loss': ('modified_huber', 'perceptron'),
    'clf__fit_intercept': [True, False],
    'clf__alpha': (1.5, 1, 1e-1, 1e-2, 0.001, 0.0005)#
}

gs_clf =  GridSearchCV(text_clf, parameters, n_jobs=-1, cv=5, iid=False)
gs_clf_result = gs_clf.fit(train_data['content'].values, train_data['tag'].values)

In [54]:
print(gs_clf_result.best_score_)
gs_clf_result.best_params_

0.757477630755669


{'clf__alpha': 1,
 'clf__fit_intercept': True,
 'clf__loss': 'modified_huber',
 'clf__penalty': 'elasticnet',
 'tfidf__norm': None,
 'tfidf__smooth_idf': False,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 3)}

In [55]:
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3)],
    'tfidf__norm': [None, 'l2'],
    'tfidf__use_idf': [True],
    'tfidf__sublinear_tf': [False],
    'tfidf__smooth_idf': [False],
    'clf__penalty': ['elasticnet'],
    'clf__loss': ['modified_huber'],
    'clf__fit_intercept': [True, False],
    'clf__alpha': np.linspace(0.8, 1.2, 10)
}

gs_clf =  GridSearchCV(text_clf, parameters, n_jobs=-1, cv=5, iid=False)
gs_clf_result = gs_clf.fit(train_data['content'].values, train_data['tag'].values)

In [56]:
print(gs_clf_result.best_score_)
gs_clf_result.best_params_

0.751523790149827


{'clf__alpha': 1.2,
 'clf__fit_intercept': False,
 'clf__loss': 'modified_huber',
 'clf__penalty': 'elasticnet',
 'tfidf__norm': None,
 'tfidf__smooth_idf': False,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2)}

In [58]:
predict_tag = gs_clf_result.predict(train_data['content'].values)
result = np.array(predict_tag == train_data['tag'].values)
result.mean()

0.7642553191489362

In [119]:
def tok1(sentence):
    return jieba.analyse.extract_tags(sentence, topK=int(sqrt(len(sentence))))

def tok2(sentence):
    return jieba.lcut(sentence)


text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=ord(os.urandom(1)), max_iter=128, tol=1e-3)) # sublinear_tf 
])


parameters = {
    'vect__tokenizer': [tok2],
    'vect__ngram_range': [(1, 2)],
    'tfidf__norm': ['l2'],
    'tfidf__use_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__smooth_idf': [True, False],
    'clf__penalty': ['none', 'l2'],
    'clf__loss': ['squared_hinge', 'perceptron'],
    'clf__fit_intercept': [True],
    'clf__alpha': np.linspace(0.018, 0.020, 11)
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=5)
gs_clf_result = gs_clf.fit(train_data['content'].values, train_data['tag'].values)

print(gs_clf_result.best_score_)
gs_clf_result.best_params_



0.7683578104138852


{'clf__alpha': 0.0192,
 'clf__fit_intercept': True,
 'clf__loss': 'squared_hinge',
 'clf__penalty': 'none',
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': False,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2),
 'vect__tokenizer': <function __main__.tok2(sentence)>}