In [1]:
from model.db import DB_ENGINE
import pandas as pd
import numpy as np
import logging
import jieba
import jieba.analyse
from math import sqrt
import os
from pprint import pprint

In [2]:
from gensim.models.doc2vec import Doc2Vec

In [3]:
model = Doc2Vec.load('RuntimeTY/d2v_2048_5_1216')

In [4]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

In [5]:
raw_contents = pd.read_sql('SELECT rid, content, tag, assure FROM rawcontents', DB_ENGINE)
raw_contents.head()

Unnamed: 0,rid,content,tag,assure
0,1,比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。,1.0,1
1,2,破5000是大概率事件,1.0,1
2,3,估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。,0.0,1
3,4,出天涯钻，5毛一个,1.0,1
4,5,,1.0,1


In [6]:
tagged_data = raw_contents[raw_contents['assure'] > 0.5].copy()
tagged_data = tagged_data.set_index(np.arange(len(tagged_data)))
print(tagged_data.describe())
tagged_data.head()

                rid            tag    assure
count  1.007420e+05  100742.000000  100742.0
mean   5.841642e+05       0.981507       1.0
std    4.699411e+05       0.134726       0.0
min    1.000000e+00       0.000000       1.0
25%    1.784752e+05       1.000000       1.0
50%    4.258105e+05       1.000000       1.0
75%    1.008566e+06       1.000000       1.0
max    1.587615e+06       1.000000       1.0


Unnamed: 0,rid,content,tag,assure
0,1,比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。,1.0,1
1,2,破5000是大概率事件,1.0,1
2,3,估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。,0.0,1
3,4,出天涯钻，5毛一个,1.0,1
4,5,,1.0,1


### Fit

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [8]:
scale = 0.1
tagged_data
mask = np.random.random(len(tagged_data))
tagged_data['ss'] = list(map(lambda x: -1 if x[0] > 0.1 else x[1], zip(mask, tagged_data['tag'])))
train_data = tagged_data[tagged_data['ss'] > 0]
test_data = tagged_data[tagged_data['ss'] < 0]

print(tagged_data.describe())
tagged_data

                rid            tag    assure             ss
count  1.007420e+05  100742.000000  100742.0  100742.000000
mean   5.841642e+05       0.981507       1.0      -0.799329
std    4.699411e+05       0.134726       0.0       0.599391
min    1.000000e+00       0.000000       1.0      -1.000000
25%    1.784752e+05       1.000000       1.0      -1.000000
50%    4.258105e+05       1.000000       1.0      -1.000000
75%    1.008566e+06       1.000000       1.0      -1.000000
max    1.587615e+06       1.000000       1.0       1.000000


Unnamed: 0,rid,content,tag,assure,ss
0,1,比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。,1.0,1,-1.0
1,2,破5000是大概率事件,1.0,1,-1.0
2,3,估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。,0.0,1,-1.0
3,4,出天涯钻，5毛一个,1.0,1,-1.0
4,5,,1.0,1,-1.0
5,6,探讨挖矿相关的技术，探讨区块链数字货币投资。展示行业最新动态，交流各自心得。共同提高对区块链...,0.0,1,-1.0
6,7,有想法吗,1.0,1,-1.0
7,8,看名来,1.0,1,1.0
8,9,挖掘商业新形态，掌握赚钱新模式\n \n 重塑商业新格局，构建商业新思维\n \n 抢占区块...,1.0,1,-1.0
9,10,币圈经过多年发展已经被头部交易所垄断，新币若想上去，需要花很多钱。\n \n 既然区块链是传...,0.0,1,-1.0


In [9]:
ss_X = list(map(model.infer_vector, tagged_data['content'].values))
ss_y = tagged_data['ss'].values

In [10]:
lp_model = LabelSpreading(max_iter=128, tol=1e-3, n_jobs=-1)
lp_model.fit(ss_X, ss_y)

MemoryError: 

In [None]:
y_pred = lp_model.transduction_[test_data.index]
res = np.array(y_pred != test_data['tag'].values)

In [None]:
res.mean()

In [None]:
from scipy import stats

In [None]:
unlabeled_indices = test_data.index

In [None]:
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
uncertainty_index = np.argsort(pred_entropies)[::1]
uncertainty_index = uncertainty_index[np.in1d(uncertainty_index, unlabeled_indices)][:2000]

In [None]:
uncertainty_index

### Spread

In [11]:
from model.db import DB_ENGINE, rawcontents
from sqlalchemy import update

In [12]:
raw_contents = pd.read_sql('SELECT rid, content, tag, assure FROM rawcontents', DB_ENGINE)
unlabeled_data = raw_contents[raw_contents['assure'] < 0.5].copy()
labled_data = raw_contents[raw_contents['assure'] > 0.5].copy()
labled_data.describe()

Unnamed: 0,rid,tag,assure
count,100742.0,100742.0,100742.0
mean,584164.2,0.981507,1.0
std,469941.1,0.134726,0.0
min,1.0,0.0,1.0
25%,178475.2,1.0,1.0
50%,425810.5,1.0,1.0
75%,1008566.0,1.0,1.0
max,1587615.0,1.0,1.0


In [14]:
inds = np.arange(len(unlabeled_data))
np.random.shuffle(inds)
current = labled_data.append(unlabeled_data.iloc[inds[:2048]])
current['ss'] = list(map(lambda x: -1 if x[0] < 0.5 else x[1], zip(current['assure'], current['tag'])))
current = current.set_index(np.arange(len(current)))
current

Unnamed: 0,rid,content,tag,assure,ss
0,1,比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。,1.0,1,1.0
1,2,破5000是大概率事件,1.0,1,1.0
2,3,估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。,0.0,1,0.0
3,4,出天涯钻，5毛一个,1.0,1,1.0
4,5,,1.0,1,1.0
5,6,探讨挖矿相关的技术，探讨区块链数字货币投资。展示行业最新动态，交流各自心得。共同提高对区块链...,0.0,1,0.0
6,7,有想法吗,1.0,1,1.0
7,8,看名来,1.0,1,1.0
8,9,挖掘商业新形态，掌握赚钱新模式\n \n 重塑商业新格局，构建商业新思维\n \n 抢占区块...,1.0,1,1.0
9,10,币圈经过多年发展已经被头部交易所垄断，新币若想上去，需要花很多钱。\n \n 既然区块链是传...,0.0,1,0.0


In [None]:
X_current = list(map(model.infer_vector, current['content'].values))
y_current = current['ss'].values

print("prepared")

lp_model = LabelSpreading(max_iter=128, tol=1e-3, n_jobs=-1)
lp_model_fitted = lp_model.fit(X_current, y_current)

prepared


In [None]:
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
current['pred_entropies'] = pred_entropies
sortted = current.sort_values('pred_entropies', ascending=False).set_index(np.arange(len(current)))

In [None]:
shuirows = sortted.loc[0:40]

shuirows

In [None]:
shui = shuirows.content.values
pprint(list(set(shui)))

In [None]:
bushui = []

In [None]:
print('shui =', list(shui))
print('bushui =', list(bushui))