In [66]:
import logging
import os
import pickle
from pprint import pprint
import jieba
import numpy as np
import pandas as pd
from sklearn.model_selection import (GridSearchCV, cross_val_score, train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import LabelSpreading
from scipy import stats
from app_db import DB_ENGINE, rawcontents
from sqlalchemy import update
from utils_log import getLogger
from sklearn.linear_model import SGDClassifier

In [48]:
# Notebook-wide logger, obtained from the project's `utils_log.getLogger`
# helper under the name 'semiTrain'.
logger = getLogger('semiTrain')

In [49]:
def fetchAllData():
    """Read the labelling columns of every row in ``rawcontents``.

    Runs ``SELECT rid, tag, assure FROM rawcontents`` against ``DB_ENGINE``.

    Returns:
        pandas.DataFrame: one row per database row with the columns
        ``rid``, ``tag`` and ``assure``.
    """
    # The original had a second, unreachable ``return raw_contents`` after this
    # statement; it was dead code (and referred to a notebook global), so it
    # has been dropped.
    return pd.read_sql(
        'SELECT rid, tag, assure FROM rawcontents',
        DB_ENGINE
    )

In [51]:
def selectTrainData(data, unlabeled):
    """Pick all confidently labelled rows plus a random sample of unlabelled ones.

    Rows with ``assure > 0.5`` are treated as labelled and are all kept; rows
    with ``assure < 0.5`` are treated as unlabelled and a random subset of
    them is appended. Rows whose ``assure`` is exactly 0.5 (or NaN) match
    neither filter and are left out.

    Args:
        data: DataFrame with at least an ``assure`` column.
        unlabeled: maximum number of unlabelled rows to include. If fewer are
            available, all of them are used.

    Returns:
        pandas.DataFrame: labelled rows first, followed by the sampled
        unlabelled rows, keeping the original index labels.
    """
    unlabeled_data = data[data['assure'] < 0.5]
    labeled_data = data[data['assure'] > 0.5]

    # Shuffle positional indices with NumPy's global RNG and keep the first
    # `unlabeled` of them, i.e. sample without replacement.
    inds = np.arange(len(unlabeled_data))
    np.random.shuffle(inds)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat produces the same stacked frame on every pandas version.
    return pd.concat([labeled_data, unlabeled_data.iloc[inds[:unlabeled]]])

In [52]:
def completeTrainData(current):
    """Attach ``content``/``vector`` from the database and derive the ``ss`` column.

    For every ``rid`` in ``current`` the matching ``content`` and ``vector``
    are read from ``rawcontents`` and inner-merged onto ``current``. An ``ss``
    column is then added that is ``-1`` where ``assure < 0.5`` and the row's
    ``tag`` otherwise, and each stored ``vector`` is unpickled.

    Args:
        current: DataFrame with ``rid``, ``tag`` and ``assure`` columns.
            ``rid`` values must be convertible with ``int()``.

    Returns:
        pandas.DataFrame: the merged frame with a fresh 0..n-1 index. Rows
        whose ``rid`` is not found in the database are dropped by the inner
        merge.
    """
    # Imported locally so this block is self-contained. Wrapping the SQL in
    # text() works on SQLAlchemy 1.x and 2.x; 2.x rejects a bare string.
    from sqlalchemy import text

    columns = ['rid', 'content', 'vector']

    # Formatting `tuple(values)` directly breaks for one rid ("(1,)"), for no
    # rids ("()"), and for NumPy scalars whose repr is "np.int64(1)". Coercing
    # to plain ints and joining avoids all three, and keeps the statement free
    # of anything but integer literals.
    rids = [int(rid) for rid in current['rid'].values]

    if rids:
        stmt = text(
            'SELECT rid, content, vector FROM rawcontents WHERE rid IN ({})'
            .format(', '.join(map(str, rids)))
        )
        with DB_ENGINE.connect() as conn:
            rows = [tuple(row) for row in conn.execute(stmt).fetchall()]
        fetched = pd.DataFrame(rows, columns=columns)
    else:
        # Nothing to look up: skip the query ("IN ()" is not valid SQL) and
        # merge against an empty frame with a compatible key dtype.
        fetched = pd.DataFrame(columns=columns).astype(
            {'rid': current['rid'].dtype}
        )

    traindata = current.merge(fetched, on='rid')

    # Semi-supervised target: -1 marks a row as unlabelled.
    traindata['ss'] = [
        -1 if assure < 0.5 else tag
        for assure, tag in zip(traindata['assure'], traindata['tag'])
    ]

    # NOTE(review): pickle.loads executes arbitrary code from its input; this
    # is only safe while the `vector` column is written by trusted code.
    traindata['vector'] = traindata['vector'].apply(pickle.loads)

    # The original used set_index(np.arange(len(current))), which raises when
    # the merge changes the row count; reset_index gives the same 0..n-1
    # labels for whatever length the merge produced.
    return traindata.reset_index(drop=True)

In [75]:
def extractData(df):
    """Return the ``vector`` and ``tag`` columns of ``df`` as two plain lists.

    Args:
        df: DataFrame with ``vector`` and ``tag`` columns.

    Returns:
        tuple: ``(vectors, tags)``, each a list in row order.
    """
    vectors = list(df['vector'].values)
    tags = list(df['tag'].values)
    return vectors, tags

In [53]:
# Pull rid/tag/assure for the whole `rawcontents` table (see fetchAllData)
# and log the start and end of the read.
logger.info('Read Database ...')
raw_contents = fetchAllData()
logger.info('Complete.')

In [55]:
# Build the working set: every labelled row plus up to 2048 randomly chosen
# unlabelled rows, then enrich it with content/vector/ss from the database.
logger.info('Read train data ...')
current = completeTrainData(selectTrainData(raw_contents, 2048))
logger.info('Complete')

In [65]:
# Partition the working set by confidence: `assure` above 0.5 counts as
# labelled, below 0.5 as unlabelled. Copies are taken so later column
# assignments do not touch `current`.
labeled = current.loc[current['assure'].gt(0.5)].copy()
unlabeled = current.loc[current['assure'].lt(0.5)].copy()

In [67]:
# Hold out part of the labelled rows. The seed is fixed (it used to be drawn
# from os.urandom on every run) so that Restart & Run All yields the same
# split each time.
X_train, X_test, y_train, y_test = train_test_split(
    labeled['vector'].values,
    labeled['tag'].values,
    random_state=229
)

In [68]:
# Linear classifier trained with SGD (modified Huber loss, elastic-net
# penalty, no intercept term).
# The seed used to be drawn from os.urandom on every run, which made results
# irreproducible; it is pinned to 229, the value shown in the recorded
# SGDClassifier repr further down in this notebook.
clf = SGDClassifier(
    random_state=229,
    max_iter=512,
    tol=1e-3,
    penalty='elasticnet',
    loss='modified_huber',
    fit_intercept=False
)

In [71]:
# 5-fold cross-validation of `clf` on the labelled rows; the resulting score
# array is the cell's displayed output.
cross_val_score(
    estimator=clf,
    X=list(labeled['vector'].values),
    y=list(labeled['tag'].values),
    cv=5,
)

array([0.98436647, 0.98739392, 0.98803911, 0.98798888, 0.97895468])

In [76]:
# Turn each subset into (vectors, tags) lists via extractData.
# NOTE(review): y_unlabeled is the 'tag' column of the unlabelled rows and is
# not used by the cells visible here — confirm whether it is needed.
X_labeled, y_labeled = extractData(labeled)
X_unlabeled, y_unlabeled = extractData(unlabeled)

In [77]:
# Fit the classifier on every labelled row; the bare name on the last line
# displays whatever fit() returned.
clf_fit_result = clf.fit(X_labeled, y_labeled)
clf_fit_result

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=False,
       l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
       max_iter=512, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='elasticnet', power_t=0.5, random_state=229, shuffle=True,
       tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [78]:
# Predict a tag for every unlabelled vector with the fitted classifier.
y_predict = clf_fit_result.predict(X_unlabeled)

In [79]:
# Store the predictions next to the unlabelled rows. Only the first ten rows
# are displayed, rather than the whole frame, to keep the output small.
unlabeled['predict'] = y_predict
unlabeled.head(10)

Unnamed: 0,rid,content_x,tag,assure,content_y,vector,ss,predict
100742,59691,支持楼主，楼主分析的不错,,0,支持楼主，楼主分析的不错,"[-0.00050121656, 0.004646495, 0.00049711263, -...",-1.0,1.0
100743,1074015,有懂行的朋友吗，浙江这边现浇楼梯多少钱，现浇楼面多少钱一平方？,,0,有懂行的朋友吗，浙江这边现浇楼梯多少钱，现浇楼面多少钱一平方？,"[-0.00021289136, -0.0002112096, 0.00020131774,...",-1.0,0.0
100744,605732,假如你是当官的，下场会比秦桧惨,,0,假如你是当官的，下场会比秦桧惨,"[-0.00041641563, 0.004448106, 0.0007133413, -0...",-1.0,1.0
100745,698948,呵呵呵 我三套房产不包括现在父母这套住房 我怎么了 我立马在上上的星期把我其中一套住房（比较...,,0,呵呵呵 我三套房产不包括现在父母这套住房 我怎么了 我立马在上上的星期把我其中一套住房（比较...,"[-0.0001965089, 0.0002559799, -0.00016521648, ...",-1.0,1.0
100746,732077,结伴走天涯，点赞去。\n \n\n 抢红包,,0,结伴走天涯，点赞去。\n \n\n 抢红包,"[-0.00012122595, -0.00019975816, -9.6026626e-0...",-1.0,1.0
100747,604329,@匆匆那年2018ABC 2018-08-19 11:02:37\n \n 接近税务深...,,0,@匆匆那年2018ABC 2018-08-19 11:02:37\n \n 接近税务深...,"[-0.00015112055, -0.00021713384, 4.3946588e-05...",-1.0,0.0
100748,790591,我就在想，要是我当时仔细研究一下她的那些衣服的话，我可以少走好多弯路。\n \n\n 在春天...,,0,我就在想，要是我当时仔细研究一下她的那些衣服的话，我可以少走好多弯路。\n \n\n 在春天...,"[0.00014590095, -2.36256e-05, -5.815563e-06, 7...",-1.0,1.0
100749,1323314,靠一个洗剪吹的tony老师来吹风放消息 ，离死不远,,0,靠一个洗剪吹的tony老师来吹风放消息 ，离死不远,"[2.177861e-06, 0.00014500064, -2.8838269e-05, ...",-1.0,1.0
100750,95676,诺贝尔崔崔\n \n 抢到了\n \n 元素yz\n \n 的红包，价值0.0...,,0,诺贝尔崔崔\n \n 抢到了\n \n 元素yz\n \n 的红包，价值0.0...,"[-0.00053806993, 0.004573738, 0.00034990904, -...",-1.0,1.0
100751,176206,EOS作为一个去中心化的操作系统，支持很多开发者在其上面开发Dapp，EOS token的作...,,0,EOS作为一个去中心化的操作系统，支持很多开发者在其上面开发Dapp，EOS token的作...,"[5.1506522e-05, -5.8857413e-06, 1.2539855e-05,...",-1.0,1.0


In [84]:
# Pull the first unlabelled row into individual variables. Fields are looked
# up by column name because positional unpacking into eight names breaks as
# soon as the column count or order changes.
first_unlabeled = unlabeled.iloc[0]
rid, tag, assure, vector, ss, predict = (
    first_unlabeled[name]
    for name in ('rid', 'tag', 'assure', 'vector', 'ss', 'predict')
)
# `content_x`/`content_y` only exist when the merge in completeTrainData met a
# pre-existing `content` column; otherwise there is a single `content` column,
# which is used for both names (None if neither is present).
content_x = first_unlabeled.get('content_x', first_unlabeled.get('content'))
content_y = first_unlabeled.get('content_y', first_unlabeled.get('content'))

59691