In [2]:
from gensim import models
from gensim.models.word2vec import LineSentence, Word2Vec
import jieba, re
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy.spatial.distance import cosine

In [3]:
model = Word2Vec.load('word2vec.model')

In [16]:
vlookup = model.wv.vocab  # Gives us access to word index and count
Z = 0
for k in vlookup:
    Z += vlookup[k].count # Compute the normalization constant Z

def get_sentenses_vector(sentences, model=model, alpha=1e-3, Z=Z):
    def sif_embeddings(sentences, model, alpha=alpha, Z=Z):

        vlookup = model.wv.vocab  # Gives us access to word index and count
        vectors = model.wv        # Gives us access to word vectors
        size = model.vector_size  # Embedding size

        output = []

        # Iterate all sentences
        for s in sentences:
            count = 0
            v = np.zeros(size, dtype=np.float32) # Summary vector
            # Iterare all words
            for w in s:
                # A word must be present in the vocabulary
                if w in vlookup:
                    v += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w]
                    count += 1 
            if count > 0:
                v /= count
            output.append(v)
        return np.vstack(output).astype(np.float32 )
    vector = sif_embeddings(sentences, model)
    pca = PCA(1)
    pca.fit(vector)
    u = pca.components_[0]
#     for i in range(len(vector)):
#         vector[i] -= np.multiply(np.multiply(u, u.T), vector[i])
    vector -= np.multiply(np.multiply(u, u.T), vector)
    return vector

In [17]:
csv_content = pd.read_csv( 'sqlResult_1558435.csv', encoding = "gb18030")
csv_content

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm
2,89615,,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""18""...",一加手机5细节曝光：3300mAh、充半小时用1天,http://www.cnbeta.com/articles/tech/623601.htm
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...
...,...,...,...,...,...,...,...
89606,5,邹峥,新华社,新华社照片，多伦多，2017年6月7日\n（体育）（2）冰球——国家女子冰球队海外选秀在多伦...,"{""type"":""冰球"",""site"":""新华社"",""url"":""http://home.x...",（体育）（2）冰球——国家女子冰球队海外选秀在多伦多展开,http://home.xinhua-news.com/gdsdetailxhsnew/22...
89607,4,王衡,新华社,新华社兰州6月3日电（王衡、徐丹）记者从甘肃省交通运输厅获悉，甘肃近日集中开建高速公路、普通...,"{""type"":""宏观经济"",""site"":""新华社"",""url"":""http://home...",（经济）甘肃集中开工35个重点交通建设项目,http://home.xinhua-news.com/gdsdetailxhsnew/22...
89608,3,张旌,新华社,\n\n2017年5月29日，在法国巴黎郊外的凡尔赛宫，法国总统马克龙出席新闻发布会。（新华...,"{""type"":""其它"",""site"":""新华社"",""url"":""http://home.x...",法国议会选举　马克龙有望获“压倒性胜利”,http://home.xinhua-news.com/gdsdetailxhsnew/22...
89609,2,夏文辉,新华社,\n\n2017年5月25日，在美国马萨诸塞州剑桥市，哈佛大学毕业生在毕业典礼上欢呼。（新华...,"{""type"":""其它"",""site"":""新华社"",""url"":""http://home.x...",哈佛大学为何取消这些新生入选资格？,http://home.xinhua-news.com/gdsdetailxhsnew/22...


In [18]:
sentence_pattern = re.compile(r'[！？。…\r\n(?:\\n)]+')    
def get_all_list(content, title):
    if not (type(content) == str and type(title) == str):
        return (None, None, None)
    sentense = [jieba.lcut(el) for el in sentence_pattern.split(content)]
    title = jieba.lcut(title)
    content = jieba.lcut(content)
    return content, title, sentense

In [92]:
def calculate_knn_value(v_list):
    content_list = [el[0] for el in v_list]
    value_list = np.array([el[1] for el in v_list])
    value_start = (value_list[0] + value_list[1]) / 2
    value_end = (value_list[-1] + value_list[-2]) / 2
    value_list = (value_list[1: -1] + value_list[:-2] + value_list[2:]) / 3
    value_list = np.append(value_start, value_list)
    value_list = np.append(value_list, value_end)
    v_list = list(zip(content_list, value_list))
    return v_list

In [94]:
for i in range(len(csv_content)):
    content = (csv_content['content'][i])
    title = (csv_content['title'][i])
    content, title, sentense = get_all_list(content, title)
    if not content or not title or not sentense:
        continue
    v_content = get_sentenses_vector([content])
    v_title = get_sentenses_vector([title])
    v_sentense = get_sentenses_vector(sentense)
    v_target = (v_title + v_content) / 2
    
    v_list = [(''.join(s), cosine(v, v_target)) for s, v in zip(sentense, v_sentense)]
    v_list = [el for el in v_list if not np.isnan(el[1])]
    
    knn_list = calculate_knn_value(v_list)
    knn_list.sort(key=lambda x: x[1], reverse=True)
    
    print(''.join(title))
    print(knn_list)
    print('===========================================')
    
    if i == 10:
        raise  

  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S 

小米MIUI 9首批机型曝光：共计15款
[('MIUI 8去年5月发布，距今已有一年有余，也是时候更新换代了', 0.49579862753550213), ('当然，关于MIUI 9的确切信息，我们还是等待官方消息', 0.482117161154747), ('有人猜测这也是将精力主要用到MIUI 9的研发之中', 0.3428528606891632), ('此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/体验版内测，稳定版暂不受影响），以确保工程师可以集中全部精力进行系统优化工作', 0.327934131026268)]
骁龙835在Windows 10上的性能表现有望改善
[('dows 10桌面平台', 0.4086729884147644), ('os、联发科、华为麒麟、小米澎湃等进入Wi', 0.36866962909698486), ('当然，高通骁龙只是个开始，未来也许还能见到三星Exy', 0.33750319480895996), ('资料显示，骁龙835作为一款集成了CPU、GPU、基带、蓝牙/Wi-Fi的SoC，比传统的Wi', 0.29435616731643677), ('按计划，今年Q4，华硕、惠普、联想将首发骁龙835 Wi', 0.28601733843485516), ('报道称，微软已经拿到了一些新的源码，以便Wi', 0.28514113028844196), ('dows 10更好地理解big.little架构', 0.2845122814178467), ('tel方案可以节省至少30%的PCB空间', 0.27710596720377606), ('骁龙835作为唯一通过Wi', 0.2676331698894501), ('10电脑，预计均是二合一形态的产品', 0.26162779331207275), ('dows 10桌面平台认证的ARM处理器，高通强调，不会因为只考虑性能而去屏蔽掉小核心', 0.24626964330673218), ('相反，他们正联手微软，找到一种适合桌面平台的、兼顾性能和功耗的完美方案', 0.2185290257136027)]
一加手机5细节曝光：3300mAh、充半小时用1天
[('此前的一加3T搭载的是3400mAh电池，Das

武汉千余警察出动 抓获808名俊男靓女全是诈骗犯
[('霖心金鱼A', 0.8416124408443769), ('d张蒙：', 0.8372851933042208), ('都不敢去找工作了', 0.7901845797896385), ('他们当中大多都是年青人，', 0.7666745434204737), ('而给自己的生活抹上污点', 0.7545696546634039), ('我做过， ', 0.7522075548768044), ('接近300人的犯罪团伙，', 0.747958597416679), ('俗话说，知己知彼百战不殆', 0.7408845772345861), ('招聘平台和销售会上百分之八十就是这种工作', 0.7370148872335752), ('用“俊男靓女”来形容也不为过', 0.7359313517808914), ('年纪不大，外形不差，那么到底是犯了什么事呢', 0.7277735322713852), ('z_路漫漫：', 0.7255230123798052), ('可能相当部分人都不大清楚自己到底是啥性质工作吧', 0.7225350886583328), ('The-former-degrees：', 0.7187154764930407), ('弄清网络诈骗的常见手法', 0.7177132119735082), ('赶紧自己跑路了', 0.7166106278697649), ('还有那些天天电话骚扰别人信贷的、卖房的、卖邮票的、发票的社会生活中一不留神就会被坑，所以活着需要理智', 0.7114145971524218), ('冉冉的彗星是个宝：早上还接到说做期货', 0.7102205455303192), ('遗忘的金叶子：碰到这样的骗子找我们做视频，最后被我打电话骂了一顿', 0.7066282050994536), ('a一元复始c：', 0.699535659669588), ('原来他们每天偷偷摸摸干这事', 0.6960663547118505), ('好多同学刚刚毕业都去做过，都是网上投简历，这些公司就猛联系你，其实好多都不知道这是骗人的吧', 0.6889172494411469), ('刑法守望者：', 0.6742491672436396), ('）', 0.672238901

  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  explained_variance_ = (S ** 2) / (n_samples - 1)
  dist = 1.0 - uv / np.sqrt(uu * vv)


RuntimeError: No active exception to reraise

In [102]:
print(csv_content.iloc[2].title,'\n',
csv_content.iloc[2].content)

一加手机5细节曝光：3300mAh、充半小时用1天 
 此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。
至于电池缩水，可能与刘作虎所说，一加手机5要做市面最轻薄大屏旗舰的设定有关。
按照目前掌握的资料，一加手机5拥有5.5寸1080P三星AMOLED显示屏、6G/8GB RAM，64GB/128GB ROM，双1600万摄像头，备货量“惊喜”。
根据京东泄露的信息，一加5起售价是xx99元，应该是在2799/2899/2999中的某个。

