In [1]:
import copy
import math
import pandas as pd
import numpy as np
import jieba
import nltk
import spacy
import json
from collections import Counter
from tqdm.notebook import tqdm

## 文件导入
    -   读取csv文件
    -   转换csv为列表list

In [2]:
datapath = 'data/test-f.csv'

In [3]:
data = pd.read_csv(datapath,index_col=0)
data_records = data.to_dict('records')

In [4]:
sample_string = data_records[42]['content']

## 分词函数

In [5]:
def cut_sentence(sent):
    """
    :param sent: str
    :return:  list

    Write your own code here
    """
    return list(jieba.cut(sent))

### 测试一下函数

In [6]:
cut_sentence(sample_string)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/c_/f8ygdlpx3nn0nxjx71cjbdkw0000gn/T/jieba.cache
Loading model cost 0.307 seconds.
Prefix dict has been built successfully.


['有些', '事越', '想要', '越', '得不到', '，', '有些', '梦', '只能', '相信', '，', '是', '这样', '吗']

### 获取停用词

In [7]:
def get_stopwords():
    """
    You can provide A better list, here is an example.
    https://github.com/goto456/stopwords
    """
    stopwords = [word.strip() for word in open('data/cn_stopwords.txt').readlines()]
    return stopwords
stop_words = get_stopwords()

### 过滤停用词的函数

In [1]:
def strip_stopwords(words,stopwords):
    """
    :param words:  词语列表
    :param stopwords:   停用词列表
    :return:
    """
    return [word for word in words if word not in stopwords]

### 测试一下

In [9]:
strip_stopwords(cut_sentence(sample_string),stop_words)

['事越', '想要', '越', '得不到', '梦', '只能', '相信']

## 遍历所有数据：分词 -> 过滤停用词

In [10]:
"""
Now loop over all sentences to get their words,
then save the results to the variable data_records for later call.

在这个函数里实现遍历所有数据 并分词
"""
def process_content(data_records):
    """
    Your code here....
    :return: list
    """
    records = []

    for item in data_records:
        item.update({
            'words':strip_stopwords(cut_sentence(str(item['content'])),stop_words)
        })
        records.append(item)
    return records

recordsWithWords = process_content(data_records)

## 获取词典

In [11]:
wordCorpus = []
for item in recordsWithWords:
    wordCorpus+=item['words']
print(wordCorpus[:20])

['宿舍', '要民汉合宿', '毛', '大三', '折腾', '早上', '竟然', '变成', '一个', '无理取闹', '…', '…', '多年', '周天', '先生', '率', '智多星', '律师', '策划师', '团队']


### 这个函数实现了对词语列表的过滤 并存储

In [12]:
# construct the vocabulary list

def construct_vocab(wordCorpus,save=True):
    """
    :param word_list:list
    :return: a vocab set
    [Attention]: A set means there is no repeat item.
    """

    #Your code here..
    vocab = list(set(wordCorpus))


    ## You don't have to modify the following code
    if save:
        res = vocab
        res.sort()
        json.dump(res,open('results/vocab.json','w'),ensure_ascii=False)
    return vocab
vocab = construct_vocab(wordCorpus)
word2id = { w:idx for idx,w in enumerate(vocab)}

### 这个函数获取词语列表的词频统计

In [13]:
def wordFreq(wordList):
    return dict(Counter(wordList))

In [14]:
vocabFreq = { w:0 for idx,w in enumerate(vocab)}
wordCounted = wordFreq(wordCorpus)
vocabFreq.update(wordCounted)

In [15]:
# def get_total_freq_of_sentence(wordList,Freq):
#     total = sum([Freq[word] for word in wordList])
#     return total

def convert_words2freqs(wordList,Freq):
    """
    这个函数获取了词语列表中 每一个词的词频
    :param wordList:
    :param Freq: 字典 从wordFreq获取
    :return: 列表，对应每个词语的词频
    """
    return [Freq[w] for w in wordList]

TF Calculation

为每个句子中的单词计算对应的 在句子中的 词频 aka TF: term frequency

 $tf_{i,j} = \frac{n_{i,j}}{\sum_{k}n_{k,j}}$,

i 是词语索引, j 是句子索引


 each sentence will construct its features on the scale of the whole vocabulary list

In [16]:
def get_tf_feat(wordList,vocab):
    """
    这个函数 根据词频构建句子的特征
    :param wordList: 单词列表
    :param vocab: 词汇表
    :return: 句子的特征向量
    """
    tf = np.zeros(vocab_size)
    freq = wordFreq(wordList)
    for item in freq.items():
        tf[vocab.index(item[0])] = item[1]
    tf = tf/tf.sum()
    return tf

In [17]:
vocab_size = len(vocab)
len(vocab)

21624

 ## 遍历数据集 构建每个句子的tf特征

In [18]:
recordsWithTF = copy.deepcopy(recordsWithWords)
for idx,item in enumerate(tqdm(recordsWithTF)):
    tf = get_tf_feat(item['words'],vocab)
    item['TF_array'] = tf

  0%|          | 0/5000 [00:00<?, ?it/s]

In [19]:
[item for item in recordsWithTF[42]['TF_array'].tolist() if item!=0]

[0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285]

IDF Calculation

$idf_i = lg\frac{|D|}{1 + |{j:t_i \in d_j}|}$

|D|: the total of sentences
$d_j$: the specific j-th sentence
|{j:t_i \in d_j}|: the number of sentences that contains the word $t_i$

1+: avoid zero division when the word not in any sentences

idf是针对整个数据集做的统计

In [20]:
def calculateCorpusIDF(vocab,ListOfWordList):
    IDF_values = { w:0 for idx,w in enumerate(vocab)}
    D = len(ListOfWordList)
    for w in tqdm(IDF_values.keys()):
        count = 1
        for doc in ListOfWordList:
            if w in doc:
                count+=1
        idf_value = math.log(D/count)
        IDF_values.update({w:idf_value})
    return IDF_values


def get_idf_feat(wordList,idf_values):
    assert idf_values != None
    idf = np.zeros(vocab_size)
    for w in wordList:
        idf[word2id[w]] = idf_values[w]
    return idf

In [21]:
idf_values = calculateCorpusIDF(vocab,[item['words'] for item in data_records])

  0%|          | 0/21624 [00:00<?, ?it/s]

In [22]:
recordsWithTFIDF = copy.deepcopy(recordsWithTF)
for idx,r in enumerate(recordsWithTFIDF):
    r['IDF'] = get_idf_feat(r['words'],idf_values)
    recordsWithTFIDF[idx] = r

In [23]:
recordsWithTFIDF[42]

{'id': 43,
 'content': '有些事越想要越得不到，有些梦只能相信，是这样吗',
 'task-1': 'neg',
 'task-2': 'sad',
 'words': ['事越', '想要', '越', '得不到', '梦', '只能', '相信'],
 'TF_array': array([0., 0., 0., ..., 0., 0., 0.]),
 'IDF': array([0., 0., 0., ..., 0., 0., 0.])}

In [24]:
[item for item in recordsWithTFIDF[42]['IDF'].tolist() if item!=0]

[7.824046010856292,
 4.688551794927142,
 6.725433722188183,
 4.645992180508347,
 4.990832666800076,
 5.221356325411908,
 5.115995809754082]

In [25]:
def get_tf_idf_feat(wordList,idf_values,vocab):
    tf = get_tf_feat(wordList,vocab)
    idf = get_idf_feat(wordList,idf_values)
    tf_idf = tf * idf
    return tf_idf

In [26]:
recordsWithTFIDF = copy.deepcopy(recordsWithTFIDF)

for idx,r in enumerate(tqdm(recordsWithTFIDF)):
    r['TFIDF'] = get_tf_idf_feat(r['words'],idf_values,vocab)
    recordsWithTFIDF[idx] = r

  0%|          | 0/5000 [00:00<?, ?it/s]

In [27]:
recordsWithTFIDF[42]

{'id': 43,
 'content': '有些事越想要越得不到，有些梦只能相信，是这样吗',
 'task-1': 'neg',
 'task-2': 'sad',
 'words': ['事越', '想要', '越', '得不到', '梦', '只能', '相信'],
 'TF_array': array([0., 0., 0., ..., 0., 0., 0.]),
 'IDF': array([0., 0., 0., ..., 0., 0., 0.]),
 'TFIDF': array([0., 0., 0., ..., 0., 0., 0.])}

In [28]:
[item for item in recordsWithTFIDF[42]['TFIDF'].tolist() if item!=0]

[1.117720858693756,
 0.6697931135610203,
 0.9607762460268832,
 0.6637131686440495,
 0.7129760952571537,
 0.7459080464874154,
 0.7308565442505831]

In [29]:
def similarity(vec_a,vec_b):
    """
    Cosine Similarity Calculation

    :param vec_a: np array
    :param vec_b: np array
    :return: float
    """
    return np.linalg.multi_dot([vec_a,vec_b]) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

In [36]:
tfidfs = np.array([item['TFIDF'] for item in recordsWithTFIDF])

In [31]:
sim = np.zeros((len(tfidfs),len(tfidfs)))
for x in tqdm(range(len(tfidfs))):
    for y in range(len(tfidfs)):
        sim[x,y] = similarity(tfidfs[x],tfidfs[y])

  0%|          | 0/5000 [00:00<?, ?it/s]


KeyboardInterrupt



用pytorch加速这个计算过程 可以了解一下爱因斯坦求和公式

In [51]:
import torch
torch.nn.functional.normalize(torch.tensor(tfidfs),dim=-1)

torch.Size([5000, 21624])

In [53]:
sim = torch.einsum('xv,yv->xy',torch.nn.functional.normalize(torch.tensor(tfidfs),dim=-1),torch.nn.functional.normalize(torch.tensor(tfidfs),dim=-1))

tensor([1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
       dtype=torch.float64)

In [69]:
inspect_id = 0
k = 50
indices = torch.topk(sim[inspect_id],k=k).indices.tolist()

for idx in indices:
    print(recordsWithTFIDF[idx]['content'])
print(torch.topk(sim[inspect_id],k=k))

宿舍要民汉合宿了为毛都大三了还要折腾我
这样参加个毛招聘会啊拍个毛毕业照啊
今天要崩溃死我不可，为毛就这么倒霉！为毛办个事就这么不容易！KAO!MD!
才睡半小时就被折腾醒宿舍暖气太给力盖被子吧热的不行不盖吧蚊子老咬人咋办
我大三食放了剩下的盛夏！清早心情好。
每次院校抽检别的宿舍是抽检我们宿舍就变成每月例行检查即使我们门口装饰太美！宿舍再整齐！我的贴纸你们再喜欢！！！能不能别每次都来每次都要在我那里拍照
休息一天比上班还累，这一天可折腾死宝宝了
我靠！拆空调的人居然可以直接进宿舍！营业厅的人接网线还得等我们进宿舍！丢东西了谁负责哦！
折腾到现在可算是到了，饿坏了，附近既然既然没什么餐厅
上海家化玩的是什么鬼？作为40块的股票，一天波动1毛4
写教案，做PPT，讲课，上党校，考研，谁TM再告诉我大三好玩
哔了狗了！都快大三下学期了来个指纹打卡！那么多天没课在学校吃屎嘛！
第二招：拉一小点，打火机烧一下，有毛发烧胡的味道。
“为什么心情不好的时候总会想吃东西”“因为伤心欲嚼”为毛我是开心的时候变为胃plus
甘叼毛，天氣，好熱，搞我又冷又熱，好頭痛啊
整天生一些有七没八的破事折腾人，天怒人怨遭雷劈！！！
逼事真多，什么狗玩意，老子要回宿舍??????
今天去医院建卡，从七点半折腾到十点多，抽了七管血
我折腾一中午没睡着那几只睡得现在都还没醒说好的三点去图书馆呢
我的脚丫好酸啊这个点回来宿舍阿姨竟然锁门
再也没有办法像大二大三那样愉快的玩耍了每天事特多总感觉天天都好忙想着还有好多任务没完成心更累
男朋友宿舍里养了只狗然后妈的下大雨了我现在很烦想死全湿了
淋了一晚上雨还穿着高跟鞋来回宿舍和水房两趟…
脑残的点了一个不改点的确定，结果折腾到现在～下手需三思，期待恢复正常使用，快见光明～
今晚通关了逃生。人心最可怕。一个人在宿舍。有点不敢睡。以后不能一个人玩恐怖游戏。晚安。
本人虽然长的不俊，但走到那都招人喜欢，燕过留声人过留名，从哪里离开别人都没有说我不好的。不知为毛偏偏不得现在主管待见，一天不找个事不得劲，还天天下班拖我到现在
集体宿舍真的很不好，没人会考虑你的感受，比如在个人卫生问题上，比如在你睡觉时，别人总是在那放各种电视剧，音乐，制造各种噪音
本来一个人坐着吃饭挺好的，非进来个情侣，来一对还不行啊还进来两对，唉，回了宿舍还是被虐，现在不想看到任何的秀恩爱
宿舍的姑娘