### 使用Faker批量生成测试数据

#### 1.批量生成用户信息

In [4]:
from faker import Faker
import numpy as np
import pandas as pd

In [5]:
fk = Faker("zh_CN")
print(fk.simple_profile(sex=None))

{'username': 'pingliu', 'name': '张雪', 'sex': 'M', 'address': '江苏省玉珍县朝阳吕街w座 479646', 'mail': 'szhang@gmail.com', 'birthdate': datetime.date(1916, 4, 7)}


In [6]:
cate_dict = {
    "1": "Computer vision and Pattern recognition",
    "2": "Machine Learning",
    "3": "Robotics",
    "4": "Image and Video processing",
    "5": "Artificial Intelligence",
    "6": "Others"
}
a = np.random.randint(1, 7, np.random.randint(1, 7)).tolist()
a = [cate_dict[str(item)] for item in a if a.count(item) == 1]
a

['Computer vision and Pattern recognition']

In [7]:
fk = Faker("zh_CN")

def Generate_ID(num):
    '''
    生成用户ID信息
    '''
    BaseHead = "65010000001"
    IDlist = []

    for i in range(num):
        selfAdd = 1001
        selfAdd = 1001 + i
        result = BaseHead + str(selfAdd)
        IDlist.append(result)
    return IDlist

def Generate_other_data(IDlist, num):
    '''
    1. 以IDlist中的数据作为第一列数据
    2. Faker随机生成其他数据
    3. 两者最终合并存入一一对应的列表中
    '''
    otherDataList = []
    for i in range(num):
        result = fk.simple_profile(sex=None)
        tmp = np.random.randint(1, 7, np.random.randint(1, 7)).tolist()
        interest_area = [cate_dict[str(item)] for item in tmp if tmp.count(item) == 1]
        otherDataList.append([IDlist[i], result['name'], result['username'], result['sex'], 
                              result['mail'], result['address'], result['birthdate'], interest_area])
    return otherDataList

def data_to_csv(otherDataList):
    outputfile = 'data/userData.csv'
    columns = ['user_id', 'name', 'username', 'sex', 'mail', 'address', 'birthdate', 'interest_area']
    Data = []
    Data.append(columns)
    Data.extend(otherDataList)
    df = pd.DataFrame(Data)
    df.to_csv(outputfile, encoding='utf-8', index=False, header=0)

In [8]:
user_id_list = Generate_ID(1000)
Data = Generate_other_data(user_id_list, 1000)
data_to_csv(Data)

In [9]:
user_data = pd.read_csv('./data/userData.csv')
user_data.head()

Unnamed: 0,user_id,name,username,sex,mail,address,birthdate,interest_area
0,650100000011001,陈玲,ping69,F,chao78@gmail.com,河北省荆门市上街方街X座 273789,2010-05-27,"['Artificial Intelligence', 'Machine Learning'..."
1,650100000011002,邢雪梅,minggong,M,yanfu@yahoo.com,湖北省东莞县淄川崔路n座 996454,1944-09-27,[]
2,650100000011003,王淑珍,ptian,F,nfeng@gmail.com,福建省慧县南长西安路i座 627612,1917-12-31,"['Computer vision and Pattern recognition', 'A..."
3,650100000011004,古冬梅,gang62,F,li56@hotmail.com,澳门特别行政区兵市璧山徐路S座 874431,2000-05-15,"['Computer vision and Pattern recognition', 'M..."
4,650100000011005,郑志强,ming83,M,oguo@gmail.com,山西省阜新县东丽蔡路q座 976862,1917-11-07,['Image and Video processing']


#### 2.导入论文信息并处理

In [10]:
# 解决数据输出时列名不对齐的问题
pd.set_option('display.unicode.east_asian_width', True)
df = pd.read_excel('data/papers1.xlsx').loc[:, ["id", "title", "paperAbstract", "subject"]]     # 读取excel文件
df.head()

  df = pd.read_excel('data/papers1.xlsx').loc[:, ["id", "title", "paperAbstract", "subject"]]     # 读取excel文件


Unnamed: 0,id,title,paperAbstract,subject
0,6593edae-c1e9-4168-872d-ef3ebcabd270,Dual Cross-Attention Learning for Fine-Grained...,"Recently, self-attention mechanisms have shown...",Computer vision and Pattern recognition
1,8748010d-548b-442c-ae40-bfc3b2e3e858,SimAN: Exploring Self-Supervised Representatio...,Recently self-supervised representation learni...,Computer vision and Pattern recognition
2,68c47039-f44c-404b-a936-2a3a6f7c1a8c,Weakly Supervised Semantic Segmentation by Pix...,Though image-level weakly supervised semantic ...,Computer vision and Pattern recognition
3,acc17ed9-1f72-43b5-8938-78c694966ce1,Controllable Animation of Fluid Elements in St...,We propose a method to interactively control t...,Computer vision and Pattern recognition
4,cce8e8d7-9850-4acc-b7aa-40e13fe31a5b,Recurrent Dynamic Embedding for Video Object S...,Space-time memory (STM) based video object seg...,Computer vision and Pattern recognition


In [11]:
import re
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

def get_key_words(words_str):
    """
    提取TOPN关键词
    :param words_str: 表示需要提取的文本，是论文的摘要
    :param subject: 论文的主题
    :return:
    """
    # 字符串进行清洗
    words_str.replace('\n', '').replace('\u3000', '').replace('\u00A0', '')
    # 定义标点符号列表
    words_str = re.sub("[0-9.:;,?&$@!()%#^*]", "", words_str)
    # 定义停用词
    stops = set(stopwords.words("english"))

    # 切词
    words_list = nltk.word_tokenize(words_str)
    # 去除停用词
    words_list = [word for word in words_list if word not in stops and word.lower() not in stops]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([" ".join(words_list)])
    data = {"key_words": vectorizer.get_feature_names(),
            "tfidf": X.toarray().sum(axis=0).tolist()}
    df = pd.DataFrame(data).sort_values(by="tfidf", ascending=False)
    return np.array(df.iloc[0:10, 0]).tolist()

In [12]:
dataList = [["paper_id", "key_words", "subject"]]
# 获取所有的paper_id
paper_id_list = []
for row in df.itertuples():
    tmp = []
    tmp.append(getattr(row, 'id'))
    paper_id_list.append(getattr(row, 'id'))
    tmp.append(get_key_words(getattr(row, 'paperAbstract')))
    tmp.append(getattr(row, 'subject'))
    dataList.append(tmp)
df = pd.DataFrame(dataList)
outputfile = './data/papers.csv'
df.to_csv(outputfile, encoding='utf-8', index=False, header=0)

In [16]:
paper_data = pd.read_csv('./data/papers.csv')
paper_data.set_index("paper_id", inplace=True)
paper_data.head()

Unnamed: 0_level_0,key_words,subject
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6593edae-c1e9-4168-872d-ef3ebcabd270,"['attention', 'self', 'cross', 'image', 'globa...",Computer vision and Pattern recognition
8748010d-548b-442c-ae40-bfc3b2e3e858,"['representation', 'patch', 'learning', 'image...",Computer vision and Pattern recognition
68c47039-f44c-404b-a936-2a3a6f7c1a8c,"['method', 'segmentation', 'wsss', 'class', 'l...",Computer vision and Pattern recognition
acc17ed9-1f72-43b5-8938-78c694966ce1,"['map', 'flow', 'optical', 'elements', 'fluid'...",Computer vision and Pattern recognition
cce8e8d7-9850-4acc-b7aa-40e13fe31a5b,"['memory', 'bank', 'information', 'sam', 'inac...",Computer vision and Pattern recognition


#### 3.批量生成用户交互信息

In [14]:
def Generate_behavior_data(num):
    '''
    批量生成用户交互信息
    '''
    for i in range(num):
        user_id = user_id_list[np.random.randint(0, len(user_id_list))]
        paper_id = paper_id_list[np.random.randint(0, len(paper_id_list))]
        data_time = fk.date_time_between(start_date="-30d", end_date="now")
        

#### 4.Doc2Vec演示

In [17]:
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

model_path = "model/doc2vec.model"
documents = [TaggedDocument(key_words, [paper_id]) for paper_id, key_words in paper_data.items()]

if os.path.exists(model_path) and os.path.isfile(model_path):
    # load model
    model = Doc2Vec.load(model_path)
else:
    model = Doc2Vec(documents, vector_size=100, window=3, min_count=1, workers=4, epochs=20)

tte = model.corpus_count + len(documents)
model.train(documents, total_examples=tte, epochs=20)
model.save(model_path)

In [18]:
user_vector = [['attention', 'person', 'segmentation', 'method', 'image', 'interactions', 'pwca']]
inferred_vector = model.infer_vector(user_vector)
sims = model.docvecs.most_similar([inferred_vector])
sims

  sims = model.docvecs.most_similar([inferred_vector])


[('6593edae-c1e9-4168-872d-ef3ebcabd270', 0.998130738735199),
 ('bcbd1591-2b8f-4cf7-852d-eb8a5d333e4f', 0.9981265068054199),
 ('404835be-38ff-43bd-9528-09004355a57a', 0.998114824295044),
 ('2acb346e-0d2f-4925-b307-ec45c0a69404', 0.9980873465538025),
 ('6dbc0ceb-e73d-4155-b679-df7715bcaec2', 0.998063862323761),
 ('8b36ec47-3c67-4571-bf5b-ec39c99684b7', 0.9980615973472595),
 ('4f7004cb-6397-447f-83f2-42625a9af8c0', 0.9980065822601318),
 ('42c14f25-000d-46e7-aaa0-14ad5436fb2b', 0.9979945421218872),
 ('d760b9b3-4d8d-47e2-9c71-43b01bc658e2', 0.9979574680328369),
 ('8def205e-689c-4a70-b2a5-c62611a3b9d2', 0.9979023933410645)]