In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative directory holding the mag_papers files. File names are appended to it
# by plain string concatenation, so the trailing '/' is required.
data_root = 'feature_engineering/dataset/mag_papers/'

In [5]:
# Read the first shard as JSON Lines (one record per line) and preview it.
source_path = data_root + 'mag_papers_0.txt'
all_data = pd.read_json(source_path, lines=True)
all_data.head()

Unnamed: 0,id,title,authors,venue,year,n_citation,page_start,page_end,doc_type,publisher,volume,issue,doi
0,100000002,Electron Spin Resonance Investigations of Oxyg...,"[{'name': 'Ronald P. Mason', 'id': '2105522006...","{'raw': 'Basic life sciences', 'id': '27556866...",1988.0,7.0,21.0,27.0,Journal,"Springer, Boston, MA",49.0,,10.1007/978-1-4684-5568-7_3
1,1000000047,建筑物地基沉降的灰色模型GM（1，1）预测法,"[{'name': '侯晓亮', 'id': '2400277081'}]",{'raw': '安徽建筑'},2006.0,0.0,143.0,143.0,,,13.0,6.0,
2,1000000056,“民情日记”消除干群“空心层”,"[{'name': '张冬梅', 'id': '2405201566'}]",{'raw': '兵团工运'},2010.0,0.0,46.0,46.0,,,,5.0,
3,1000000068,Telephone interface controller for unattended ...,"[{'name': 'Edward D. Smith', 'id': '2103953395'}]",,1981.0,4.0,,,Patent,,,,
4,1000000079,2—羟基—2—甲基—1—苯基丙酮的合成,"[{'name': '胡应喜', 'id': '2659645928'}, {'name':...",{'raw': '化学世界'},2001.0,0.0,203.0,205.0,,,42.0,4.0,


In [6]:
# (rows, columns) of the loaded frame.
all_data.shape

(21406986, 13)

In [9]:
# De-duplicate on title, keeping the first occurrence of each title.
# NOTE(review): `sub_data` is not defined in any cell visible here (execution
# counts jump from 6 to 9) — presumably a sample of `all_data`; restore the cell
# that creates it so the notebook survives Restart & Run All.
model_df = sub_data.drop_duplicates(subset='title',keep='first')
model_df.shape

(20000, 13)

In [None]:
# Keep only the English-language papers, then drop the columns that are not
# used as features. `drop` is a method and must be called with parentheses;
# subscripting it with a keyword argument is a syntax error.
# NOTE(review): this cell reassigns `model_df` from itself, so it cannot be
# re-run without re-running the de-duplication cell first ('lang' is dropped).
unused_columns = [
    'doc_type', 'doi', 'id', 'issue', 'lang',
    'n_citation', 'page_end', 'page_start',
    'publisher', 'references', 'url', 'venue', 'volume',
]
model_df = model_df[model_df['lang'] == 'en'].drop(columns=unused_columns)

In [None]:
# (rows, columns) remaining after the language filter and column drop.
model_df.shape

In [None]:
# Preview the first entries of the `fos` column.
model_df['fos'].head()

In [None]:
# Number of papers whose `fos` value is missing.
# NaN never compares equal to anything (not even itself), so an `== np.nan`
# filter always selects zero rows; isna() is the correct missing-value test.
int(model_df['fos'].isna().sum())

In [None]:
# Summary statistics of the publication year.
model_df['year'].describe()

In [None]:
# Collect every distinct field-of-study label across all papers, sorted.
# Missing `fos` entries are filled with the string '0' first, so iterating such
# an entry contributes the single element '0' to the vocabulary.
fos_vocabulary = set()
for paper_fos in model_df['fos'].fillna('0'):
    fos_vocabulary.update(paper_fos)

unique_fos = sorted(list(fos_vocabulary))

In [None]:
# Display the full sorted list of distinct fos labels (output can be long).
unique_fos

In [None]:
# Distinct publication years rendered as strings, in lexicographic order.
year_as_text = model_df['year'].astype('str')
unique_year = sorted(year_as_text.unique())

In [None]:
def feature_array(x,unique_array):
    """Build a 0/1 indicator table of `unique_array` values for each entry of `x`.

    Parameters
    ----------
    x : pandas.Series
        One entry per paper. A list entry is treated as multi-valued: a
        candidate is marked 1 when it is contained in the list. Any other entry
        is compared through its ``str()`` form (the publication-year case).
    unique_array : sequence of str
        Candidate values; each one becomes a column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per label of ``x.index`` and one column per candidate.

    Notes
    -----
    The frame is still built with ``dtype='str'`` so the result matches what
    existing callers receive.
    NOTE(review): on current pandas this presumably stores the indicators as
    text ('0'/'1') — confirm that downstream numeric code copes with that.
    """
    row_dict = {}
    # The per-row value, its type test and its string form do not depend on the
    # candidate, so they are resolved once per row instead of once per cell.
    for label, value in x.items():
        if isinstance(value, list):
            # Multi-valued entry: 1 for every candidate present in the list.
            row_dict[label] = {
                candidate: 1 if candidate in value else 0
                for candidate in unique_array
            }
        else:
            # Single-valued entry: 1 only for the candidate equal to str(value).
            value_text = str(value)
            row_dict[label] = {
                candidate: 1 if candidate == value_text else 0
                for candidate in unique_array
            }
    # from_dict makes one column per row label; transpose to one row per label.
    feature_df = pd.DataFrame.from_dict(row_dict,dtype='str').T
    return feature_df

In [None]:
# Build the one-hot table for field of study and the one for publication year.

fos_feature = feature_array(model_df['fos'], unique_fos)

# Both tables come from feature_array(); `fos_feature` is a DataFrame and
# cannot be called.
year_feature = feature_array(model_df['year'], unique_year)

In [None]:
# First-round feature matrix: the fos indicators joined with the year indicators
# on the paper index, then transposed so that each column is one paper.
joined_features = fos_feature.join(year_feature)
first_feature = joined_features.transpose()

In [None]:
from sys import getsizeof

# Size of the first-round feature frame as reported by sys.getsizeof.
first_feature_bytes = getsizeof(first_feature)
print('Size of first feature array: ', first_feature_bytes)

In [None]:
# 协同过滤 查找相似值

from scipy.spatial.distance import cosine

def item_collab_filter(feature_df):
    """Pairwise cosine similarity between the columns of `feature_df`.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        One column per item; the cells must be convertible to float.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by ``feature_df.columns``;
        cell (i, j) is ``1 - cosine(column_i, column_j)``.
    """
    labels = feature_df.columns
    # One float vector per column; the conversion also accepts numeric text.
    vectors = feature_df.to_numpy(dtype=float).T
    n_items = len(labels)

    # Scores go into a plain array: chained assignment such as
    # frame.loc[i][j] = value may write to a temporary copy and be lost.
    scores = np.empty((n_items, n_items), dtype=float)
    for a in range(n_items):
        # Cosine similarity is symmetric, so compute each pair once and mirror.
        for b in range(a, n_items):
            score = 1 - cosine(vectors[a], vectors[b])
            scores[a, b] = score
            scores[b, a] = score
    return pd.DataFrame(scores, index=labels, columns=labels)

In [None]:
# Similarity matrix for a subset of papers only.
# NOTE(review): `.loc[:, 0:1000]` selects columns by label, inclusive of both
# ends, not the first 1000 columns by position — confirm that is the intent.
first_item = item_collab_filter(first_feature.loc[:,0:1000])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Heatmap of the pairwise similarities; missing scores are drawn as 0 and every
# 250th tick label is shown on each axis.
sns.set()
similarity_grid = first_item.fillna(0)
ax = sns.heatmap(
    similarity_grid,
    cmap='YlGnBu',
    vmin=0,
    vmax=1,
    xticklabels=250,
    yticklabels=250,
)
ax.tick_params(labelsize=12)

In [None]:
# 基于item_df的topn进行推荐
def paper_recommender(paper_ix, item_df):
    """Print the three papers most similar to `paper_ix`.

    Parameters
    ----------
    paper_ix : label
        Index label of the query paper; it must be a label of both `model_df`
        and `item_df`.
    item_df : pandas.DataFrame
        Square similarity frame whose rows and columns are paper labels.

    Notes
    -----
    Reads the notebook-level `model_df`. A second function with the same name
    but a different argument order is defined later in this notebook and
    replaces this one once its cell has run.
    """
    print('Based on the paper: \n index = ', paper_ix)
    # `item_df` is addressed by label, so `model_df` is looked up by label too;
    # positional lookup would show a different paper once rows were filtered.
    print(model_df.loc[paper_ix])

    # Remove the query paper explicitly instead of assuming it ranks first.
    scores = item_df.loc[paper_ix].drop(labels=paper_ix, errors='ignore')
    top_results = scores.sort_values(ascending=False).head(3)

    print('\n Top three results: ')
    for order, (i, score) in enumerate(top_results.items(), start=1):
        print(order,'. Paper index = ', i)
        print('Similarity Score: ',score)
        print(model_df.loc[i],'\n')

In [None]:
# Recommendations for paper index 2 using the first-round similarity matrix.
paper_recommender(2, first_item)

In [None]:
# 第一种方法整体而言是采用两列特征 用极为稀疏的近似one-hot矩阵 用cos计算相似度进行推荐

# 总体而言 第一个方法太慢了 找出的结果也不是很理想
# 需要更智能 迭代化的特征工程方法

In [None]:
# ---------------------------------------------------------------------------------------

In [None]:
# Publication year: value range and quartiles.
year_column = model_df['year']
print('Year spread: ', year_column.min()," - ",year_column.max())
print('Quantile spread: ', year_column.quantile([0.25,0.5,0.75]))

In [None]:
# Histogram of publication years with one bin per year of the observed span.
# `year` is loaded as float (the preview shows values such as 1988.0), so
# max - min is a float, while the histogram needs an integer bin count.
# Convert explicitly and keep at least one bin for a zero-width span.
year_span = int(model_df['year'].max() - model_df['year'].min())
fig, ax = plt.subplots()
model_df['year'].hist(ax=ax, bins=max(year_span, 1))
ax.tick_params(labelsize=12)
ax.set_xlabel('Year Count', fontsize=12)
ax.set_ylabel('Occurrence', fontsize=12)

In [None]:
# Number of decade-wide bins covering the observed year range.
year_range = model_df['year'].max() - model_df['year'].min()
bins = int(round(year_range / 10))
bins

In [None]:
# Cut the years into `bins` equal-width intervals, then one-hot encode the
# interval each paper falls into; finally show the interval categories.
year_intervals = pd.cut(model_df['year'].tolist(), bins)
temp_df = pd.DataFrame(index=model_df.index)
temp_df['yearBinned'] = year_intervals
X_yrs = pd.get_dummies(temp_df['yearBinned'])
X_yrs.columns.categories

In [None]:
# Bar chart of the number of papers in each year bin.
fig, ax = plt.subplots()
papers_per_bin = X_yrs.sum()
papers_per_bin.plot.bar(ax=ax)
ax.tick_params(labelsize=8)
ax.set_xlabel('Binned Years', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

In [None]:
# 处理fos特征

In [None]:
# Underlying ndarray of the fos indicator frame, and the size sys.getsizeof
# reports for each of the two containers.
X_fos = fos_feature.values

pandas_bytes = getsizeof(fos_feature)
numpy_bytes = getsizeof(X_fos)
print('Pandas Series Size: ', pandas_bytes)
print('Numpy Array Size: ', numpy_bytes)

In [None]:
# Second-round feature matrix: the fos indicators followed column-wise by the
# binned-year indicators.
second_feature = np.concatenate((X_fos, X_yrs), axis=1)
second_feature.shape

In [None]:
# 继续用余弦相似度进行度量

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def piped_collab_filter(feature_matrix, index, top_n):
    """Return the `top_n` rows of `feature_matrix` most similar to row `index`.

    Parameters
    ----------
    feature_matrix : 2-D array-like
        One row per item.
    index : int
        Row position of the query item.
    top_n : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(row_position, cosine_similarity)`` pairs, most similar first, with
        the query row itself excluded.
    """
    # cosine_similarity already returns a similarity (larger = more alike).
    # Subtracting it from 1 would turn it into a distance, and the descending
    # sort below would then surface the least similar items.
    item_similarities = cosine_similarity(
        feature_matrix[index:index+1], feature_matrix
    ).flatten()

    # Row positions ordered from most to least similar, without the query row.
    related_indices = [i for i in item_similarities.argsort()[::-1] if i != index]
    return [(i, item_similarities[i]) for i in related_indices[:top_n]]

In [None]:
def paper_recommender(item_df, paper_ix, top_n):
    """Print the `top_n` papers most similar to the paper labelled `paper_ix`.

    Parameters
    ----------
    item_df : 2-D array-like
        Feature matrix, one row per paper, passed to `piped_collab_filter`.
        Assumes its rows are in the same order as `model_df` — TODO confirm.
    paper_ix : label
        Index label of the query paper in `model_df`.
    top_n : int
        Number of recommendations to print.

    Notes
    -----
    Reads the notebook-level `model_df` and calls `piped_collab_filter`.
    Prints a short notice and returns when `paper_ix` is not a label of
    `model_df`.
    """
    if paper_ix not in model_df.index:
        print('Paper index ', paper_ix, ' not found in model_df.')
        return

    print('Based on the paper: ')
    # The label itself is the paper index; no 'name' column appears in the data.
    print('Paper Index = ', paper_ix)
    print('Paper Title = ', model_df.loc[paper_ix, 'title'])

    # get_loc is a method: it maps the label to the row position that the
    # feature matrix is addressed by.
    array_ix = model_df.index.get_loc(paper_ix)
    top_results = piped_collab_filter(item_df, array_ix, top_n)
    print('\n Top ', top_n, ' results:')

    # top_results is a list of (row_position, score) tuples.
    for order, (row_pos, score) in enumerate(top_results, start=1):
        print(order,'. Paper index = ', model_df.index[row_pos])
        print('Similarity Score: ',score)
        print(model_df.iloc[row_pos],'\n')

In [None]:
# 目前改进一次之后 虽然可能有所提升 但是也没有多好 此时该怎么办
# 1. 使用原始数据中的更多数据 看看能否得到更好的结果
# 2. 花费更多时间 探索数据 看能否找到一个足够密集的结合来提供更好的推荐
# 3. 添加更多特征 继续迭代当前模型

# 第一种是有可能的 但是过于类似于大海捞针了
# 第二种可以更好地理解原始数据 应该在数据探索过程中 不断进行调整
# 第三种添加更多的特征 得到更好的效果十分合理

In [None]:
# ---------------------------------------------------------------------------------------

In [None]:
# 第三轮 考虑论文摘要和作者姓名

In [None]:
# Copy of model_df in which every missing value is replaced by the literal
# string 'None'.
filled_df = model_df.fillna('None')

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Abstract features: convert the English abstract text to TF-IDF weights and
# append them, as a dense array, to the second-round features.

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True)
X_abstract = vectorizer.fit_transform(filled_df['abstract'])
abstract_dense = X_abstract.toarray()
third_features = np.concatenate((second_feature, abstract_dense), axis=1)

In [None]:
# Author features: build one dict per paper, to be one-hot encoded with
# DictVectorizer afterwards.
#
# - A Series has no itertuples(); iterating a one-column frame yields rows that
#   expose both `row.Index` and `row.authors`.
# - Every key maps to 1. DictVectorizer keeps numeric values as the feature
#   value, so storing the row index there would give a row-dependent magnitude
#   (and 0 for row 0) instead of a presence indicator.
# - An empty author list leaves the dict empty instead of raising IndexError.

authors_list = []
for row in filled_df[['authors']].itertuples():
    y = {}
    if type(row.authors) is str:
        # Missing authors were filled with the string 'None'.
        y = {'None': 1}
    if type(row.authors) is list and row.authors:
        # Keys are the values of the first author's record (e.g. name and id).
        # NOTE(review): only the first author is used — confirm that is intended.
        y = dict.fromkeys(row.authors[0].values(), 1)
    authors_list.append(y)

In [None]:
from sklearn.feature_extraction import DictVectorizer
# Encode the per-paper author dicts as a dense array and append it column-wise
# to the third-round features.
v = DictVectorizer(sparse=False)
D = authors_list
X_authors = v.fit_transform(D)
fourth_feature = np.concatenate((third_features, X_authors), axis=1)

# 使用DictVectorizer转换 作为第四特征

In [None]:
# Top-3 recommendations for paper index 2 using the fourth-round feature matrix.
paper_recommender(fourth_feature, 2, 3)