In [1]:
import pandas as pd

model_df = pd.read_json('data/mag_papers/mag_papers_0.txt', lines=True)
# 20000件にデータを絞り込み
# TODO debugのために200に変更
model_df = model_df.head(200)

model_df.shape

(200, 19)

In [2]:
model_df = model_df[model_df.lang == 'en']\
    .drop_duplicates(subset = 'title', keep = 'first')\
    .drop(['doc_type', 'doi', 'id', 'issue', 'lang', 
                'n_citation', 'page_end', 'page_start', 'publisher', 
                'references', 'url', 'venue', 'volume'], axis=1)

model_df.shape

(108, 6)

In [3]:
unique_fos = sorted(list({ feature 
                          for paper_row in model_df.fos.fillna('0') 
                          for feature in paper_row }))

unique_year = sorted(model_df['year'].astype('str').unique())

# TODO アホみたいに遅い、高速化考える
def feature_array(x, unique_array):
    row_dict = {}
    for i in x.index:
        var_dict = {}
        
        for j in range(len(unique_array)):
            if type(x[i]) is list:
                if unique_array[j] in x[i]:
                    var_dict.update({unique_array[j]: 1})
                else:
                    var_dict.update({unique_array[j]: 0})
            else:    
                if unique_array[j] == str(x[i]):
                    var_dict.update({unique_array[j]: 1})
                else:
                    var_dict.update({unique_array[j]: 0})
        row_dict.update({i : var_dict})
    
    feature_df = pd.DataFrame.from_dict(row_dict, dtype='str').T
    return feature_df

year_features = feature_array(model_df['year'], unique_year)
fos_features = feature_array(model_df['fos'], unique_fos)
first_features = fos_features.join(year_features).T

from sys import getsizeof
print('Size of first feature array: ', getsizeof(first_features))

Size of first feature array:  2167045


In [6]:
first_features

Unnamed: 0,0,1,2,5,7,8,9,10,11,12,...,183,185,186,187,188,191,193,194,196,197
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Access control,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Accounting,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Actin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Actuarial science,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Advertising,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Agricultural science,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Agroforestry,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Agronomy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Ambient ionization,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
from scipy.spatial.distance import cosine

def item_collab_filter(features_df):
    item_similarities = pd.DataFrame(index = features_df.columns, columns = features_df.columns)
    
    for i in features_df.columns:
        for j in features_df.columns:
            item_similarities.loc[i][j] = 1 - cosine(features_df[i], features_df[j])
    
    return item_similarities

# TODO debug様に100に変更
#first_items = item_collab_filter(first_features.loc[:, 0:1000])
first_items = item_collab_filter(first_features.loc[:, 0:100])

TypeError: can't multiply sequence by non-int of type 'str'

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set()
ax = sns.heatmap(first_items.fillna(0), 
                 vmin=0, vmax=1, 
                 cmap="YlGnBu", 
                 xticklabels=250, yticklabels=250)
ax.tick_params(labelsize=12)

In [None]:
def paper_recommender(paper_ix, items_df):
    print('Based on the paper: \nindex = ', paper_ix)
    print(model_df.iloc[paper_ix])
    top_results = items_df.loc[paper_ix].sort_values(ascending=False).head(4)
    print('\nTop three results: ')
    order = 1
    for i in top_results.index.tolist()[-3:]:
        print(order,'. Paper index = ', i)
        print('Similarity score: ', top_results[i])
        print(model_df.iloc[i], '\n')
        if order < 5: order += 1
            
paper_recommender(2, first_items)

In [None]:
# TODO MIN MAX

print("Year spread: ", model_df['year'].min()," - ", model_df['year'].max())
print("Quantile spread:\n", model_df['year'].quantile([0.25, 0.5, 0.75]))

fig, ax = plt.subplots()
model_df['year'].hist(ax=ax, bins= model_df['year'].max() - model_df['year'].min())
ax.tick_params(labelsize=12)
ax.set_xlabel('Year Count', fontsize=12)
ax.set_ylabel('Occurrence', fontsize=12)

bins = int(round((model_df['year'].max() - model_df['year'].min()) / 10))

temp_df = pd.DataFrame(index = model_df.index)
temp_df['yearBinned'] = pd.cut(model_df['year'].tolist(), bins, precision = 0)

X_yrs = pd.get_dummies(temp_df['yearBinned'])
X_yrs.columns.categories

In [None]:
fig, ax = plt.subplots()
X_yrs.sum().plot.bar(ax = ax)
ax.tick_params(labelsize=8)
ax.set_xlabel('Binned Years', fontsize=12)
ax.set_ylabel('Counts', fontsize=12)

In [None]:

X_fos = fos_features.values

print('Our pandas Series, in bytes: ', getsizeof(fos_features))
print('Our hashed numpy array, in bytes: ', getsizeof(X_fos))

In [None]:
second_features = np.append(X_fos, X_yrs, axis = 1)

print("The power of feature engineering saves us, in bytes: ",
         getsizeof(first_features) - getsizeof(second_features))

from sklearn.metrics.pairwise import cosine_similarity

def piped_collab_filter(features_matrix, index, top_n):
    item_similarities = \
        1 - cosine_similarity(features_matrix[index:index+1],
                              features_matrix).flatten()
    related_indices = \
        [i for i in item_similarities.argsort()[::-1] if i != index]
    return [(index, item_similarities[index])
            for index in related_indices
    ][0:top_n]

In [None]:

def paper_recommender(items_df, paper_ix, top_n):
       if paper_ix in model_df.index:
           print('Based on the paper:')
           print('Paper index = ', model_df.loc[paper_ix].name)
           print('Title :', model_df.loc[paper_ix]['title'])
           print('FOS :', model_df.loc[paper_ix]['fos'])
           print('Year :', model_df.loc[paper_ix]['year'])
           print('Abstract :', model_df.loc[paper_ix]['abstract'])
           print('Authors :', model_df.loc[paper_ix]['authors'], '\n')

           array_ix = model_df.index.get_loc(paper_ix)
           top_results = piped_collab_filter(items_df, array_ix, top_n)
           print('\nTop',top_n,'results: ')

           order = 1
           for i in range(len(top_results)):
               print(order,'. Paper index = ',
                     model_df.iloc[top_results[i][0]].name)
               print('Similarity score: ', top_results[i][1])
               print('Title :', model_df.iloc[top_results[i][0]]['title'])
               print('FOS :', model_df.iloc[top_results[i][0]]['fos'])
               print('Year :', model_df.iloc[top_results[i][0]]['year'])
               print('Abstract :', model_df.iloc[top_results[i][0]]['abstract'])
               print('Authors :', model_df.iloc[top_results[i][0]]['authors'], '\n')
               if order < top_n: order += 1
                    
       else:
           print('Whoops! Choose another paper. Try something from here: \n',
                 model_df.index[100:200])

paper_recommender(second_features, 2, 3)

In [None]:
model_df.loc[21]

In [None]:
model_df.iloc[21]

In [None]:
model_df.index.get_loc(30)

In [None]:
filled_df = model_df.fillna('None')

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_abstract = vectorizer.fit_transform(filled_df['abstract'])

third_features = np.append(second_features, X_abstract.toarray(), axis = 1)

In [None]:
authors_list = []

for row in filled_df.authors.itertuples():
    if type(row.authors) is str:
        y = {'None': row.Index}
    if type(row.authors) is list:
        y = dict.fromkeys(row.authors[0].values(), row.Index)

    authors_list.append(y)

authors_list[0:5]

In [None]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = authors_list
X_authors = v.fit_transform(D)
fourth_features = np.append(third_features, X_authors, axis = 1)

In [None]:
paper_recommender(fourth_features, 2, 3)