In [1]:
import pickle
import pandas as pd
import scipy.stats

In [9]:
pd.options.display.max_columns = 163

In [142]:
d = pd.read_pickle('/data/team2/processed_pubmed_data_query_meta.pkl')

In [148]:
kd_df = pd.read_csv('/data/team2/stats_KD.csv') # baseline results for the queries, pulled from the PubMed API

In [131]:
data = d.copy()

# create flags for whether the PMIDs were on the first page of search results,
# for whether users clicked on them when they appeared in reverse chronological search,
# and for whether users clicked on them when they appeard in best match
data['TopTenFlag'] = data['rank'] <= 10
data['RelevanceClicked'] = data['clicks_relevance_sort'] >= 1
data['DateClicked'] = data['clicks_date_sort'] >= 1

# cast these fields to string so that drop_duplicates works
data['title'] = data['title'].astype(str)
data['journal'] = data['journal'].astype(str)
data['abstract'] = data['abstract'].astype(str)
data['pubtype'] = data['pubtype'].astype(str)

data['hasAbstract'] = data['hasAbstract'].map({True:1, False:0})
data['hasStructuredAbstract'] = data['hasStructuredAbstract'].map({True:1, False:0})
data_d = data.drop_duplicates()

# keep only fields relevant for initial analysis
data_re = data_d[['query', 'hasAbstract', 'hasStructuredAbstract', 'fulltext_status', 'sort', 'TopTenFlag', 'RelevanceClicked', 'DateClicked']]

In [132]:
# DEPRECATED: convert fulltext_status column to binary
#data_re.loc[data_re.fulltext_status=='fulltext_available','fulltext_status'] = 1
#data_re.loc[data_re.fulltext_status=='fulltext_unavailable','fulltext_status'] = 0


In [133]:
# Create subsets for top ten results for both best match and date descending sort searches
data_tt = data_re[data_re['TopTenFlag']==True]
data_tt_rel = data_tt[data_tt['sort']=='relevance_res']
data_tt_date = data_tt[data_tt['sort']=='date_desc_res']

In [134]:
# Create subsets for clicked results for both best match and date descending sort searches
data_rel_clicked = data_re[data_re['RelevanceClicked']==True]
data_date_clicked = data_re[data_re['DateClicked']==True]

In [155]:
# Data in the top 10 query results for BestMatch, test chi square function
tt_rel_res = data_tt_rel[['query', 'hasAbstract', 'hasStructuredAbstract', 'fulltext_status']].groupby(['query']).agg({'hasAbstract':'sum','hasStructuredAbstract':'sum', 'fulltext_status':sum}).reset_index()
scipy.stats.chisquare((tt_rel_res['fulltext_status']/tt_rel_res['fulltext_status'].sum()).tolist(), f_exp=(kd_df['RELEVANCE_hasFulltext']/kd_df['RELEVANCE_hasFulltext'].sum()).tolist())

In [166]:
def final_df(base_df):
    '''
    Input
    :base_df: pandas dataframe, with the fields ['query', 'hasAbstract', 'hasStructuredAbstract', 'fulltext_status']
    
    Output
    Returns a dataframe with counts grouped by query
    '''
    return base_df[['query', 'hasAbstract', 'hasStructuredAbstract', 'fulltext_status']].groupby(['query']).agg({'hasAbstract':'sum','hasStructuredAbstract':'sum', 'fulltext_status':sum}).reset_index()

In [167]:
def results(baseline_df, other_df, baseline_col, other_col):
    '''
    Inputs
    :baseline_df: pandas dataframe, containing counts of the data from the baseline dataset
    :other_df: pandas dataframe, containg counts of the data from a sample dataset
    :baseline_col: str, the column name of the specific count-feature to compare in the baseline dataframe
    :other_col: str, the column name of the specific count-feature to compare in the baseline dataframe
    '''
    print(scipy.stats.chisquare(f_obs=(other_df[other_col]/other_df[other_col].sum()).tolist(), f_exp=(baseline_df[baseline_col]/baseline_df[baseline_col].sum()).tolist()))
    

In [182]:
# Testing

# Top 10 BM vs Baseline for Frequency
print('Top 10 BM vs Baseline for Full Text')
results(kd_df, final_df(data_tt_rel), 'RELEVANCE_hasFulltext', 'fulltext_status')

print('\nClicked BM vs Baseline for Full Text')
results(kd_df, final_df(data_rel_clicked), 'RELEVANCE_hasFulltext', 'fulltext_status')

print('\nTop 10 Date Sort vs Baseline for Full Text')
results(kd_df, final_df(data_tt_date), 'DATE_DESC_hasFulltext', 'fulltext_status')

# missing values for this
#print('\nClicked Date Sort vs Baseline for Full Text')
#results(kd_df, final_df(data_date_clicked), 'DATE_DESC_hasFulltext', 'fulltext_status')

print('\nTop 10 BM vs Baseline for Abstract')
results(kd_df, final_df(data_tt_rel), 'RELEVANCE_hasAbstract', 'hasAbstract')

print('\nClicked BM vs Baseline for Abstract')
results(kd_df, final_df(data_rel_clicked), 'RELEVANCE_hasAbstract', 'hasAbstract')

print('\nTop 10 Date Sort vs Baseline for Abstract')
results(kd_df, final_df(data_tt_date), 'DATE_DESC_hasAbstract', 'hasAbstract')



print('\nTop 10 BM vs Baseline for Structured Abstract')
results(kd_df, final_df(data_tt_rel), 'RELEVANCE_hasStructuredAbstract', 'hasStructuredAbstract')

print('\nClicked BM vs Baseline for Structured Abstract')
results(kd_df, final_df(data_rel_clicked), 'RELEVANCE_hasStructuredAbstract', 'hasStructuredAbstract')

print('\nTop 10 Date Sort vs Baseline for Structured Abstract')
results(kd_df, final_df(data_tt_date), 'DATE_DESC_hasStructuredAbstract', 'hasStructuredAbstract')



Top 10 BM vs Baseline for Full Text
Power_divergenceResult(statistic=1.3796995028357295, pvalue=0.999999986082397)

Clicked BM vs Baseline for Full Text
Power_divergenceResult(statistic=3.216704368121578, pvalue=0.9999809966745193)

Top 10 Date Sort vs Baseline for Full Text
Power_divergenceResult(statistic=nan, pvalue=nan)

Top 10 BM vs Baseline for Abstract
Power_divergenceResult(statistic=0.5457789245894598, pvalue=0.9999999999969771)

Clicked BM vs Baseline for Abstract
Power_divergenceResult(statistic=0.9832794774117473, pvalue=0.9999999993339146)

Top 10 Date Sort vs Baseline for Abstract
Power_divergenceResult(statistic=0.01926146968702393, pvalue=1.0)

Top 10 BM vs Baseline for Structured Abstract
Power_divergenceResult(statistic=1.8938651896223437, pvalue=0.9999997760587044)

Clicked BM vs Baseline for Structured Abstract
Power_divergenceResult(statistic=3.8913489712487968, pvalue=0.9999140930957853)

Top 10 Date Sort vs Baseline for Structured Abstract
Power_divergenceResult(

  terms = (f_obs_float - f_exp)**2 / f_exp


In [184]:
print('\nBaseline Date Sort vs Baseline Best Match Full Text')
results(kd_df, kd_df, 'RELEVANCE_hasFulltext', 'DATE_DESC_hasFulltext')

print('\nBaseline Date Sort vs Baseline Best Match Abstract')
results(kd_df, kd_df, 'RELEVANCE_hasAbstract', 'DATE_DESC_hasAbstract')

print('\nBaseline Date Sort vs Baseline Best Match Structured Abstract')
results(kd_df, kd_df, 'RELEVANCE_hasStructuredAbstract', 'DATE_DESC_hasStructuredAbstract')




Baseline Date Sort vs Baseline Best Match Full Text
Power_divergenceResult(statistic=0.8195970215326666, pvalue=0.999999999872844)

Baseline Date Sort vs Baseline Best Match Abstract
Power_divergenceResult(statistic=0.07561090919527931, pvalue=1.0)

Baseline Date Sort vs Baseline Best Match Structured Abstract
Power_divergenceResult(statistic=0.7790455549603466, pvalue=0.9999999999200313)
