In [37]:
import json
import re
import os
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm, trange
# Location of the per-school aggregate JSON produced by an earlier step.
school_total_path = Path(r'F:\0_Desktop\Citation_Analysis\result\total\school_total.json')

# Parse the JSON inside a context manager so the file handle is closed as soon
# as loading finishes instead of being left open for the kernel's lifetime.
with school_total_path.open() as school_total_file:
    school_total = json.load(school_total_file)


In [38]:
import plotly.io as pio

# Make 'plotly_white' the default template for every figure created afterwards.
pio.templates.default = 'plotly_white'

In [39]:
# Folder that holds the aggregated result tables.
Result_ROOT = Path(r'F:\0_Desktop\Citation_Analysis\result\total')

# Every table is read with its first CSV column used as the row index.
cite_per_year_df = pd.read_csv(Result_ROOT / 'cite_per_year_df.csv', index_col=0)
faculty_meta_info_df = pd.read_csv(Result_ROOT / 'faculty_meta_info_df.csv', index_col=0)
faculty_publication_df = pd.read_csv(Result_ROOT / 'faculty_publication_df.csv', index_col=0)


In [40]:

# Mean of `cite` for every (school, year) pair, flattened back to columns.
df = (
    cite_per_year_df
    .groupby(['school_name', 'year'])['cite']
    .mean()
    .reset_index()
)

# One line per school across the years.
fig = px.line(df, x='year', y='cite', color='school_name')
fig

In [41]:

# fig.show(renderer='browser')
# Export the current figure as a static PNG. The target folder is created
# first so the export also works on a fresh checkout where `images/` does not
# exist yet.
Path('images').mkdir(parents=True, exist_ok=True)
fig.write_image('images/distribution_per_year.png')

In [42]:
# Mean of log2(cite) for every (school, year) pair.
# Rows with a non-positive count are excluded before taking the logarithm:
# log2(0) is -inf, and a single -inf would turn the whole group mean into -inf.
positive_cites = cite_per_year_df[cite_per_year_df['cite'] > 0]
df_log = (
    positive_cites
    .groupby(['school_name', 'year'])['cite']
    .apply(lambda x: np.mean(np.log2(x)))
    .reset_index()
)

fig = px.line(df_log, x='year', y='cite', color='school_name')
# fig.show(renderer='browser')
fig.layout.yaxis.title.text = 'Average cite per year (log)'

# Make sure the output folder exists before the static export.
Path('images').mkdir(parents=True, exist_ok=True)
fig.write_image('images/distribution_per_year_log_scale.png')

In [43]:
# Schools sorted ascending by their median `cite` in 2019. Stored as a plain
# list, which is the value type plotly documents for `category_orders`; schools
# with no 2019 rows are absent here and are left to plotly to place.
school_name_order = list(
    cite_per_year_df
    .query('year == 2019')
    .groupby('school_name')['cite']
    .median()
    .sort_values()
    .index
)

# Box plot of `cite` per year from 2010 onwards, one box per school and year.
# The y axis is clipped to [0, 5000]; drop the .query(...) filter to plot all years.
fig = px.box(
    cite_per_year_df.query('year >= 2010'),
    x='year',
    y='cite',
    color='school_name',
    log_y=False,
    hover_data=['year', 'cite', 'faculty_name'],
    category_orders={'school_name': school_name_order},
    range_y=[0, 5000],
    width=3000,
    height=1000,
)

# Make sure the output folder exists before the static export.
Path('images').mkdir(parents=True, exist_ok=True)
fig.write_image('images/from 2010 boxplot.png', scale=3)
fig.show()

# increased

### Clean publication data

In [4]:
# Preview the first five rows of the publication table.
faculty_publication_df.head(n=5)

Unnamed: 0,school_name,faculty_name,index,publication_year,num_citations,title,source
0,Caltech,Ralph Adolphs,1,1994.0,3271,Impaired recognition of emotion in facial expr...,"Nature 372 (6507), 669-672, 1994"
1,Caltech,Ralph Adolphs,2,2002.0,2587,Neural systems for recognizing emotion,"Current opinion in neurobiology 12 (2), 169-17..."
2,Caltech,Ralph Adolphs,3,2003.0,2519,Cognitive neuroscience of human social behaviour,"Nature Reviews Neuroscience 4 (3), 165-178, 2003"
3,Caltech,Ralph Adolphs,4,2007.0,2166,Damage to the prefrontal cortex increases util...,"Nature 446 (7138), 908-911, 2007"
4,Caltech,Ralph Adolphs,5,2001.0,2145,The neurobiology of social cognition,"Current opinion in neurobiology 11 (2), 231-23..."


In [9]:
# Keep only the columns used below and drop rows with any missing value.
publication_columns = ['school_name', 'faculty_name', 'index', 'publication_year', 'num_citations']
faculty_publication_df = faculty_publication_df.loc[:, publication_columns].dropna()

# With the missing years gone, the year column can be stored as integers.
faculty_publication_df['publication_year'] = faculty_publication_df['publication_year'].astype(int)

# Keep publications from after 1980 (the year 1980 itself is excluded).
# NOTE(review): the earlier comment said "before 1980" - confirm whether 1980 should be kept.
faculty_publication_df = faculty_publication_df.query('publication_year > 1980')

In [15]:
# Citations per publication from 2010 onwards, one box per school and year.
# The y axis is clipped to [0, 300]; schools follow `school_name_order`.
recent_publications = faculty_publication_df.query('publication_year >= 2010')
fig = px.box(
    recent_publications,
    x='publication_year',
    y='num_citations',
    color='school_name',
    log_y=False,
    category_orders={'school_name': school_name_order},
    range_y=[0, 300],
    width=3000,
    height=1000,
)
fig

## Age distribution

In [46]:
# Year against which ages are measured (previously an inline literal).
REFERENCE_YEAR = 2022

# Earliest year present in cite_per_year_df for each faculty member.
df = (
    cite_per_year_df
    .groupby(['school_name', 'faculty_name'])['year']
    .min()
    .reset_index()
)
# "Publication age" = years elapsed between that first year and REFERENCE_YEAR.
df['pub_age'] = REFERENCE_YEAR - df['year']

# Schools sorted ascending by median age, as a plain list (the value type
# plotly documents for `category_orders`).
school_order_by_age = list(
    df.groupby('school_name')['pub_age'].median().sort_values().index
)
fig = px.box(
    data_frame=df,
    x='school_name',
    y='pub_age',
    category_orders={'school_name': school_order_by_age},
    title='Publication age distribution',
)

# Make sure the output folder exists before the static export.
Path('images').mkdir(parents=True, exist_ok=True)
fig.write_image('images/Publication age distribution.png', scale=3)
fig

In [113]:
# Bucket faculty into four quantile-based groups of `pub_age`;
# Q1 holds the smallest ages and Q4 the largest.
life_stage = pd.qcut(df['pub_age'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
df['life_stage'] = life_stage
df

Unnamed: 0,school_name,faculty_name,year,pub_age,life_stage
0,Caltech,Akiko Kumagai,1990,32,Q4
1,Caltech,Alexei Aravin,2004,18,Q3
2,Caltech,Angelike Stathopoulos,1998,24,Q4
3,Caltech,Carlos Lois,1995,27,Q4
4,Caltech,Changhuei Yang,2003,19,Q3
...,...,...,...,...,...
439,Westlake,Zhubing SHI,2013,9,Q1
440,Westlake,Zhuchao Ji,2019,3,Q1
441,Westlake,Zibo Chen,2016,6,Q1
442,Westlake,Zixu Liu,2010,12,Q2


In [80]:
# IPython help syntax (trailing `?`): displays the documentation of `life_stage.map`.
# Interactive exploration only; nothing computed here is used by later cells,
# and the line is not valid outside IPython/Jupyter.
life_stage.map?

In [114]:
# Attach year / pub_age / life_stage to the faculty metadata by matching on
# (school_name, faculty_name); metadata rows without a match keep NaN there.
faculty_keys = ['school_name', 'faculty_name']
faculty_meta_info_df_with_life_stage = (
    faculty_meta_info_df
    .set_index(faculty_keys)
    .join(df.set_index(faculty_keys))
    .reset_index()
)
faculty_meta_info_df_with_life_stage

Unnamed: 0,school_name,faculty_name,affiliation,citedby5y,hindex,hindex5y,i10index,i10index5y,year,pub_age,life_stage
0,Caltech,Ralph Adolphs,California Institute of Technology,28621,120,83,269,229,1998.0,24.0,Q4
1,Caltech,Richard A. Andersen,"Professor of Neuroscience, California Institut...",9408,103,50,238,157,1986.0,36.0,Q4
2,Caltech,David J. Anderson,California Institute of Technology,17991,149,79,313,200,1990.0,32.0,Q4
3,Caltech,Alexei Aravin,"Professor, Division of Biology and Biological ...",10242,48,40,69,60,2004.0,18.0,Q3
4,Caltech,Frances H. Arnold,"Professor of chemical engineering, Caltech",24773,140,83,408,299,1993.0,29.0,Q4
...,...,...,...,...,...,...,...,...,...,...,...
440,Harvad,Marcia A Testa,"Senior Lecturer on Biostatistics, Harvard Univ...",4692,49,27,114,57,1987.0,35.0,Q4
441,Harvad,Lorenzo Trippa,Harvard and Dana Farber,3922,37,34,80,78,2012.0,10.0,Q1
442,Harvad,Tyler J. VanderWeele,John L. Loeb and Frances Lehman Loeb Professor...,32217,97,85,338,320,2009.0,13.0,Q2
443,Harvad,L. J. Wei,Harvard Medical School,4477,39,34,88,76,2007.0,15.0,Q2


In [116]:
# Discard every row that has a missing value in any column.
faculty_meta_info_df_with_life_stage = faculty_meta_info_df_with_life_stage.dropna()

In [117]:
# `i10index5y` per school, with one facet column per life-stage group.
px.box(
    faculty_meta_info_df_with_life_stage,
    x='school_name',
    y='i10index5y',
    facet_col='life_stage',
)

In [118]:
# Same plot restricted to the rows whose life stage is "Q1".
q1_faculty = faculty_meta_info_df_with_life_stage.query('life_stage == "Q1"')
px.box(
    q1_faculty,
    x='school_name',
    y='i10index5y',
    facet_col='life_stage',
)

In [112]:
# Show the rows for which no life stage could be assigned.
faculty_meta_info_df_with_life_stage.loc[
    faculty_meta_info_df_with_life_stage['life_stage'].isna()
]

Unnamed: 0,school_name,faculty_name,affiliation,citedby5y,hindex,hindex5y,i10index,i10index5y,year,pub_age,life_stage
423,Harvad,Erin K. Lake,"Instructor of Biostatistics, Harvard University",7,1,1,0,0,,,


In [111]:
# NOTE(review): this cell raised AttributeError when it was run (see the
# recorded output below). The cleaning cell keeps only school_name,
# faculty_name, index, publication_year and num_citations in
# faculty_publication_df, so there is no such attribute on that frame.
# Presumably leftover exploration - confirm and remove or replace.
faculty_publication_df.li

AttributeError: 'DataFrame' object has no attribute 'life_stage'