In [None]:
# This notebook tests computing and visualizing correlations between publication attributes.

In [1]:
# >>> Preparation
import pandas

In [2]:
# >>> Load the publication data
print("Importing publications... ")

# Only these columns are read from the CSV; any other column in the file is skipped.
cols = [
    'key',
    'pub_cat',
    'pub_type',
    'title',
    'year',
    'authors',
    'n_authors',
    'n_males',
    'n_females',
    'n_unknown',
    'ratio',
    'n_contribs_females',
    'n_contribs_males',
    'n_contribs_unknown',
]

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning that accompanies them) on large inputs.
publs = pandas.read_csv("data/publications_cor2.csv", usecols=cols, low_memory=False)

# Index the rows by publication key for label-based lookups; drop=False keeps
# "key" available as a regular column as well.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
publs = publs.set_index("key", drop=False)
print("Publications imported. They look like this: {}".format(publs.head()))

Importing publications... 


  interactivity=interactivity, compiler=compiler, result=result)


Publications imported. They look like this:                                                                  key  \
key                                                                    
books/acm/0082477                                  books/acm/0082477   
books/acm/Kim95                                      books/acm/Kim95   
books/acm/kim95/AnnevelinkACFHK95  books/acm/kim95/AnnevelinkACFHK95   
books/acm/kim95/Blakeley95                books/acm/kim95/Blakeley95   
books/acm/kim95/BreitbartGS95          books/acm/kim95/BreitbartGS95   

                                        pub_cat pub_type  \
key                                                        
books/acm/0082477                          book      NaN   
books/acm/Kim95                            book      NaN   
books/acm/kim95/AnnevelinkACFHK95  incollection      NaN   
books/acm/kim95/Blakeley95         incollection      NaN   
books/acm/kim95/BreitbartGS95      incollection      NaN   

                              

In [None]:
# >>> Create Correlation Matrix
# Rank correlation is only defined for numeric data, so restrict the frame to
# numeric/boolean columns before correlating. Older pandas silently dropped
# text columns such as `key` and `pub_cat`; pandas >= 2.0 raises a ValueError
# on them instead, so the selection is made explicit here.
publs_numeric = publs.select_dtypes(include=["number", "bool"])
publs_cor = publs_numeric.corr(method='spearman')

In [None]:
# >>> Visualization
# Draws the correlation matrix as an annotated heatmap: one coloured cell per
# column pair, with the coefficient (rounded to 2 decimals) printed inside it.
# visualize correlation matrix https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
# annotated https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

# 'autumn' colormap resampled to 10 discrete colour steps.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
autumn = plt.get_cmap('autumn', 10)

#https://stackoverflow.com/questions/332289/how-do-you-change-the-size-of-figures-drawn-with-matplotlib
fig, ax = plt.subplots(figsize=(20, 10))
im = ax.imshow(publs_cor, cmap=autumn)

# One tick per matrix column/row, labelled with the column name.
labels = publs_cor.columns.values
positions = np.arange(len(labels))

# x labels are tilted 45 degrees and anchored at their right end so long
# column names do not overlap (the rotation is set once, here).
ax.set_xticks(positions)
ax.set_xticklabels(labels, rotation=45, ha="right", rotation_mode="anchor")
ax.set_yticks(positions)
ax.set_yticklabels(labels)

# Annotate every cell; imshow puts row i on the y axis and column j on the x axis.
for i in range(len(labels)):
    for j in range(len(labels)):
        # round https://www.tutorialspoint.com/python/number_round.htm
        ax.text(j, i, round(publs_cor.iloc[i, j], 2), ha="center", va="center", color="k")

ax.set_title("Correlation Matrix")

plt.show()

In [None]:
# Visualize Scatter graphs
# Scatter graphs can help find evidence of correlation or non-correlation
# One point per publication: publication year against its number of authors.
# alpha < 1 lets overlapping points show up as darker regions.
plt.scatter(publs['year'], publs['n_authors'], alpha=0.4)

plt.xlabel("Year")
plt.ylabel("Amount of authors by publication.")

plt.title("How did the amount of authors contributing to a publication change over time?")
# No legend here: the plot has a single unlabelled series, so plt.legend()
# would only emit a "no artists with labels" warning.
plt.show()

In [None]:
# Authors per publication over time, one scatter layer per gender category.
# Each entry is (column, alpha, colour, legend label); layers are drawn in
# list order, so "unknown" ends up underneath "male" and "female".
author_layers = [
    ('n_unknown', 0.1, "g", "unknown"),
    ('n_males', 0.4, "r", "male"),
    ('n_females', 0.4, "b", "female"),
]

for column, opacity, colour, legend_label in author_layers:
    plt.scatter(publs['year'], publs[column], alpha=opacity, color=colour, label=legend_label)

plt.title("How did the amount of authors by gender and publication change over time?")
plt.xlabel("Year")
plt.ylabel("Amount of authors by publication")

plt.legend()
plt.show()

In [None]:
# Contributions per publication over time, one scatter layer per gender category.
# Each entry is (column, alpha, colour, legend label); layers are drawn in
# list order, so "unknown" ends up underneath "male" and "female".
contrib_layers = [
    ('n_contribs_unknown', 0.1, "g", "unknown"),
    ('n_contribs_males', 0.4, "r", "male"),
    ('n_contribs_females', 0.4, "b", "female"),
]

for column, opacity, colour, legend_label in contrib_layers:
    plt.scatter(publs['year'], publs[column], alpha=opacity, color=colour, label=legend_label)

plt.title("How did the amount of contributions made by gender and publication change over time?")
plt.xlabel("Year")
plt.ylabel("Amount of contributions by publication")

plt.legend()
plt.show()

In [None]:
# Histogram of the `ratio` column across all publications (default binning).
ratio_values = publs['ratio']
ratio_values.hist()