In [None]:
import os

# Folder that holds the data files read later on (e.g. viperdb.csv).
# The path below only exists on the original author's machine. When it is
# missing we fall back to the current working directory, so the notebook also
# runs when the data files simply sit next to it.
DIR = r"C:\Users\mirza\OneDrive\Desktop\Fakultet\Biological Data Analysis with Python\Code"
if not os.path.isdir(DIR):
    DIR = os.getcwd()

In [None]:
import numpy as np
import pandas as pd

from IPython.display import display

In [None]:
import matplotlib.pyplot as plt

# Basic example

In [None]:
# Sample data reused by the cells below: x runs from 0 up to (not including) 5
# in steps of 0.1, and y = sin(x)
x = np.arange(0, 5, 0.1)
y = np.sin(x)

# Creates a line chart
plt.plot(x, y)

In [None]:
# Necessary when working in non-interactive mode (e.g. in another IDE)
plt.show()

# Jupyter integration

In [None]:
# In old versions this was necessary (in case you come across it)
%matplotlib inline

In [None]:
plt.plot(x, y)

# Figures and axes

In [None]:
# Build two figures by hand: fig1 gets two subplots in one row plus a third axes
# placed by explicit coordinates; fig2 gets a single subplot.

# make just a blank figure
fig1 = plt.figure(figsize = (10, 5))

# add stuff to it (in subplots)
ax1 = fig1.add_subplot(1, 2, 1) # 1 row, 2 columns. Create axes #1 of 2 (1-based index)
ax2 = fig1.add_subplot(1, 2, 2) # Create axes #2 of 2
ax3 = fig1.add_axes([0.69, 0.66, 0.2, 0.2]) # Create axes by explicit coordinates

# the same axes can be drawn on more than once (ax1 receives two lines)
ax1.plot(x, y, color = 'red')
ax1.plot(x, 2 * y, color = 'blue')
ax2.plot(x * x, y, color = 'green')
ax3.plot(x * x, y * y, color = 'orange')

# a second, independent figure with default size
fig2 = plt.figure()
ax4 = fig2.add_subplot(1, 1, 1)
ax4.plot(x, x + y, color = 'gray')

In [None]:
# figures and axes are instances of dedicated matplotlib types
print(type(fig1))
# ax1 came from add_subplot, ax3 from add_axes - print both types to compare them
print(type(ax1), type(ax3))

In [None]:
# A whole grid of subplots can be requested in one call by giving the number
# of rows and columns
rows, cols = 2, 4

# plt.subplots hands back the figure together with a NumPy array of axes
fig, axes = plt.subplots(nrows = rows, ncols = cols, figsize = (15, 8))
print(type(axes), axes.dtype, axes.shape)
print(axes)

# Walk the grid in row-major order; (i, j) is the position of each axes
for (i, j), grid_ax in np.ndenumerate(axes):
    grid_ax.plot(x, np.sin((i + 1) * x + j))

In [None]:
# The simplest way to create an axes object (getting one subplot by default)
fig, ax = plt.subplots()
ax.plot(x, y)

In [None]:
# Several plots can be drawn on the same axes.
# gca ("Get Current Axes") returns the current axes, creating one if none exists
# (if there are multiple axes, it returns the last one by default).
fig = plt.figure()
fig.gca().plot(x, y, color = 'red')
fig.gca().plot(x, y + 1.5, color = 'orange')
plt.plot(x, y + 3, color = 'yellow')
# The shortcut plt.plot uses the gca() of the last used figure (creating one if there
# aren't any), so all three lines above end up on the same axes.

# Artist objects

Every time we make something (e.g. a plot) within the axes, we get an artist object.

In [None]:
# ax.plot returns the artists it created; keep them so we can inspect them
fig, ax = plt.subplots(figsize = (10, 2))
added_lines = ax.plot(x, y)

print(type(added_lines), added_lines) # it is just a list containing drawn objects
# the trailing comma unpacks a one-element list into a single name
added_line, = added_lines # we can extract the objects...
print(type(added_line))

In [None]:
import matplotlib

# they have specific defined types
print(isinstance(added_line, matplotlib.lines.Line2D))
print(isinstance(added_line, matplotlib.artist.Artist))

In [None]:
# The properties of artist objects can be changed after they were created,
# although that is not recommended
fig, ax = plt.subplots()
added_line, = ax.plot(x, y)
# recolor the line that was already added to the axes
added_line.set_color('red')

In most scenarios Artist objects are not that useful (but we'll see some examples where they are).

# Annotations

In [None]:
# Two curves on one axes, with title, axis labels and a legend
fig, ax = plt.subplots()

lines1 = ax.plot(x, y, color = 'blue')
lines2 = ax.plot(x, y + 1, color = 'green')
lines = lines1 + lines2 # both are lists, so + concatenates them into one list of artists

# write labels
ax.set_title('y = sin(x) + C')
ax.set_xlabel('x')
ax.set_ylabel('y [= sin(x) + C]')

# make a legend: one label per artist, in the same order as `lines`
ax.legend(lines, ['C = 0', 'C = 1'])

# A legend can also be made for the whole figure,
# but it is not the precise way to do it.
# TODO: decide whether to demonstrate the figure-level variant below
# fig.legend(['C = 0', 'C = 1'])


In [None]:
# Customizations on top of the previous example:
# - legend at a specific location
# - font size for title and axes labels

fig, ax = plt.subplots()

lines1 = ax.plot(x, y, color = 'blue')
lines2 = ax.plot(x, y + 1, color = 'green')
lines = lines1 + lines2

# the title is set larger (20) than the axis labels (15)
ax.set_title('y = sin(x) + C', fontsize = 20)
ax.set_xlabel('x', fontsize = 15)
ax.set_ylabel('y [= sin(x) + C]', fontsize = 15)

# loc pins the legend to a named corner of the axes
ax.legend(lines, ['C = 0', 'C = 1'], loc = 'lower left')

Read more about legends at: https://matplotlib.org/stable/users/explain/axes/legend_guide.html

In [None]:
# fig.suptitle puts one title above the whole figure, on top of the
# per-axes titles set with set_title

rows, cols = 2, 4

fig, axes = plt.subplots(nrows = rows, ncols = cols, figsize = (15, 8))

# (i, j) is the grid position; A = i + 1 and B = j are the curve parameters
for (i, j), grid_ax in np.ndenumerate(axes):
    grid_ax.plot(x, np.sin((i + 1) * x + j))
    grid_ax.set_title('A = %d, B = %d' % (i + 1, j), color = 'green')

fig.suptitle('y = sin(Ax + B)', fontsize = 22)

In [None]:
# Setting x and y limits explicitly (controls which part of the data is visible)

fig, ax = plt.subplots()
ax.plot(x, y)

# x limits are wider than the data (x only covers 0 to just under 5), so the curve
# gets empty margins on both sides
ax.set_xlim(-2, 8)
# y limits keep only the band between 0.5 and 1.5, cutting off the rest of the curve
ax.set_ylim(0.5, 1.5)

In [None]:
# Custom ticks

fig, ax = plt.subplots()
ax.plot(x, y)

# evenly spaced ticks every 0.4, from min(x) up to (not including) max(x) + 1
ax.set_xticks(np.arange(np.min(x), np.max(x) + 1, 0.4))
ax.set_yticks([-1, 0.9, 1.1]) # arbitrary, hand-picked positions

# change the properties of the tick parameters
ax.tick_params(axis = 'y', labelsize = 15) # 'x', 'y' or 'both'

In [None]:
# Grid

fig, ax = plt.subplots()
ax.plot(x, y)

ax.grid(True)

In [None]:
# Arrows and lines

fig, ax = plt.subplots()
ax.plot(x, y)

# Horizontal reference line at y = 0
ax.axhline(y = 0, color = 'gray')

# Annotated arrow: the arrow head sits at `xy`, the label text starts at `xytext`
peak_x = np.pi / 2
arrow_style = {'facecolor': 'black', 'width': 1, 'headwidth': 5}
ax.annotate(
    'Local Max',
    xy = (peak_x, 1.0),
    xytext = (peak_x, 0.5),
    fontsize = 11,
    arrowprops = arrow_style,
)

# More line options

In [None]:
plt.figure().gca().plot(x, y, linewidth = 5)

In [None]:
fig, axes = plt.subplots(nrows = 2, ncols = 3, figsize = (11, 6))

# linestyle accepts either a word (top row) or a symbol (bottom row)
linestyle_rows = [
    ['dashed', 'dotted', 'dashdot'],
    ['--', ':', '-.'],
]
for row_axes, row_styles in zip(axes, linestyle_rows):
    for style_ax, style in zip(row_axes, row_styles):
        style_ax.plot(x, y, linestyle = style)

# Plotting CDF
Spoiler alert: it is just a line...

In [None]:
N = 10000
# Dedicated, seeded generator: the sample (and therefore the plot and the numbers
# printed below) is identical on every run, and the global NumPy random state is
# left untouched.
rng = np.random.RandomState(0)
# generate a somewhat random (two-peaked) distribution: each point comes either from
# randn(N) or from 3 * randn(N) + 20, chosen by a random boolean mask
data = np.where(rng.randint(0, 2, N, dtype = bool), rng.randn(N), 3 * rng.randn(N) + 20)
print(data[:10])
print()

fig, ax = plt.subplots(figsize = (12, 5))
# data has to be sorted (on x axis)
# we create an evenly-spaced array of len N, between 0 and 1 (for y axis)
ax.plot(np.sort(data), np.linspace(0, 1, len(data)))

for threshold in [0, 5, 20]:
    # fraction (between 0 and 1) of the data at or below each threshold
    print('%.2f of the data is below %.1f' % ((data <= threshold).mean(), threshold))

# Scatter plots

In [None]:
viperdb = pd.read_csv(os.path.join(DIR, 'viperdb.csv'))
display(viperdb)

### Basic scatter plot

In [None]:
fig, ax = plt.subplots()
ax.scatter(viperdb['Inner Radius'], viperdb['Outer Radius'], color = 'black', facecolors = 'none')

### Beautified scatter plot

In [None]:
# Drop the outliers: keep only rows whose inner radius is below 500
filtered_viperdb = viperdb[viperdb['Inner Radius'] < 500]
# Use a larger figure
fig, ax = plt.subplots(figsize = (15, 6))
ax.scatter(
    filtered_viperdb['Inner Radius'],
    filtered_viperdb['Outer Radius'],
    color = 'red',
    s = 80,        # marker size
    alpha = 0.2,   # 80% transparent, so overlapping points reveal the density
)
ax.set_xlabel('Inner Radius (Å)', fontsize = 15)
ax.set_ylabel('Outer Radius (Å)', fontsize = 15)

### More advanced scatter plot
Adding multiple dimensions to the 2-D image...

(still better than the marketing gimmicks of 17D 4K Cinemas)

In [None]:
# Coloring viruses by groups, and making the sizes proportional to their number of subunits

import matplotlib.patches as mpatches # helps with the legend

# we again remove the outlier, and we remove viruses that have no "Genome" records
# we again remove the outlier, and we remove viruses that have no "Genome" records
filtered_viperdb = viperdb[(viperdb['Inner Radius'] < 500) & viperdb['Genome'].notnull()]

# process the genome data (taking only the first word) and extract the unique groups
groups = filtered_viperdb['Genome'].apply(lambda raw_genome: raw_genome.split()[0])
unique_groups = sorted(groups.unique())

# Give every group its own color by sampling the 'jet' colormap at evenly spaced
# positions in [0, 1]. (Indexing it with the integers 65 * i runs past the end of
# the colormap once there are more than four groups, which makes the later groups
# share one color.)
# Colors can also be selected manually instead of through a colormap.
color_positions = np.linspace(0, 1, len(unique_groups))
group_to_color = {group: plt.cm.jet(position) for group, position in zip(unique_groups, color_positions)}
colors = groups.map(group_to_color)

# start plotting: color encodes the group, marker size scales with the subunit count
fig, ax = plt.subplots(figsize = (15, 6))
ax.scatter(filtered_viperdb['Inner Radius'], filtered_viperdb['Outer Radius'],
        c = colors, s = 0.6 * filtered_viperdb['Subunits'], alpha = 0.35)

ax.set_xlabel('Inner Radius (Å)', fontsize = 15)
ax.set_ylabel('Outer Radius (Å)', fontsize = 15)

# make a legend: one colored patch per group
ax.legend(handles = [mpatches.Patch(color = color, label = group) for group, color in group_to_color.items()],
        loc = 'upper left')

In [None]:
# X values shown on a log10 scale (the data itself is log-transformed)
# Useful when we have data with vastly different scales
fig, ax = plt.subplots(figsize = (12, 4))
# log10 needs strictly positive input, so keep only rows with a positive area
filtered_viperdb = viperdb[viperdb['Outside SASA'] > 0]
log_surface_area = np.log10(filtered_viperdb['Outside SASA'])
outer_radius = filtered_viperdb['Outer Radius']
ax.scatter(log_surface_area, outer_radius, color = 'red', s = 80, alpha = 0.2)
ax.set_xlabel('Log10 Surface Area (Å ^ 2)', fontsize = 15)
ax.set_ylabel('Outer Radius (Å)', fontsize = 15)

# Bar charts

In [None]:
# Mean outer radius per family, computed in a single grouped pass instead of one
# full-column scan per family. sort = False keeps the families in order of first
# appearance (the order .unique() gives). Rows without a family are left out
# instead of producing an empty bar labelled 'nan'.
outer_radius_by_family = viperdb.groupby('Family', sort = False)['Outer Radius'].mean()
families = outer_radius_by_family.index.values
avg_outer_radius_per_family = outer_radius_by_family.tolist()

width = 0.85
x_positions = np.arange(len(families))  # one bar slot per family

fig, ax = plt.subplots(figsize = (15, 6))
ax.bar(x_positions, avg_outer_radius_per_family, width = width, color = 'red')

# label every bar with its family name; vertical text keeps long names readable
ax.set_xticks(x_positions)
_ = ax.set_xticklabels(families, rotation = 'vertical')

In [None]:
# Mean and standard deviation of both radii per family, computed in one grouped
# pass instead of four separate scans per family. sort = False keeps the families
# in order of first appearance (the order .unique() gives). Rows without a family
# are left out instead of producing empty bars labelled 'nan'.
radius_stats = viperdb.groupby('Family', sort = False)[['Inner Radius', 'Outer Radius']].agg(['mean', 'std'])
families = radius_stats.index.values
avg_inner_radius_per_family = radius_stats[('Inner Radius', 'mean')].tolist()
std_inner_radius_per_family = radius_stats[('Inner Radius', 'std')].tolist()
avg_outer_radius_per_family = radius_stats[('Outer Radius', 'mean')].tolist()
std_outer_radius_per_family = radius_stats[('Outer Radius', 'std')].tolist()

width = 0.4
x_positions = np.arange(len(families))

fig, ax = plt.subplots(figsize = (15, 6))
# two bars per family, side by side: the outer-radius bar is shifted right by one
# bar width; yerr draws the standard deviation as an error bar
inner_bars = ax.bar(x_positions, avg_inner_radius_per_family, yerr = std_inner_radius_per_family, width = width,
        color = 'teal')
outer_bars = ax.bar(x_positions + width, avg_outer_radius_per_family, yerr = std_outer_radius_per_family, width = width,
        color = 'orange')

# put each tick between the two bars of its family
ax.set_xticks(x_positions + width / 2)
ax.set_xticklabels(families, rotation = 45, horizontalalignment = 'right')

ax.set_xlabel('Family', fontsize = 15)
ax.set_ylabel('Average Radius (Å)', fontsize = 15)

# one legend entry per series, using the first bar of each as the handle
ax.legend([inner_bars[0], outer_bars[0]], ['Inner Radius', 'Outer Radius'])

ax.set_xlim((-width, len(families) - 1 + 2 * width))
ax.set_ylim((0, 1000))

# Histogram & pie chart

In [None]:
fig, ax = plt.subplots(figsize = (15, 6))
_ = ax.hist(viperdb['Net Surface Charge'], bins = 100)

In [None]:
# Number of records in each group, listed in the order of unique_groups
group_sizes = groups.value_counts()
records_per_group = [group_sizes[group] for group in unique_groups]
# One color per group, taken from the 'hot' colormap at indices 50, 100, 150, ...
colors = [plt.cm.hot(50 * rank) for rank in range(1, len(unique_groups) + 1)]

fig, ax = plt.subplots(figsize = (5, 5))
# autopct prints each slice's share as a percentage with one decimal
_ = ax.pie(records_per_group, labels = unique_groups, colors = colors, autopct = '%.1f%%')

# Box plots

In [None]:
# By default the whiskers reach 1.5 IQR past Q1 and Q3; here we use 2.5 instead.
# Values beyond the whiskers are treated as outliers and drawn explicitly.

fig, ax = plt.subplots(figsize = (15, 8))

# one box per column; the column names double as the tick labels
radius_columns = ['Inner Radius', 'Outer Radius']
ax.boxplot([viperdb[column] for column in radius_columns], whis = 2.5)
ax.set_xticklabels(radius_columns, fontsize = 15)

ax.set_ylabel('Sizes (Å)', fontsize = 15)
y_top = 600
ax.set_ylim((0, y_top))
ax.set_yticks(np.arange(0, y_top + 1, 50))  # a tick every 50, including the top value
ax.yaxis.grid(True)

In [None]:
# Matplotlib can also draw boxes from statistics we supply ourselves,
# without computing them from raw data

fig, ax = plt.subplots()
stats1 = dict(mean = 4.2, med = 5.9, q1 = 3.2, q3 = 7.7, whislo = -1, whishi = 9, fliers = [-3, -5.5])
stats2 = dict(mean = 5, med = 5.5, q1 = 3.5, q3 = 7.5, whislo = 2.0, whishi = 8.0, fliers = [])
precomputed_stats = [stats1, stats2]
_ = ax.bxp(precomputed_stats, showmeans = True)

# Heat maps

In [None]:
# blosum62 is used below as a dict mapping (aa1, aa2) letter pairs to scores.
# Bio.SubsMat was removed from recent Biopython releases, so when the old import
# fails we load the matrix through Bio.Align.substitution_matrices and rebuild
# the same pair -> score mapping from it.
try:
    from Bio.SubsMat.MatrixInfo import blosum62
except ImportError:
    from Bio.Align import substitution_matrices
    _blosum62_matrix = substitution_matrices.load('BLOSUM62')
    blosum62 = {(row_aa, col_aa): _blosum62_matrix[row_aa, col_aa]
                for row_aa in _blosum62_matrix.alphabet
                for col_aa in _blosum62_matrix.alphabet}

ALL_AA = 'ACDEFGHIKLMNPQRSTVWY'

def get_blosum_value(aa1, aa2):
    """Return the blosum62 entry for a pair of amino-acid letters.

    The pair is looked up as (aa1, aa2) first; if that key is missing, the
    reversed pair (aa2, aa1) is used instead. A KeyError is raised when
    neither order is present.
    """
    try:
        return blosum62[(aa1, aa2)]
    except KeyError:
        return blosum62[(aa2, aa1)]

# Square score matrix: one row per aa1 and one column per aa2, both in ALL_AA order
data = np.array([[get_blosum_value(aa1, aa2) for aa2 in ALL_AA] for aa1 in ALL_AA])

fig, ax = plt.subplots(figsize = (8, 6))
# vmin and vmax are symmetric around 0, so a score of 0 falls in the middle of the color range
heatmap = ax.pcolor(data, cmap = 'bwr', vmin = -10, vmax = 10)
fig.colorbar(heatmap)

# one tick per amino acid on both axes, shifted by 0.5 from the integer positions
# (presumably to centre each label on its cell - confirm against the rendered figure)
ax.set_xticks(np.arange(len(ALL_AA)) + 0.5)
ax.set_xticklabels(ALL_AA)
ax.set_yticks(np.arange(len(ALL_AA)) + 0.5)
ax.set_yticklabels(ALL_AA)

ax.set_title('BLOSUM 62', fontsize = 20)

See the full list of colormaps at: https://matplotlib.org/stable/gallery/color/colormap_reference.html

# Matplotlib's interface is sometimes quite bad

In [None]:
# ax.set_xticks vs. plt.xticks: two spellings for setting the same ticks.
# Better to use the axes methods consistently and avoid the plt shortcuts.

# Variant 1: explicit axes methods
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xticks([0, np.pi / 2, 5])

# Variant 2: plt shortcuts, called right after creating a new figure
fig, ax = plt.subplots()
plt.plot(x, y)
_ = plt.xticks([0, np.pi / 2, 5])

In [None]:
# pylab is a common namespace for both pyplot and numpy.
# Better to avoid it.

from matplotlib import pylab

# NOTE: this rebinds the notebook-wide x and y (same arange / sin construction as before)
x = pylab.arange(0, 5, 0.1)
y = pylab.sin(x)
pylab.plot(x, y)

# Pandas shortcuts

In [None]:
viperdb.plot(kind = 'scatter', x = 'Inner Radius', y = 'Outer Radius')
# Pandas also takes care of axes names

In [None]:
fig, ax = plt.subplots(figsize = (16, 8))

# Every per-point argument (x, y, s, c) has to describe the same rows. Build the
# filtered frame once and derive the sizes and colors from that very frame, rather
# than mixing the unfiltered table (sizes) with a differently filtered Series (colors).
# Rows without a "Genome" record are dropped because they have no group, and hence no color.
plotted_viperdb = viperdb[(viperdb['Inner Radius'] < 500) & viperdb['Genome'].notnull()]
# group = first word of the genome description (same rule used to build group_to_color)
plotted_groups = plotted_viperdb['Genome'].apply(lambda raw_genome: raw_genome.split()[0])

plotted_viperdb.plot(ax = ax, kind = 'scatter', x = 'Inner Radius', y = 'Outer Radius',
        title = 'Inner vs. Outer Radius', s = 0.6 * plotted_viperdb['Subunits'],
        c = plotted_groups.map(group_to_color), alpha = 0.3)

In [None]:
# Wrap the per-group counts in a Series so pandas can plot them directly;
# the index supplies the slice labels
group_count_series = pd.Series(records_per_group, index = unique_groups, name = '')
print(group_count_series)

# One color per group, taken from the 'hot' colormap at indices 50, 100, 150, ...
colors = [plt.cm.hot(50 * rank) for rank in range(1, len(unique_groups) + 1)]

fig, ax = plt.subplots(figsize = (5, 5))
pie_options = {'ax': ax, 'colors': colors, 'autopct': '%1.1f%%'}
group_count_series.plot(kind = 'pie', **pie_options)

Read more at: https://pandas.pydata.org/docs/user_guide/visualization.html

# Style

In [None]:
def draw_sample_plots():
    """Draw one figure with three sample panels: a line, a bar chart and a scatter.

    Reads the notebook-level `x` and `y` for the line panel and draws fresh
    random points for the scatter panel on every call. Returns nothing.
    """
    fig, (line_ax, bar_ax, scatter_ax) = plt.subplots(nrows = 1, ncols = 3, figsize = (11, 3))

    line_ax.plot(x, y)

    # three bars at positions 0..2, labelled A, B and C
    bar_positions = [0, 1, 2]
    bar_ax.bar(bar_positions, [5, 23, 12], width = 0.5)
    bar_ax.set_xticks(bar_positions)
    bar_ax.set_xticklabels(['A', 'B', 'C'])

    # ten random points in the unit square
    scatter_ax.scatter(np.random.rand(10), np.random.rand(10))
    
draw_sample_plots()

In [None]:
# There are ready-to-use styles you can use.
# ggplot is a popular style that emulates the aesthetics of ggplot (a popular plotting package for R)

plt.style.use('ggplot')
draw_sample_plots()

In [None]:
# Recovering the default style is not trivial for some reason...

def recover_default_style():
    """Reset matplotlib's rcParams to the defaults and re-run the inline magic.

    The `%matplotlib inline` line magic is issued through its function-call
    form, which is what IPython runs for the `%` syntax.
    """
    import matplotlib as mpl
    mpl.rcParams.update(mpl.rcParamsDefault)
    get_ipython().run_line_magic('matplotlib', 'inline')
    
recover_default_style()
draw_sample_plots()

In [None]:
# More nice styles...
recover_default_style()
plt.style.use('bmh')
draw_sample_plots()

In [None]:
recover_default_style()
plt.style.use('fivethirtyeight')
draw_sample_plots()

In [None]:
# List all available styles
print(plt.style.available)