Created by Donald E. Brown on 2015-11-12.
Copyright (c) 2015 Donald E. Brown. All rights reserved.

## Graphics in Python

In [6]:
# Data
import pandas as pd
import numpy as np
import scipy as sp
import os
import string as st
from collections import Counter

# Statistics
import statsmodels.api as sm
import statsmodels.sandbox.tools.tools_pca as sm_pca
from statsmodels.formula.api import ols as sm_ols
from statsmodels.stats.anova import anova_lm as sm_anova
from patsy.contrasts import Treatment

# Plotting
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import savefig

# Apply seaborn's "darkgrid" theme to every figure drawn below; color_codes=True
# remaps matplotlib's one-letter color shorthands ('r', 'g', 'b', ...) to the seaborn palette.
sns.set(style="darkgrid", color_codes=True)

# Printing
import locale

# An empty locale string selects the user's default locale from the environment.
locale.setlocale( locale.LC_ALL, '' )

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Check for errors

ImportError: C extension: 'hashtable' not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext --inplace' to build the C extensions first.

In [3]:
# Load the cleaned accident data (path is relative to the working directory).
totactsClean = pd.read_csv("traindataClean.csv")
# Show (rows, columns) as a quick check that the load worked.
totactsClean.shape

NameError: name 'pd' is not defined

In [None]:
# Line plot of the ACCDMG2015 column against the DataFrame index.
plt.plot(totactsClean.ACCDMG2015)

In [None]:
# Scatter of a noisy linear trend: the integers 0..29 plus 3x standard-normal noise.
trend = np.arange(30)
noise = 3 * np.random.randn(30)
plt.scatter(trend, trend + noise)

In [None]:
# pandas plotting: ACCDMG as a red dashed line with a circle marker at every point.
totactsClean[["ACCDMG"]].plot(color='r', linestyle='dashed', marker='o')

In [None]:
# Selecting two columns draws one line per column on the same axes.
totactsClean[["ACCDMG", "TRKDMG"]].plot()

In [None]:
# The '.' format string draws point markers only, with no connecting line.
plt.plot(totactsClean["ACCDMG2015"], '.')

In [None]:
# Accident damage against train speed, drawn as point markers on a single axes.
fig, spd_dmg = plt.subplots()
spd_dmg.plot(totactsClean["TRNSPD"], totactsClean["ACCDMG2015"], '.')
spd_dmg.set_title("Accident Damage vs. Speed")
spd_dmg.set_ylabel("Cost ($10M)")
spd_dmg.set_xlabel("Speed (MPH)")

In [None]:
# Running (cumulative) totals of the three damage columns on one axes.
fig, dmg = plt.subplots()
# (column, matplotlib format string, legend label) for each line, in draw order.
damage_series = [
    ("ACCDMG2015", 'g', 'Total Damage'),
    ("TRKDMG", 'r', 'Track Damage'),
    ("EQPDMG", 'b', 'Train Damage'),
]
for column, fmt, series_label in damage_series:
    dmg.plot(totactsClean[column].cumsum(), fmt, label=series_label)
dmg.legend(loc='best')
dmg.set_title("Accident Damage")
dmg.set_ylabel("Damage ($)")
dmg.set_xlabel("Accident Number")

In [None]:
# Three panels of a 2x2 grid (the fourth slot is left empty).
fig = plt.figure()
ax1, ax2, ax3 = (fig.add_subplot(2, 2, position) for position in (1, 2, 3))
# ax3 was created last, so it is the "current" axes that a bare plt.plot() would
# draw on; targeting it explicitly makes the destination obvious.
ax3.plot(totactsClean.TRNSPD, 'r.')
ax1.hist(totactsClean.TEMP, bins=20, color='k', alpha=0.3)
# The +1 keeps zero-damage rows finite under the log.
ax2.scatter(totactsClean.TRNSPD, np.log(totactsClean.ACCDMG + 1))

## Question

Write the Python code to make the following six plots: 1. ACCDMG vs. TRNSPD, 2. TONS vs. TRNSPD, 3. Log(ACCDMG) vs. TRNSPD, 4. histogram of TRNSPD with alpha = 1, 5. histogram of TRNSPD with alpha = .5, and 6. histogram of TRNSPD with alpha = .3. Put the six plots into a single display. Make your graphic look nice with titles, spacing, and colors.

In [None]:
# Group the accidents by the YEAR4 column; the grouped object is reused by the yearly plots below
totacts_yearly = totactsClean.groupby('YEAR4')

In [None]:
# Time series plot of yearly damage totals, with legend.
# Aggregate only the three plotted columns, and only once: calling
# totacts_yearly.sum() just to get the year index re-summed every column of the
# frame on each call and required every column to be summable.
yearly_damage = totacts_yearly[["ACCDMG2015", "TRKDMG", "EQPDMG"]].sum()

fig = plt.figure(figsize=(8, 8))
dmg = fig.add_subplot(1, 1, 1)
# The index of the aggregated frame holds the group keys (one entry per YEAR4 value).
dmg.plot(yearly_damage.index, yearly_damage["ACCDMG2015"], label = "Total Damage")
dmg.plot(yearly_damage.index, yearly_damage["TRKDMG"],  label = "Track Damage")
dmg.plot(yearly_damage.index, yearly_damage["EQPDMG"],  label = "Train Damage")
dmg.legend(loc = 'best')
dmg.set_title("Accident Damage")
dmg.set_ylabel("Damage ($)")
dmg.set_xlabel("Year")

## PANDAS plotting

In [None]:
# Histogram of TEMP using pandas' default number of bins.
totactsClean.TEMP.hist()

In [None]:
# Same histogram with 100 bins for a finer-grained view.
totactsClean.TEMP.hist(bins = 100)

In [None]:
# Kernel density estimate of TEMP (a smoothed alternative to the histogram).
totactsClean.TEMP.plot(kind = 'kde')

In [None]:
# Using seaborn: histogram only (kde=False turns off the density curve).
# NOTE(review): distplot is deprecated in newer seaborn releases in favor of
# histplot/displot — confirm against the seaborn version this notebook targets.
temp_hist = sns.distplot(totactsClean[['TEMP']], kde=False)
# Bins algorithm defaults to Freedman-Diaconis
temp_hist.set_title('Accident Temperatures (F)')
temp_hist.set_ylabel('Count')
temp_hist.set_xlabel('Temperature in degrees Fahrenheit')

In [None]:
# Plot just the first rows of ACCDMG2015 (head() returns the first five by default).
damage = totactsClean["ACCDMG2015"].head()
damage.plot()

In [None]:
# Data Frame plot: one line per damage column, restricted to rows with
# YEAR == 13 and MONTH == 1.
selected_rows = (totactsClean.YEAR == 13) & (totactsClean.MONTH == 1)
damages = totactsClean.loc[selected_rows, ["ACCDMG2015", "TRKDMG", "EQPDMG"]]
damages.plot()

In [None]:
# Scatter Plot Matrix: pairwise scatter plots of the damage columns, with a
# kernel density estimate of each column on the diagonal.
# NOTE(review): newer pandas releases expose this as pd.plotting.scatter_matrix —
# confirm against the pandas version this notebook targets.
pd.scatter_matrix(damages, diagonal = 'kde')

In [None]:
# Bar plot: number of accidents per year, counted as the non-null ACCDMG
# entries within each YEAR4 group.
accidents_per_year = totactsClean.groupby('YEAR4')['ACCDMG'].count()
yearly_plot = accidents_per_year.plot(kind="bar")
yearly_plot.set_title('Number of Train Accidents Per Year')
yearly_plot.set_ylabel('Number of Accidents')
yearly_plot.set_xlabel('Year')

In [None]:
# Bar plot: total ACCDMG2015 for each value of TYPE.
actstype = totactsClean.groupby('TYPE')['ACCDMG2015']
cost_by_type = actstype.sum()
type_plot = cost_by_type.plot(kind="bar")
type_plot.set_title('Cost of Accidents by Type')
type_plot.set_ylabel('Cost of Accidents')
type_plot.set_xlabel('Type of Accident')

In [None]:
# Bar plot: number of rows for each TYPE value.
# sort=False skips value_counts' default ordering by descending frequency.
type_counts = totactsClean.TYPE.value_counts(sort = False)
type_counts.plot(kind = 'bar')

## Prettyplotlib
Install prettyplotlib first: in a terminal, run `pip install prettyplotlib`.

In [None]:
# Eight seeded random walks (cumulative sums of standard-normal steps) in plain matplotlib.

np.random.seed(12)
x = np.arange(1000)  # shared x-axis: step number
for i in range(8):
    y = np.cumsum(np.random.randn(1000))
    plt.plot(x, y, label=str(i))
plt.legend()

In [None]:
# The same eight seeded random walks, drawn through prettyplotlib for comparison.
import prettyplotlib as ppl

np.random.seed(12)  # same seed as the matplotlib version, so the walks match
x = np.arange(1000)
for i in range(8):
    y = np.cumsum(np.random.randn(1000))
    ppl.plot(x, y, label=str(i))
ppl.legend()

In [None]:
# Colormesh of a seeded 16x16 grid of uniform random values, with a colorbar.

np.random.seed(12)
mesh_values = np.random.rand(16, 16)
plt.pcolormesh(mesh_values)
plt.colorbar()

In [None]:
# The same seeded 16x16 grid drawn through prettyplotlib's pcolormesh.

np.random.seed(12)
mesh_values = np.random.rand(16, 16)
ppl.pcolormesh(mesh_values)

## Question

Use Python to produce a bar chart of the counts of each type of train accident for the years 2011, 2012, and 2013. The bars for each year should be next to each other.

## Seaborn


In [None]:
# Putting total and maximum accident costs per year together using symbols:
# a line for each year's total cost, with a circle on each point whose area
# scales with that year's single most expensive accident.

plt.figure(figsize=(8, 8))

# Aggregate the one column that is plotted, once. ACCDMG2015 is used so the data
# agrees with the "2015 USD" axis label below (assumes ACCDMG2015 holds damage in
# 2015 dollars — TODO confirm against the data dictionary).
yearly_cost = totacts_yearly.ACCDMG2015
cost_total = yearly_cost.sum()
cost_max = yearly_cost.max()

# Plot time series of sum of accident costs per year in background
yearly_plot = sns.tsplot(cost_total, cost_total.index)
yearly_plot.set_title('Cost of Accidents Per Year')
yearly_plot.set_ylabel('Cost (2015 USD)')
yearly_plot.set_xlabel('Year')

# Overlay top accident cost per year as circles on points; dividing by 1e4
# brings dollar amounts down to a usable marker size.
plt.scatter(cost_total.index.tolist(),
            cost_total.tolist(),
            color='r',
            s=(cost_max / 1e4).tolist())
savefig('costperyear.pdf')

In [None]:
# Violin Plot: two independent standard-normal samples and their element-wise product.

x1, x2 = np.random.randn(80), np.random.randn(80)
x3 = x1 * x2
sns.violinplot(data=[x1, x2, x3])

In [None]:
# Correlation plot of the three samples from the violin-plot cell.
mydf = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3))
# sns.corrplot is deprecated and absent from newer seaborn releases; an annotated
# heatmap of the correlation matrix is the replacement. vmin/vmax pin the color
# scale to the full correlation range [-1, 1].
sns.heatmap(mydf.corr(), vmin=-1, vmax=1, annot=True)

In [None]:
# Heatmap of a 4x4 frame of uniform random values (unseeded, so it differs per run).
sns.heatmap(pd.DataFrame(np.random.rand(4,4)))

In [None]:
# Regression Plot

# ACCDMG2015 vs TONS: scatter with a fitted regression line plus marginal
# distributions. The column now matches the stated intent (the call previously
# used ACCDMG), and x/y are passed by keyword because newer seaborn releases
# reject positional x/y; the keyword form works on old and new versions.
sns.jointplot(x="TONS", y="ACCDMG2015", data=totactsClean, kind="reg")

## Bokeh
Interactive Web visualization

In [None]:
import bokeh.plotting as bkh
# Direct Bokeh output to the notebook so bkh.show() renders plots inline.
bkh.output_notebook()

In [None]:
# Generate data: 100 evenly spaced x values on [0, 1] and a random walk for y.
# (np.linspace — the original "np.linespace" does not exist and raised AttributeError.)
x = np.linspace(0., 1., 100)
y = np.cumsum(np.random.randn(100))

In [None]:
# Plot an interactive graphic: build a titled figure with axis labels, add the
# x/y data as a single thick line, then display it.
p = bkh.figure(title="simple line example", x_axis_label='x', y_axis_label='y')
p.line(x,y, line_width = 5)
bkh.show(p)

## MPLD3
Converting matplotlib figures to interactive D3.js visualizations


In [None]:
from mpld3 import enable_notebook
# Switch on mpld3's notebook integration so that matplotlib figures drawn after
# this point are displayed through mpld3.
enable_notebook()

In [None]:
# Random bubble scatter: 100 standard-normal points (only the first two of the
# three columns are plotted) with random colors and marker areas.
X = np.random.normal(loc=0, scale=1, size=(100, 3))
color = np.random.random(100)
size = 500 * np.random.random(100)
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=color,
    s=size,
    alpha=.5,
    linewidths=2,
)
plt.grid(color='lightgray', alpha=0.7)