### Matplotlib

[Matplotlib](https://matplotlib.org/) is a plotting/visualization library for Python which plays nicely with NumPy and other commonly used mathematics libraries. 

There are other visualization libraries for Python such as seaborn (built atop Matplotlib), plotly, bokeh and dash. All of these have their own strengths. Users are encouraged to pick the best one for the task. We are going to concentrate on Matplotlib since it is a great starting point.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

The most common plotting function is `plot`, which creates a line graph

In [None]:
# A single sequence is treated as the vertical-axis
y_coords = [0, 3, 1, 5, 2]
plt.plot(y_coords)
plt.show() # not needed for notebooks but still good practise

In [None]:
# We can also add the horizontal axis by hand
x_coords = [0, 1, 2, 3, 4]
plt.plot(x_coords, y_coords)
plt.show()

In [None]:
# Coordinates do not have to be integers
x_coords2 = np.linspace(0,1,len(y_coords))
plt.plot(x_coords2, y_coords)
plt.show()

A good figure should include axis labels and a title, especially when it is given without a caption

In [None]:
plt.plot(x_coords, y_coords)
plt.xlabel('X-Axis')
plt.ylabel('Y-Axis')
plt.title('Plot Title')
plt.show()

We can set the "ticks" of both axes: we can choose where to place them and what to call them, which implicitly decides how many there are

In [None]:
#Sparser X, Arbitrary Y
plt.plot(x_coords, y_coords)

# Making X sparser
plt.xticks(range(0,len(y_coords)), range(0,len(y_coords)))

# Arbitrary Y values
plt.yticks([-1,-0.5,0,1,3,3.5,4,4.7,5],[-1,-0.5,0,1,3,3.5,4,4.7,5])
#plt.yticks([-1,-0.5,0,1,3,3.5,4,4.7,5],range(8,-1,-1))
plt.show()

In [None]:
# Customizing the tick marks. Might as well put more meaningful labels

# Build the line graph.
plt.plot(x_coords, y_coords)

# Add a title.
plt.title('Sales by Year')

# Add labels to the axes.
plt.xlabel('Year')
plt.ylabel('Sales')

# Set the ticks
plt.xticks([0, 1, 2, 3, 4],
           ['2016', '2017', '2018', '2019', '2020'])
plt.yticks([0, 1, 2, 3, 4, 5],
           ['$0m', '$1m', '$2m', '$3m', '$4m', '$5m'])

plt.show()

We can play around with line and point (also called markers within matplotlib) properties:
* Color
* Line type (e.g. straight, dashed)
* Marker type (e.g. None, point, triangle)
* Line thickness
* Markers size

Note: There is a shortcut for specifying the marker type, line type and color together in a single format string.

We will not explain all the possible parameters. They will become apparent as we use them

In [None]:
# Shortcut
plt.plot(x_coords, y_coords, 'ro--')
plt.show()

In [None]:
#Equivalent to above
plt.plot(x_coords, y_coords, color = 'r', marker='o', linestyle = '--')
plt.show()

In [None]:
#Larger marker and thicker line (exaggerated, this will not look good :) 
plt.plot(x_coords, y_coords, 'ro--', markersize=12, linewidth = 4)
plt.show()

We can set the overall figure size, often a good idea for notebooks. Let's also add a grid.

In [None]:
# Setting the size
plt.figure(figsize=(12,8))

# Build the line graph.
plt.plot(x_coords, y_coords,'ro--', ms=12,linewidth = 3)

# Add a title.
plt.title('Sales by Year')

# Add labels to the axes.
plt.xlabel('Year')
plt.ylabel('Sales')

# Set the ticks
plt.xticks([0, 1, 2, 3, 4],
           ['2016', '2017', '2018', '2019', '2020'])
plt.yticks([0, 1, 2, 3, 4, 5],
           ['$0m', '$1m', '$2m', '$3m', '$4m', '$5m'])

# Add a grid.
plt.grid(True)

plt.show()

Now the fonts are small!

In [None]:
# Setting the size
plt.figure(figsize=(12,8))

# Build the line graph.
plt.plot(x_coords, y_coords,'ro--', ms=12,linewidth = 3)

# Add a title.
plt.title('Sales by Year', fontsize=18)

# Add labels to the axes.
plt.xlabel('Year', fontsize=16)
plt.ylabel('Sales', fontsize=16)

# Set the ticks
plt.xticks([0, 1, 2, 3, 4],
           ['2016', '2017', '2018', '2019', '2020'],
           fontsize=14)
plt.yticks([0, 1, 2, 3, 4, 5],
           ['$0m', '$1m', '$2m', '$3m', '$4m', '$5m'],
           fontsize=14)

# Add a grid.
plt.grid(True)

plt.show()

This is the sales by year graph. Let's also see the costs graph.

In [None]:
costs = [0.5, 0.8, 1.2, 0.7, 1.6]
plt.plot(x_coords, costs)

We can draw multiple lines on the same graph! We need to specify a legend in this case

In [None]:
# Add the legend
plt.plot(x_coords, y_coords,'r')
plt.plot(x_coords, costs,'b')

# Specify them with the order they are added
plt.legend(['Sales', 'Costs'])

# Try the below ones
# plt.legend(['Sales', 'Costs','Something Else'])

# plt.legend(['Sales'])
# plt.legend(['Costs'])


plt.show()

Let's put everything in one place

In [None]:
# Setting the size
plt.figure(figsize=(10,7))

# Build the line graph.
plt.plot(x_coords, y_coords,'ro--', ms=12,linewidth = 3)
plt.plot(x_coords, costs,'bd--', ms=12,linewidth = 3)

# Add a title.
plt.title('Yearly Amounts', fontsize=18)

# Add labels to the axes.
plt.xlabel('Year', fontsize=16)
plt.ylabel('Amount', fontsize=16)

# Set the ticks
plt.xticks([0, 1, 2, 3, 4],
           ['2016', '2017', '2018', '2019', '2020'],
           fontsize=14)
plt.yticks([0, 1, 2, 3, 4, 5],
           ['$0m', '$1m', '$2m', '$3m', '$4m', '$5m'],
           fontsize=14)

# Add a legend
plt.legend(['Sales', 'Costs'], fontsize= 14)

# Add a grid.
plt.grid(True)

plt.show()

We can also plot just points/markers or have different marker and line colors

In [None]:
# No lines
plt.plot(x_coords, y_coords,'ro', ms=12,linewidth = 3)
#plt.scatter(x_coords,y_coords,marker='o',color='r')

plt.show()

In [None]:
#Different color markers and lines for the same sequence
plt.plot(x_coords, y_coords,'o', ms=12)
plt.plot(x_coords, y_coords,'--', linewidth=3)

We can set axis limits and make the axis scales be the same as well

In [None]:
#Default

plt.figure(figsize=(8,6))
x = np.linspace(0,10,10)
y = 0.45*x**2-2*x+2
z = 2*x 
plt.plot(x,y,'.--')
plt.plot(x,z,'rx-')

plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Polynomials')

# Sprinkle in some Latex!
plt.legend(('$2^{nd}$ Order','$1^{st}$ Order'))

plt.show()

In [None]:
# Axis scales equal

plt.figure(figsize=(8,6))

# Sample a quadratic (y) and a line (z) at 10 evenly spaced points in [0, 10]
x = np.linspace(0,10,10)
y = 0.45*x**2-2*x+2
z = 2*x 
plt.plot(x,y,'.--')
plt.plot(x,z,'rx-')

plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Polynomials')

# Sprinkle in some Latex!
# The $...$ delimiters are required: without them the superscript markup
# is displayed literally instead of being rendered as math.
plt.legend(('$2^{nd}$ Order','$1^{st}$ Order'))

# Make them Equal!
plt.axis('equal')

plt.show()

In [None]:
# Tighter by setting the limits

plt.figure(figsize=(8,6))

# Sample a quadratic (y) and a line (z) at 10 evenly spaced points in [0, 10]
x = np.linspace(0,10,10)
y = 0.45*x**2-2*x+2
z = 2*x 
plt.plot(x,y,'.--')
plt.plot(x,z,'rx-')

plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Polynomials')

# Sprinkle in some Latex!
# The $...$ delimiters are required: without them the superscript markup
# is displayed literally instead of being rendered as math.
plt.legend(('$2^{nd}$ Order','$1^{st}$ Order'))

# Zoom in: restrict both axes to a small window of the data
plt.xlim((8,9))
plt.ylim((14,20))

plt.show()

**Bar Charts**

Bar charts are also commonly used in visualization. They can be plotted with the `bar` function. We need to specify the location and height of the bars.

In [None]:
centers = [0, 10, 20, 30, 40]
heights = [100, 200, 300, 400, 500]
plt.bar(centers, heights)
plt.show()

In [None]:
centers2 = range(len(heights))
plt.bar(centers2, heights)
plt.show()

In [None]:
centers3 = [0,1,3,6,10]
plt.bar(centers3, heights)
plt.show()

We can additionally set the widths of the bars

In [None]:
bar_width = 5
plt.bar(centers, heights, bar_width)
plt.show()

Let's display the counts of words!

In [None]:
text = """Pfizer's COVID's vaccine over "90% effective"

By: Michael Erman and Julie Steenhuysen | 9 November 2020

Reuters – Pfizer Inc said on Monday its experimental COVID-19 vaccine was more than 90% effective, a major victory in the fight against a pandemic that has killed more than a million people, battered the world's economy and upended daily life.

Pfizer and German partner BioNTech SE are the first drugmakers to release successful data from a large-scale clinical trial of a coronavirus vaccine. The companies said they have so far found no serious safety concerns and expect to seek U.S. authorization this month for emergency use of the vaccine.

Health experts said Pfizer's results were positive for all COVID-19 vaccines currently in development since they show the shots are going after the right target and are a proof of concept that the disease can be halted with vaccination.

"Today is a great day for science and humanity," Albert Bourla, Pfizer's chairman and chief executive, said.

"We are reaching this critical milestone in our vaccine development program at a time when the world needs it most with infection rates setting new records, hospitals nearing over-capacity and economies struggling to reopen."

If Pfizer's vaccine is authorized, the number of doses will initially be limited and many questions remain, including how long the vaccine will provide protection.

BioNTech Chief Executive Ugur Sahin told Reuters he was optimistic the immunisation effect of the vaccine would last for a year although that was not certain yet.

"This news made me smile from ear to ear. It is a relief to see such positive results on this vaccine and bodes well for COVID-19 vaccines in general," said Peter Horby, professor of emerging infectious diseases at the University of Oxford.

 

MARKETS SURGE

The prospect of a vaccine electrified world markets with S&P 500 futures hitting a record high and tourism and travel shares surging. Pfizer shares were indicated 14.2% higher in pre-market trading in New York, while BioNTech's stock was up nearly 23% in Frankfurt.

"Light at the end of the tunnel. Let's just hope the vaccine deniers won't get in the way, but 2021 just got a lot brighter," said Neil Wilson, chief market analyst at Markets.com

Shares of other vaccine developers in the final stage of testing also rose with Johnson & Johnson up 4% in pre-market trading and Moderna 7.4% stronger. Britain's AstraZeneca was down 0.5%.

"The efficacy data are really impressive. This is better than most of us anticipated," said William Schaffner, infectious diseases expert at Vanderbilt University School of Medicine, Nashville, Tennessee. "The study isn't completed yet, but nonetheless the data look very solid."

U.S. President Donald Trump welcomed the test results, and the market boost: "STOCK MARKET UP BIG, VACCINE COMING SOON. REPORT 90% EFFECTIVE. SUCH GREAT NEWS!" he said on Twitter.

President-elect Joe Biden said the news was excellent but did not change the fact that face masks, social distancing and other health measures would be needed well into next year.

1.3 BILLION DOSES

Pfizer expects to seek broad U.S. authorization for emergency use of the vaccine for people aged 16 to 85. To do so, it will need two months of safety data from about half the study's 44,000 participants, which is expected late this month.

"I'm near ecstatic," Bill Gruber, one of Pfizer's top vaccine scientists, said in an interview. "This is a great day for public health and for the potential to get us all out of the circumstances we're now in."

Pfizer and BioNTech have a $1.95 billion contract with the U.S. government to deliver 100 million vaccine doses beginning this year. They have also reached supply agreements with the European Union, the United Kingdom, Canada and Japan.

To save time, the companies began manufacturing the vaccine before they knew whether it would be effective. They now expect to produce up to 50 million doses, or enough to protect 25 million people this year.

Pfizer said it expects to produce up to 1.3 billion doses of the vaccine in 2021.

The U.S. pharmaceutical giant said the interim analysis was conducted after 94 participants in the trial developed COVID-19, examining how many of them had received the vaccine versus a placebo.

The company did not break down exactly how many of those who fell ill received the vaccine. Still, over 90% effectiveness implies that no more than 8 of the 94 people who caught COVID-19 had been given the vaccine, which was administered in two shots about three weeks apart.

The efficacy rate is well above the 50% effectiveness required by the U.S. Food and Drug Administration for a coronavirus vaccine.

MORE DATA NEEDED

To confirm the efficacy rate, Pfizer said it would continue the trial until there are 164 COVID-19 cases among participants. Bourla told CNBC on Monday that based on rising infection rates, the trial could be completed before the end of November.

The data have yet to be peer-reviewed or published in a medical journal. Pfizer said it would do so once it has results from the entire trial.

"These are interesting first signals, but again they are only communicated in press releases," said Marylyn Addo, head of tropical medicine at the University Medical Center Hamburg-Eppendorf in Germany.

"Primary data are not yet available and a peer-reviewed publication is still pending. We still have to wait for the exact data before we can make a final assessment."

The global race for a vaccine has seen wealthier countries forge multibillion-dollar supply deals with drugmakers like Pfizer, AstraZeneca Plc and Johnson & Johnson, raising questions over when middle income and poorer nations will get access to inoculations.

The U.S. quest for a vaccine has been the Trump administration's central response to the pandemic. The United States has the world's highest known number of COVID-19 cases and deaths with more than 10 million infections and over 237,000 fatalities.

Trump repeatedly assured the public that his administration would likely identify a successful vaccine in time for the presidential election, held last Tuesday. On Saturday, Democratic rival Biden was declared the winner.

ESSENTIAL TOOLS

Vaccines are seen as essential tools to help end the health crisis that has shuttered businesses and left millions out of work. Millions of children whose schools were closed in March remain in remote learning programs.

Dozens of drugmakers and research groups around the globe have been racing to develop vaccines against COVID-19, which on Sunday exceeded 50 million infections since the new coronavirus first emerged late last year in China.

The Pfizer and BioNTech vaccine uses messenger RNA (mRNA) technology, which relies on synthetic genes that can be generated and manufactured in weeks, and produced at scale more rapidly than conventional vaccines.

Moderna Inc, whose vaccine candidate employs similar technology, is expected to report results from its large-scale trial later this month.

The mRNA technology is designed to trigger an immune response without using pathogens, such as actual virus particles.

Pfizer alone will not have the capacity to immediately provide enough vaccines for the United States. The Trump administration has said it will have enough supply for all of the 330 million U.S. residents who wish to be vaccinated by the middle of 2021.

The U.S. government has said the vaccines will be provided free to Americans, including the insured, uninsured and those in government health programs such as Medicare."""
print(text)

In [None]:
import string
preProcessed = text.translate(str.maketrans('','', string.punctuation + string.digits + '–')).lower()
print(preProcessed)

In [None]:
words = preProcessed.split()
print(len(words))

In [None]:
from collections import Counter

wordCounts = Counter(words)
print(wordCounts.most_common(10))

In [None]:
mc_words = []
mc_counts = []
n = 25
for w, c in wordCounts.most_common(n):
    mc_words.append(w)
    mc_counts.append(c)
    
xlocations = range(len(mc_counts))
plt.figure(figsize=(12,8))
plt.bar(xlocations,mc_counts)
plt.xticks(xlocations,mc_words,rotation=45)
plt.show()

In [None]:
# Keep a handle to every Axes instead of calling plt.subplot(2,2,1) a second
# time: asking pyplot for an already existing subplot again is the call this
# cell was flagged for (MatplotlibDeprecationWarning).
ax_top_left = plt.subplot(2,2,1)
ax_top_left.plot(range(4))

ax_top_right = plt.subplot(2,2,2)
ax_top_right.plot(range(10,-1,-1))

ax_bottom_left = plt.subplot(2,2,3)
ax_bottom_left.plot(np.random.random(5))
ax_bottom_left.set_title('T')
ax_bottom_left.get_xlim()

# Return to the first subplot through its saved handle and make it the
# current Axes again, as the repeated plt.subplot call used to do.
plt.sca(ax_top_left)
ax_top_left.get_xlim()

Misc. Bar Plot Stuff:

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [None]:
bar_width = 5
plt.figure(figsize = (12,8))
plt.bar(centers, heights,bar_width, color=('r', 'g', 'b', 'c', 'k'))
plt.show()

In [None]:
bar_width = 5
plt.figure(figsize = (12,8))
plt.bar(centers, heights,bar_width, color=('r', 'g', 'b', 'c', 'k'))
plt.xticks(centers,['red','green','blue','cyan','black'], fontsize=14)
plt.show()

In [None]:
bar_width = 5
plt.figure(figsize = (12,8))
plt.bar(centers, heights,bar_width, color=('r', 'g', 'b', 'c', 'k'))
plt.xticks(centers,['red','green','blue','cyan','black'], fontsize=14)
plt.yticks(plt.gca().get_yticks(),plt.gca().get_yticks(), fontsize=14)
plt.show()

Different bar plots side by side

In [None]:
heights2 = [150,50,400,70,600]
plt.bar(centers, heights,bar_width)
plt.bar(centers, heights2,bar_width)


In [None]:
#Change the widths
width=2.5
plt.bar(np.array(centers)-width/2, heights, width)
plt.bar(np.array(centers)+width/2, heights2, width)

In [None]:
# Alternative, but you need to play around with the ticks
width=2.5
plt.bar(np.array(centers)+width*0, heights, width)
plt.bar(np.array(centers)+width*1, heights2, width)

**Pie Chart**

In [None]:
values = [20, 60, 80, 40]
plt.pie(values)
plt.show()

In [None]:
sales = [100, 400, 300, 600]
slice_labels = ['1st Qtr', '2nd Qtr', '3rd Qtr', '4th Qtr']
plt.pie(sales, labels=slice_labels)
plt.title('Sales by Quarter')

In [None]:
values = [20, 60, 80, 40]
plt.figure(figsize = (12,8))
plt.pie(values, labels=['1st Qtr', '2nd Qtr', '3rd Qtr', '4th Qtr'], colors=('r', 'g', 'b', 'w', 'k'))
plt.title('Sales')

In [None]:
def tmp(pct):
    """Return pct formatted with two decimals followed by a percent sign."""
    return '{:.2f}%'.format(pct)

In [None]:
values = [20, 60, 80, 40]
plt.figure(figsize = (12,8))
plt.pie(values, labels=['1st Qtr', '2nd Qtr', '3rd Qtr', '4th Qtr'], autopct='%.1f%%')
plt.title('Sales')

In [None]:
# Changing the fontsize

values = [20, 60, 80, 40]
plt.figure(figsize = (12,8))
plt.pie(values, labels=['1st Qtr', '2nd Qtr', '3rd Qtr', '4th Qtr'], autopct='%.1f%%',textprops={'fontsize': 14})
plt.title('Sales')

**Histograms**

Displaying counts within bins to visualize distributions

In [None]:
xn = np.random.normal(0,0.5,200)
xu = np.random.uniform(-5,-2.,100)*0.1
x = np.concatenate((xn,xu))

In [None]:
plt.figure(figsize=(12,8))
h,bins,_=plt.hist(x)

In [None]:
len(x)

In [None]:
print(len(h),len(bins))

In [None]:
h

In [None]:
# Redraw the histogram with plt.bar: every bar starts at the left edge of its
# bin and spans the full bin width, so it lines up with the plt.hist output.
plt.bar(bins[:-1], h, bins[1:]-bins[:-1], align='edge')

In [None]:
h,bins,_=plt.hist(x,bins=5)

In [None]:
print(len(h),len(bins))

In [None]:
h,bins,_=plt.hist(x,bins=20)

In [None]:
# How to set the bin widths?

# First Try: choose the number of bins that gives roughly the desired width
bw=0.2 # desired bin width
rng = x.max()-x.min()
num_bins = int(np.ceil(rng/bw)) # use bw here so the width is defined in one place
h,bins,_=plt.hist(x,bins=num_bins)

In [None]:
bins[1]-bins[0] # not exactly 0.2, but close

In [None]:
# Second try
given_bins = x.min()+np.arange(num_bins+1)*bw
h,bins,_=plt.hist(x,bins=given_bins)

In [None]:
given_bins

In [None]:
given_bins[1]-given_bins[0]

We can plot a parametric distribution on top of the histogram using its probability density function.

In [None]:
from scipy import stats

mu = x.mean()
std = x.std()

t = np.linspace(min(mu-3*std,x.min()),max(mu+3*std,x.max()),100)
y = stats.norm(mu,std).pdf(t)

In [None]:
# IMPORTANT: 
# Need to set density = True so that histogram does not show counts,
# but instead show bin_count/(total_count*bin_width)
plt.hist(x, density=True) 
plt.plot(t,y,'r')

The data is not exactly Gaussian, so the fitted curve does not match the histogram well. Compare with the purely Gaussian sample below.

In [None]:
xn = np.random.normal(0,0.5,500)
tn = np.linspace(-1.5,1.5,100)
yn = stats.norm(xn.mean(),xn.std()).pdf(tn)
plt.hist(xn, density=True) 
plt.plot(tn,yn,'r')

**Box Plots**

Box plots are also used to display distributions

In [None]:
plt.boxplot(x)
plt.show()

In [None]:
y = np.random.normal(-0.2,0.2,100)
plt.boxplot([x,y])
plt.show()

**Small Example:** Throwing Multiple Dice

In [None]:
import random
def throw_single_die(num_die_sides):
    """Return a random integer between 1 and num_die_sides, both included."""
    return random.randint(1, num_die_sides)

def throw_and_sum(die_per_throw = 2, total_num_throws=10000, num_die_sides=6):
    """Throw several dice repeatedly and return the list of per-throw sums.

    die_per_throw:    number of dice thrown together in one throw
    total_num_throws: number of throws, i.e. the length of the returned list
    num_die_sides:    number of sides of every die
    """
    # One entry per throw; each entry adds up die_per_throw single-die results
    return [
        sum(throw_single_die(num_die_sides) for _ in range(die_per_throw))
        for _ in range(total_num_throws)
    ]

In [None]:
lst = throw_and_sum(4)
plt.hist(lst,10,density=True)

In [None]:
#21 possible sums:
plt.figure(figsize=(10,7))
plt.hist(lst,21)

In [None]:
# FIXED THE TICKS!

# Explanation:
## min = 4, max = 24 ==> 21 bins
## Need to specify all the left edges and the last right edge (21 left edges + 1 right edge)

plt.figure(figsize=(10,7))
bw=1
xthrows = np.array(lst)
rng = xthrows.max()-xthrows.min()
num_bins = int(np.ceil(rng/bw))+1 #Need to add 1 due to dealing with integers since rng = 20
given_bins = xthrows.min()+np.arange(num_bins+1)*bw #+1 right edge
h,bins,_=plt.hist(xthrows,bins=given_bins)
plt.xticks(given_bins[:-1]+0.5,given_bins[:-1]) #Home exercise: Why did I do this?
plt.xlim((xthrows.min()-bw,xthrows.max()+2*bw))
plt.ylabel("Number of Occurrences")
plt.xlabel("Sum of Dice") # the horizontal axis holds the total of each throw
plt.show()

In [None]:
h

In [None]:
given_bins

In [None]:
bins

Adding text: `plt.text(x,y,text,...)` where
* x and y are coordinates of the lower left edge of the text in the data coordinates (can be changed)
* text is a string
* ... are formatting options like color, fontsize, etc.

In [None]:
plt.figure(figsize=(10,7))
h,bins,_=plt.hist(xthrows)
plt.ylabel("Number of Occurrences")
plt.xlabel("Sum of Dice") # the horizontal axis holds the total of each throw

# Current axis limits, in data coordinates
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()

# Text anchored at the data point (0, 0)
plt.text(0,0,'realorigin',color='r')

# Text anchored at the lower-left corner and at the middle of the axis limits
plt.text(xmin,ymin,'origin')
plt.text((xmax+xmin)/2,(ymax+ymin)/2,'center',color = 'w', fontsize=16)


In [None]:
xmin

In [None]:
plt.figure(figsize=(12,8))


h,bins,_=plt.hist(xthrows)
plt.ylabel("Number of Occurrences", fontsize=16)
plt.xlabel("Sum of Dice", fontsize=16) # the horizontal axis holds the total of each throw

xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()

# Write each bin count slightly above its bar, starting at the bin's left edge
# (bins has one more entry than h; zip stops after the last count)
for left_edge, count in zip(bins, h):
    plt.text(left_edge,count+10,f'{int(count)}',color = 'r', fontsize=14)
    
# New command:
plt.gca().tick_params(labelsize=14)

**Saving Figures**

How to put our figures into presentations and reports: `plt.savefig`

In [None]:
plt.figure(figsize=(10,7))
bw=1
xthrows = np.array(lst)
rng = xthrows.max()-xthrows.min()
# One bin per integer sum: rng/bw counts the gaps between min and max, so add 1
num_bins = int(np.ceil(rng/bw))+1
# All left edges plus the final right edge (num_bins+1 values); without the
# extra edge the largest sum would fall outside the histogram
given_bins = xthrows.min()+np.arange(num_bins+1)*bw

h,bins,_=plt.hist(xthrows,bins=given_bins)

# One tick per bin (the last entry of given_bins is only a right edge)
plt.xticks(given_bins[:-1]+0.5,given_bins[:-1]) #Home exercise: Why did I do this?

plt.ylabel("Number of Occurrences")
plt.xlabel("Sum of Dice") # the horizontal axis holds the total of each throw


plt.savefig(fname='dieTotals.png', dpi=300)

**Matplotlib Example**

Use the below cell to read the data in memory. Then answer the questions below.

In [None]:
# Read student_grades.csv into two dictionaries:
#   sdic     : column name -> NumPy array with that column's values
#   lgcounts : entry of the first column -> number of rows containing it
with open('student_grades.csv','r') as file:
    # The header line provides the column names
    column_names = file.readline().strip().split(',')
    sdic = {cname: [] for cname in column_names}
    lgcounts = {}
    for row in file:
        row = row.strip()
        if not row:
            # Skip blank lines instead of failing on the missing columns
            continue
        row_data = row.split(',')
        for i, cname in enumerate(column_names):
            entry = row_data[i]
            if i == 0:
                # The first column is kept as text and tallied
                sdic[cname].append(entry)
                lgcounts[entry] = lgcounts.get(entry,0) + 1
            else:
                # Every other column is parsed as a number
                sdic[cname].append(float(entry))
    # Convert the collected lists to arrays
    for key, value in sdic.items():
        sdic[key] = np.array(value)

In [None]:
sdic

Exercises:
* Plot the histogram of the `'Total'` 
* Write a function that will plot the histogram with (`simple_histogram`):
    * Larger figure size (decide on the size yourself)
    * Labels and title (feel free to decide on this yourself). 
    * Xticks denoting the range of each bin
    * Appropriate fontsizes
    * The function signature is given below
* Improve the previous function with the below additions in (`better_histogram`)
    * Labels and titles as input as well with default values
    * Bins not so close to each other
    * If a filename is given, write the plot to a figure
* Plot the histogram of letter grades with the correct order of letters
* Letter grade distribution in a pie chart
* 3 separate histograms of the `'Total'`, `'Exams'` and the `'HWs'` side by side for students who received `'B+'`, `'B'` and `'B-'`.

**Hint:** Do not be shy to revert to the `plt.bar` function

In [None]:
# Histogram of total
h,bins,_ = plt.hist(sdic['Total'])

In [None]:
# Simple Histogram
def simple_histogram(data, bins=10):
    """Draw and show a histogram of data with one range label per bin.

    data: the values to histogram
    bins: forwarded to plt.hist as its second argument
    """
    # Font sizes for the title, the axis labels and the tick labels
    title_size = 16
    label_size = 16
    tick_size = 12

    plt.figure(figsize=(14,7))

    # Only the bin edges are needed afterwards
    _, edges, _ = plt.hist(data, bins)

    plt.xlabel('Grade Ranges',fontsize=label_size)
    plt.ylabel('Number of Students',fontsize=label_size)
    plt.title('Student Grades',fontsize=title_size)

    # Place one tick at the middle of every bin, labelled "left-right"
    left_edges = edges[:-1]
    right_edges = edges[1:]
    centers = (right_edges+left_edges)/2
    range_labels = [f'{lo:.1f}-{hi:.1f}' for lo, hi in zip(left_edges, right_edges)]
    plt.xticks(centers,range_labels)

    plt.tick_params(labelsize=tick_size)
    plt.show()
    

In [None]:
simple_histogram(sdic['Total'])

In [None]:
# The stop value 101 makes 100 the last bin edge, so totals above 90 are included
simple_histogram(sdic['Total'],bins=range(10,101,10))

In [None]:
# Better Histogram, no signature given!

# Labels and titles as input as well with default values
# Bins not so close to each other (hint:rwidth)
# If a filename is given, write the plot to a figure

# Home exercise!

# Simple Histogram
def better_histogram(data, 
                     bins=10, 
                     xlabel = 'Grade Ranges', 
                     ylabel = 'Number of Students',
                     title = 'Student Grades',
                     filename = None):
    """Draw and show a histogram with spaced bars and one range label per bin.

    data:     the values to histogram
    bins:     forwarded to plt.hist as its second argument
    xlabel:   text of the horizontal axis label
    ylabel:   text of the vertical axis label
    title:    text of the plot title
    filename: when truthy, the figure is also saved under this name
    """
    # Font sizes for the title, the axis labels and the tick labels
    title_size = 18
    label_size = 16
    tick_size = 14

    plt.figure(figsize=(14,7))

    # rwidth < 1 leaves a gap between neighbouring bars
    _, edges, _ = plt.hist(data, bins, rwidth = 0.9)

    plt.xlabel(xlabel,fontsize=label_size)
    plt.ylabel(ylabel,fontsize=label_size)
    plt.title(title,fontsize=title_size)

    # Place one tick at the middle of every bin, labelled "left-right"
    left_edges = edges[:-1]
    right_edges = edges[1:]
    centers = (right_edges+left_edges)/2
    range_labels = [f'{lo:.1f}-{hi:.1f}' for lo, hi in zip(left_edges, right_edges)]
    plt.xticks(centers,range_labels)

    plt.tick_params(labelsize=tick_size)

    # Save before showing the figure
    if filename:
        plt.savefig(filename, dpi=300)

    plt.show()
    

In [None]:
better_histogram(sdic['Total'], bins=range(10,101,10))

In [None]:
# Letter grade histogram in correct order
correct_order = ['F', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A']

plt.hist(sdic['Letter'],rwidth=0.9)
plt.show()


In [None]:
# Replace every letter grade by its position in correct_order, stored as floats
# This is the trick! Why does this work?
tmp = np.array([correct_order.index(letter) for letter in sdic['Letter']], dtype=float)

In [None]:
print(tmp[:5])
print(sdic['Letter'][:5])

In [None]:
plt.figure(figsize=(12,7))

# One unit-wide bin per letter grade, centred on its integer code 0, 1, 2, ...
# Fixing the edges keeps bars and labels aligned even when the lowest or the
# highest grade does not occur in the data.
grade_codes = np.arange(len(correct_order))
grade_edges = np.arange(len(correct_order)+1)-0.5
h,bins,_=plt.hist(tmp,bins=grade_edges,rwidth=0.9)
plt.xticks(grade_codes,correct_order)

plt.title('Letter Grade Distribution', fontsize=18)
plt.xlabel('Letter Grades', fontsize=16)
plt.ylabel('Number of Students', fontsize=16)

plt.tick_params(labelsize=14)

plt.show()

In [None]:
# We also have
lgcounts

In [None]:
# Letter grade distribution in a pie chart
plt.figure(figsize=(12,7))
plt.pie(lgcounts.values(),labels=lgcounts.keys(), autopct='%.1f%%', explode = np.ones(len(lgcounts))*0.02, 
        colors= ('b','g','r','c','m','y'))
plt.show()

In [None]:
# In the correct order?
# Look up the count of every grade in correct_order; a grade that never
# occurs in the data counts as 0 instead of raising a KeyError.
argsortedCounts = [lgcounts.get(g, 0) for g in correct_order]

# Letter grade distribution in a pie chart
plt.figure(figsize=(12,7))
# explode needs one offset per wedge, so size it by the list that is plotted
plt.pie(argsortedCounts, labels=correct_order, autopct='%.1f%%', explode = np.ones(len(argsortedCounts))*0.02)
plt.show()

More advanced color options: https://matplotlib.org/stable/tutorials/colors/colors.html 

In [None]:
# Exploding only As, Cs and Fs with a different color cycle
explode = np.zeros(len(argsortedCounts))
explode[correct_order.index('A')] = 0.1
explode[correct_order.index('C')] = 0.1
explode[correct_order.index('F')] = 0.1
plt.figure(figsize=(12,7))
plt.pie(argsortedCounts, labels=correct_order, autopct='%.1f%%', explode = explode,
       colors = ('tab:blue','tab:orange','tab:green','tab:cyan','tab:purple','tab:olive'))
plt.show()

In [None]:
# 3 separate histograms of the `'Total'`, `'Exams'` and the `'HWs'` side by side for all students

plt.figure(figsize=(16,5))

plt.subplot(1,3,1)
plt.hist(sdic['Total'])
plt.title('Total')

plt.subplot(1,3,2)
plt.hist(sdic['Exams'])
plt.title('Exams')

plt.subplot(1,3,3)
plt.hist(sdic['HWs'])
plt.title('HWs')

In [None]:
sdic['Total'] > 85

In [None]:
plt.hist(sdic['Total'][sdic['Total'] > 85])

In [None]:
# 3 separate histograms of the `'Total'`, `'Exams'` and the `'HWs'` side by side for 
# students who received `'B+'`, `'B'` and `'B-'`.

grades = sdic['Letter']

inds = (grades=='B') | (grades=='B-') | (grades=='B+')

plt.figure(figsize=(16,5))

plt.subplot(1,3,1)
plt.hist(sdic['Total'][inds])
plt.title('Total')

plt.subplot(1,3,2)
plt.hist(sdic['Exams'][inds])
plt.title('Exams')

plt.subplot(1,3,3)
plt.hist(sdic['HWs'][inds])
plt.title('HWs')