# Jupyter Notebook Example

## Get data from the web

The following is a tiny program to download text from the web.


In [None]:
# A tiny program to download text from the web.
def read_url(url):
    """Fetch the document at ``url`` and return it as a single line of text.

    The response body is decoded with the default codec of ``bytes.decode()``
    (UTF-8), and every run of whitespace (spaces, tabs, newlines) is collapsed
    into one space character.

    Args:
        url: The address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The decoded text with each whitespace run replaced by a single space.
    """
    from urllib.request import urlopen
    import re

    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, instead of lingering until
    # garbage collection.
    with urlopen(url) as response:
        body = response.read()

    return re.sub(r"\s+", " ", body.decode())


In [None]:
# Download the full text of Huckleberry Finn (whitespace-normalized by
# read_url) and cut it into pieces at every literal "CHAPTER " marker.
huck_finn_url = "https://www.inferentialthinking.com/data/huck_finn.txt"
huck_finn_text = read_url(huck_finn_url)
# Keep only the pieces from index 44 onward.
# NOTE(review): presumably the first 44 pieces are front matter / a table of
# contents that also contains "CHAPTER " -- confirm against the source text.
huck_finn_chapters = huck_finn_text.split("CHAPTER ")[44:]


In [None]:
# Download the full text of Little Women (whitespace-normalized by read_url)
# and cut it into pieces at every literal "CHAPTER " marker.
little_women_url = "https://www.inferentialthinking.com/data/little_women.txt"
little_women_text = read_url(little_women_url)
# Drop the first piece, i.e. whatever precedes the first "CHAPTER " marker.
little_women_chapters = little_women_text.split("CHAPTER ")[1:]


In [None]:
# Display one entry of the Huckleberry Finn chapter list (index 4, i.e. the
# fifth piece kept after splitting).
huck_finn_chapters[4]

In [None]:
# Display one entry of the Little Women chapter list (index 1, i.e. the
# second piece kept after splitting).
little_women_chapters[1]

## Working with Tables

A lot of data science is about transforming data, often to produce tables that we can more easily analyze. In this class you will use the Berkeley `datascience` library to manipulate data.

In [None]:
from datascience import Table


In [None]:
# Build (and display) a one-column table labelled "Chapters" that holds the
# text of each Huckleberry Finn chapter.
Table().with_column("Chapters", huck_finn_chapters)

In [None]:
import numpy as np

In [None]:
# For each chapter, count the non-overlapping occurrences of the substring
# "Tom". This is a substring match, so longer words containing "Tom" count too.
np.char.count(huck_finn_chapters, "Tom")

In [None]:
# Same per-chapter substring count as for "Tom", this time for "Jim".
np.char.count(huck_finn_chapters, "Jim")

In [None]:
# Per-chapter mention counts for three names in Huckleberry Finn, one column
# per name. np.char.count matches substrings, so a name is also counted when
# it appears inside a longer word.
counts = Table().with_columns(
    "Tom", np.char.count(huck_finn_chapters, "Tom"),
    "Jim", np.char.count(huck_finn_chapters, "Jim"),
    "Huck", np.char.count(huck_finn_chapters, "Huck"),
)
# Last expression in the cell: display the table.
counts


## We will Learn to Visualize Data

Plot the cumulative counts: how many times each name appears in Chapter 1, how many times in Chapters 1 and 2 combined, and so on.


In [None]:
# Turn the N-th value of every column into the sum of that column's first N values.
def cumulative_sum(table: Table) -> Table:
    """Return a new Table with each column replaced by its cumulative sum.

    Column labels and column order are taken from ``table``; the input table
    itself is not modified.

    Args:
        table: The table whose columns should be accumulated.

    Returns:
        A new Table with the same labels, where row ``i`` of each column holds
        the sum of rows ``0..i`` of the corresponding input column.
    """
    result = Table()
    # Add the columns one at a time, in their original order, rather than
    # flattening (label, values) pairs into one long argument list.
    for label, values in zip(table.labels, table.columns):
        result = result.with_column(label, np.cumsum(np.array(values)))
    return result

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.style.use("fivethirtyeight")
# Number the chapters 1..N using the table's own row count instead of a
# hard-coded chapter total, so the cell keeps working if the split changes.
cum_counts = cumulative_sum(counts).with_column(
    "Chapter", np.arange(1, counts.num_rows + 1)
)
# Use the chapter number on the x-axis; every other column becomes a line.
cum_counts.plot(column_for_xticks="Chapter")
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.title("Cumulative Number of Times Name Appears");


In [None]:
# The chapters of Little Women: build (and display) a one-column table
# labelled "Chapters" that holds the text of each chapter.
Table().with_column("Chapters", little_women_chapters)


In [None]:
# Counts of names in the chapters of Little Women.
# `names` is the single source of truth for which columns are produced.
names = ["Amy", "Beth", "Jo", "Laurie", "Meg"]
# Map each name to an array of per-chapter substring counts.
mentions = {name: np.char.count(little_women_chapters, name) for name in names}
# Build the alternating [label, values, label, values, ...] list expected by
# with_columns directly from `names`, instead of spelling every name out again.
counts = Table().with_columns(
    [item for name in names for item in (name, mentions[name])]
)


In [None]:
# Plot the cumulative counts as a static (matplotlib) chart.
Table.static_plots()
# Number the chapters 1..N from the table's own row count rather than a
# hard-coded chapter total.
cum_counts = cumulative_sum(counts).with_column(
    "Chapter", np.arange(1, counts.num_rows + 1)
)
# Select the x-axis column by label instead of by position, so the cell does
# not silently break if the number of name columns changes.
cum_counts.plot(column_for_xticks="Chapter")
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.title("Cumulative Number of Times Name Appears");


In [None]:
# Plot the cumulative counts again, this time with interactive plots enabled.
Table.interactive_plots()
# Number the chapters 1..N from the table's own row count rather than a
# hard-coded chapter total.
cum_counts = cumulative_sum(counts).with_column(
    "Chapter", np.arange(1, counts.num_rows + 1)
)
# Select the x-axis column by label instead of by position.
cum_counts.plot(column_for_xticks="Chapter")


## Examining Length

How long are the books? How long are sentences?

In [None]:
# Total number of characters in the whitespace-normalized text of the book.
# Reuse the text that was already downloaded into `huck_finn_text` instead of
# fetching the same URL a second time.
len(huck_finn_text)

In [None]:
# Build one table per book with two measurements for every chapter:
#   "Length"  - the number of characters in the chapter text
#   "Periods" - the number of "." characters in the chapter text

length_hf = Table().with_columns(
    "Length", [len(chapter) for chapter in huck_finn_chapters],
    "Periods", np.char.count(huck_finn_chapters, "."),
)
length_lw = Table().with_columns(
    "Length", [len(chapter) for chapter in little_women_chapters],
    "Periods", np.char.count(little_women_chapters, "."),
)


In [None]:
# The counts for Huckleberry Finn: display the per-chapter "Length" and
# "Periods" table.
length_hf


In [None]:
# The counts for Little Women: display the per-chapter "Length" and
# "Periods" table.
length_lw


In [None]:
# Scatter plot of chapter length against number of periods, one dot per
# chapter: Huckleberry Finn in dark blue, Little Women in gold.
Table.static_plots()
plt.figure(figsize=(10, 10))
# Select columns by label rather than by position so the axes stay correct
# even if the column order of the tables changes.
plt.scatter(length_hf["Periods"], length_hf["Length"], color="darkblue")
plt.scatter(length_lw["Periods"], length_lw["Length"], color="gold")
plt.xlabel("Number of periods in chapter")
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.ylabel("Number of characters in chapter");


## Examining distributions

In [None]:
# Histogram of a rough per-chapter "sentence length" for Huckleberry Finn:
# characters in the chapter divided by the number of periods in it.
Table.static_plots()
length_hf.with_column(
    "Sentence Length",
    length_hf.column("Length") / length_hf.column("Periods"),
).hist("Sentence Length")


In [None]:
# Histogram of a rough per-chapter "sentence length" for Little Women:
# characters in the chapter divided by the number of periods in it.
Table.static_plots()
length_lw.with_column(
    "Sentence Length",
    length_lw.column("Length") / length_lw.column("Periods"),
).hist("Sentence Length")
