In [5]:
# Initialize Python instance by importing packages
# This line prepares Jupyter notebook for working with matplotlib and in-notebook plotting
%matplotlib inline

import os # used below to extend PATH so Jupyter can find Git Bash
# Add Git's bin directory to PATH as its own entry.
#  - a raw string avoids the invalid "\P" and "\G" escape sequences
#  - os.pathsep keeps the directory from being fused onto the last PATH entry
#  - the membership check makes re-running this cell harmless
git_bin_dir = r"C:\Program Files\Git\bin"
if git_bin_dir not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + git_bin_dir
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm # allows access to color maps
import matplotlib.pyplot as plt # sets up plotting
import pandas as pd # allows for data handling
# Display settings, written through the attribute-style options API
pd.options.display.width = 500                # display width, in characters
pd.options.display.max_columns = 100          # show up to 100 columns before truncating
pd.options.display.notebook_repr_html = True  # use the HTML repr for pandas objects in the notebook
import seaborn as sns # sets up styles & additional plotting options


Read in some data from a .csv file.  CSV files can be output by any spreadsheet
software and are plain text, so they make a great way to share data.  This dataset
is from Goodreads: the most highly regarded books on the site were scraped. The next lab will show how to do the scraping.

In [7]:
# all.csv is read with header=None, so the ten column names are supplied here
df = pd.read_csv(
    "all.csv",
    header=None,
    names=[
        "rating",
        'review_count',
        'isbn',
        'booktype',
        'author_url',
        'year',
        'gene_urls',
        'dir',
        'rating_count',
        'name',
    ],
)
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,gene_urls,dir,rating_count,name
0,4.4,136455,439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008.0,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003.0,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005.0,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
3,4.23,47906,61120081,good_reads:book,https://www.goodreads.com/author/show/1825.Har...,1960.0,/genres/classics|/genres/fiction|/genres/histo...,dir01/2657.To_Kill_a_Mockingbird.html,2078123,To Kill a Mockingbird
4,4.23,34772,679783261,good_reads:book,https://www.goodreads.com/author/show/1265.Jan...,1813.0,/genres/classics|/genres/fiction|/genres/roman...,dir01/1885.Pride_and_Prejudice.html,1388992,Pride and Prejudice


In [8]:
# List the dtype pandas assigned to each column
df.dtypes

rating          float64
review_count     object
isbn             object
booktype         object
author_url       object
year            float64
gene_urls        object
dir              object
rating_count     object
name             object
dtype: object

In [9]:
# (number of rows, number of columns)
df.shape

(6000, 10)

In [10]:
# Column labels, i.e. the names passed to read_csv
df.columns

Index(['rating', 'review_count', 'isbn', 'booktype', 'author_url', 'year', 'gene_urls', 'dir', 'rating_count', 'name'], dtype='object')

In [11]:
# A single column is a Series; the whole table is a DataFrame
type(df["rating"]), type(df)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [12]:
# Element-wise comparison: a boolean Series, True where the rating is below 3
df.rating < 3

0       False
1       False
2       False
3       False
4       False
        ...  
5995    False
5996    False
5997    False
5998    False
5999    False
Name: rating, Length: 6000, dtype: bool

In [13]:
# Summing the boolean mask counts the True entries (books rated below 3)
np.sum(df.rating < 3)

4

Find the fraction of books with a rating below 3.

In [21]:
# Count of books rated below 3, divided by the total number of rows
np.sum(df.rating < 3) / len(df)

0.0006666666666666666

We can also compute this as a mean, since each `True` maps to 1 and each `False` to 0.

In [22]:
# Booleans average as 0/1, so the mean of the mask is the fraction of True values
np.mean(df.rating < 3)

0.0006666666666666666

Or directly through pandas, which works since `df.rating < 3` is a pandas Series.

In [23]:
# Same fraction, computed with the Series' own .mean() method
(df.rating < 3).mean()

0.0006666666666666666

# Filtering

Here are two ways to get a filtered dataframe

In [24]:
# Filter rows with a string expression: keep books rated above 4.5
df.query("rating > 4.5")

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,gene_urls,dir,rating_count,name
17,4.58,1314,0345538374,good_reads:book,https://www.goodreads.com/author/show/656983.J...,1973.0,/genres/fantasy|/genres/classics|/genres/scien...,dir01/30.J_R_R_Tolkien_4_Book_Boxed_Set.html,68495,J.R.R. Tolkien 4-Book Boxed Set
162,4.55,15777,075640407X,good_reads:book,https://www.goodreads.com/author/show/108424.P...,2007.0,/genres/fantasy|/genres/fiction,dir02/186074.The_Name_of_the_Wind.html,210018,The Name of the Wind (The Kingkiller Chronicle...
222,4.53,15256,055357342X,good_reads:book,https://www.goodreads.com/author/show/346732.G...,2000.0,/genres/fantasy|/genres/fiction|/genres/fantas...,dir03/62291.A_Storm_of_Swords.html,327992,"A Storm of Swords (A Song of Ice and Fire, #3)"
242,4.53,5404,0545265355,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2010.0,/genres/young-adult|/genres/fiction|/genres/fa...,dir03/7938275-the-hunger-games-trilogy-boxset....,102330,The Hunger Games Trilogy Boxset (The Hunger Ga...
249,4.80,644,0740748475,good_reads:book,https://www.goodreads.com/author/show/13778.Bi...,2005.0,/genres/sequential-art|/genres/comics|/genres/...,dir03/24812.The_Complete_Calvin_and_Hobbes.html,22674,The Complete Calvin and Hobbes
...,...,...,...,...,...,...,...,...,...,...
5806,4.57,121,0679777458,good_reads:book,https://www.goodreads.com/author/show/8361.Dor...,1966.0,/genres/historical-fiction|/genres/fiction|/ge...,dir59/351211.The_Disorderly_Knights.html,2177,"The Disorderly Knights (The Lymond Chronicles,..."
5873,4.55,103,144247372X,good_reads:book,https://www.goodreads.com/author/show/2876763....,2012.0,/genres/fantasy|/genres/paranormal|/genres/ang...,dir59/14367071-the-complete-hush-hush-saga.html,2869,"The Complete Hush, Hush Saga"
5874,4.78,18,2851944371,good_reads:book,https://www.goodreads.com/author/show/318835.O...,1972.0,/genres/poetry|/genres/fiction|/genres/nobel-p...,dir59/2014000.Le_Monogramme.html,565,Le Monogramme
5880,4.61,123,,good_reads:book,https://www.goodreads.com/author/show/4942228....,2010.0,/genres/romance|/genres/m-m-romance|/genres/sc...,dir59/10506860-the-interludes.html,1031,"The Interludes (In the company of shadows, #3)"


Here we can create a mask and use it to "index" into the dataframe to get the rows we want

In [25]:
# Boolean mask (True where year is negative) selects the matching rows
df.loc[df.year < 0]

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,gene_urls,dir,rating_count,name
47,3.68,5785,0143039954,good_reads:book,https://www.goodreads.com/author/show/903.Homer,-800.0,/genres/classics|/genres/fiction|/genres/poetr...,dir01/1381.The_Odyssey.html,560248,The Odyssey
246,4.01,365,0147712556,good_reads:book,https://www.goodreads.com/author/show/903.Homer,-800.0,/genres/classics|/genres/fantasy|/genres/mytho...,dir03/1375.The_Iliad_The_Odyssey.html,35123,The Iliad/The Odyssey
455,3.85,1499,0140449140,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-380.0,/genres/philosophy|/genres/classics|/genres/no...,dir05/30289.The_Republic.html,82022,The Republic
596,3.77,1240,0679729526,good_reads:book,https://www.goodreads.com/author/show/919.Virgil,-29.0,/genres/classics|/genres/poetry|/genres/fictio...,dir06/12914.The_Aeneid.html,60308,The Aeneid
629,3.64,1231,1580495931,good_reads:book,https://www.goodreads.com/author/show/1002.Sop...,-429.0,/genres/classics|/genres/plays|/genres/drama|/...,dir07/1554.Oedipus_Rex.html,93192,Oedipus Rex
674,3.92,3559,1590302257,good_reads:book,https://www.goodreads.com/author/show/1771.Sun...,-512.0,/genres/non-fiction|/genres/politics|/genres/c...,dir07/10534.The_Art_of_War.html,114619,The Art of War
746,4.06,1087,0140449183,good_reads:book,https://www.goodreads.com/author/show/5158478....,-500.0,/genres/classics|/genres/spirituality|/genres/...,dir08/99944.The_Bhagavad_Gita.html,31634,The Bhagavad Gita
777,3.52,1038,1580493882,good_reads:book,https://www.goodreads.com/author/show/1002.Sop...,-442.0,/genres/drama|/genres/fiction|/genres/classics...,dir08/7728.Antigone.html,49084,Antigone
1233,3.94,704,015602764X,good_reads:book,https://www.goodreads.com/author/show/1002.Sop...,-400.0,/genres/classics|/genres/plays|/genres/drama|/...,dir13/1540.The_Oedipus_Cycle.html,36008,The Oedipus Cycle
1397,4.03,890,0192840509,good_reads:book,https://www.goodreads.com/author/show/12452.Aesop,-560.0,/genres/classics|/genres/childrens|/genres/lit...,dir14/21348.Aesop_s_Fables.html,71259,Aesop's Fables


You can combine the previous two conditions using the second form, putting '()' brackets around each condition.  The query uses a boolean AND (`&`).  Each condition creates a mask of trues and falses.

In [26]:
# Each comparison is parenthesised because & binds more tightly than < and >;
# the two masks are then AND-ed element-wise.
df[(df.year < 0) & (df.rating > 4)] # there were none > 4.5

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,gene_urls,dir,rating_count,name
246,4.01,365,147712556,good_reads:book,https://www.goodreads.com/author/show/903.Homer,-800.0,/genres/classics|/genres/fantasy|/genres/mytho...,dir03/1375.The_Iliad_The_Odyssey.html,35123,The Iliad/The Odyssey
746,4.06,1087,140449183,good_reads:book,https://www.goodreads.com/author/show/5158478....,-500.0,/genres/classics|/genres/spirituality|/genres/...,dir08/99944.The_Bhagavad_Gita.html,31634,The Bhagavad Gita
1397,4.03,890,192840509,good_reads:book,https://www.goodreads.com/author/show/12452.Aesop,-560.0,/genres/classics|/genres/childrens|/genres/lit...,dir14/21348.Aesop_s_Fables.html,71259,Aesop's Fables
1882,4.02,377,872205541,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-400.0,/genres/philosophy|/genres/classics|/genres/no...,dir19/22632.The_Trial_and_Death_of_Socrates.html,18712,The Trial and Death of Socrates
3133,4.3,131,872203492,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-400.0,/genres/philosophy|/genres/classics|/genres/no...,dir32/9462.Complete_Works.html,7454,Complete Works
4475,4.11,281,865163480,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-390.0,/genres/philosophy|/genres/classics|/genres/no...,dir45/73945.Apology.html,11478,Apology
5367,4.07,133,872206335,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-360.0,/genres/philosophy|/genres/classics|/genres/no...,dir54/30292.Five_Dialogues.html,9964,Five Dialogues
