In [1]:
import pandas as pd

## What is Pandas

Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame. 
DataFrames are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types and/or missing data.

### Pandas Series

A `Series` is a one-dimensional array of indexed data. It can be created from a list or array:


In [2]:
# Build a Series from a range: start at 4, step by 6, stop before 45.
# No index is given, so pandas labels the elements 0, 1, 2, ...
values = range(4, 45, 6)
data = pd.Series(values)
print(data)

0     4
1    10
2    16
3    22
4    28
5    34
6    40
dtype: int64


In [3]:
# Replace the existing labels with the integers 2..8 (one label per element).
new_labels = range(2, 9)
data.index = new_labels
data

2     4
3    10
4    16
5    22
6    28
7    34
8    40
dtype: int64

In [4]:
# Index labels do not have to be integers: relabel the seven elements 'a'..'g'.
data.index = list('abcdefg')
data

a     4
b    10
c    16
d    22
e    28
f    34
g    40
dtype: int64

In [5]:
# A Series can also be built from a dict: the keys become the index labels
# and the values become the data.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

## DataFrame

### Create a DataFrame

In [34]:
# Creating dataframe from dictionary: each Series becomes one column, and the
# state names shared by both Series become the row labels.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
data = pd.DataFrame(dict(area=area, pop=pop))
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [None]:
# Create dataframe from list: pair the two lists up element by element so that
# each pair becomes one row, then name the two resulting columns.
lst = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']
lst2 = [11, 22, 33, 44, 55, 66, 77]

rows = list(zip(lst, lst2))
df = pd.DataFrame(rows, columns=['Name', 'val'])
df

### Import a dataset

In [13]:
# Try to read the whole CSV file.
# With this copy of books.csv the parser fails on a row that has more fields
# than the header (see the ParserError shown below). The error is the point of
# this cell, so catch it and display it instead of letting it abort
# "Restart & Run All". Any other failure (e.g. a missing file) still raises.
try:
    ds = pd.read_csv('books.csv')
except pd.errors.ParserError as err:
    print(f"{type(err).__name__}: {err}")

ParserError: Error tokenizing data. C error: Expected 10 fields in line 4012, saw 11


In [11]:
# Read only the first 1000 data rows of the comma-separated file.
# (For an Excel workbook the analogous call is pd.read_excel('your_excel_file').)
ds = pd.read_csv(
    'books.csv',
    sep=',',
    nrows=1000,
)

In [14]:
# Preview the first five rows of the loaded frame.
ds.head(n=5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,# num_pages,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling-Mary GrandPré,4.56,0439785960,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling-Mary GrandPré,4.49,0439358078,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling-Mary GrandPré,4.47,0439554934,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.41,0439554896,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling-Mary GrandPré,4.55,043965548X,9780439655484,eng,435,2149872,33964


In [15]:
# Show the column labels of the loaded frame (returned as a pandas Index).
ds.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '# num_pages', 'ratings_count', 'text_reviews_count'],
      dtype='object')

In [17]:
# Select a single column by its label; all rows are kept.
ds.loc[:, 'title']

0      Harry Potter and the Half-Blood Prince (Harry ...
1      Harry Potter and the Order of the Phoenix (Har...
2      Harry Potter and the Sorcerer's Stone (Harry P...
3      Harry Potter and the Chamber of Secrets (Harry...
4      Harry Potter and the Prisoner of Azkaban (Harr...
5      Harry Potter Boxed Set  Books 1-5 (Harry Potte...
6      Unauthorized Harry Potter Book Seven News: "Ha...
7           Harry Potter Collection (Harry Potter  #1-6)
8      The Ultimate Hitchhiker's Guide: Five Complete...
9          The Ultimate Hitchhiker's Guide to the Galaxy
10     The Hitchhiker's Guide to the Galaxy (Hitchhik...
11     The Hitchhiker's Guide to the Galaxy (Hitchhik...
12     The Ultimate Hitchhiker's Guide (Hitchhiker's ...
13                  A Short History of Nearly Everything
14                           Bill Bryson's African Diary
15     Bryson's Dictionary of Troublesome Words: A Wr...
16                                In a Sunburned Country
17     I'm a Stranger Here Myse

### Rename columns

In [27]:
# Rename the 'authors' column to 'writers'.
# The rename is done by label rather than by position, so it cannot overwrite
# the wrong column if the column order differs, and re-running the cell is
# harmless: once 'authors' is gone, rename() simply finds nothing to change.
previous_cols = list(ds.columns)
print("These are the previous columns: ", previous_cols)

ds = ds.rename(columns={'authors': 'writers'})
cols = list(ds.columns)  # kept under the original name: the current column list
print("These are the columns now: ", cols)

These are the previous columns:  ['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', '# num_pages', 'ratings_count', 'text_reviews_count']
These are the columns now:  ['bookID', 'title', 'writers', 'average_rating', 'isbn', 'isbn13', 'language_code', '# num_pages', 'ratings_count', 'text_reviews_count']


### Getting some quantitative info

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the frame.
ds.describe()

Unnamed: 0,bookID,average_rating,isbn13,# num_pages,ratings_count,text_reviews_count
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1421.079,3.94098,9762143000000.0,367.865,48476.69,1113.084
std,880.934502,0.346679,418872000000.0,282.998624,266883.1,4377.365644
min,1.0,0.0,49086010000.0,0.0,0.0,0.0
25%,636.5,3.77,9780312000000.0,200.0,76.0,7.0
50%,1462.5,3.95,9780553000000.0,304.0,795.5,54.5
75%,2151.25,4.12,9780817000000.0,455.25,6348.75,289.25
max,2917.0,5.0,9789953000000.0,3342.0,5629932.0,70390.0


In [None]:
# Arithmetic mean of the ratings_count column.
ds.loc[:, 'ratings_count'].mean()

In [None]:
# Median (50th percentile) of the ratings_count column.
ds.loc[:, 'ratings_count'].median()

In [30]:
# Largest value in the ratings_count column.
ds.loc[:, 'ratings_count'].max()

5629932

In [32]:
# Smallest value in the ratings_count column.
ds.loc[:, 'ratings_count'].min()

0

### Indexing

In [21]:
# Positional indexing: fetch the row at position 3 (the fourth row) with all
# of its columns.
ds.iloc[3, :]

bookID                                                                4
title                 Harry Potter and the Chamber of Secrets (Harry...
authors                                                    J.K. Rowling
average_rating                                                     4.41
isbn                                                         0439554896
isbn13                                                    9780439554893
language_code                                                       eng
# num_pages                                                         352
ratings_count                                                      6267
text_reviews_count                                                  272
Name: 3, dtype: object