In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Pandas Foundations

A DataFrame is like a spreadsheet, and a Series is a single column of data inside the DataFrame; each column (Series) holds exactly one data type.

Both DataFrame and Series have an Index component, which is one key reason they are so special and handy to use.

A DataFrame consists of three components: the index, the columns, and the data (values). A Series has an index and data (values), but no columns.

When you need to select one specific data element or a subset of the data, use the index label and the column name to locate the target.

In [6]:
# Load the BikeRiding2012 CSV into a DataFrame indexed by its 'Date' column.
# parse_dates converts that column to datetimes; dayfirst=True makes the
# parser read ambiguous dates as day/month rather than month/day.
data = pd.read_csv(
    'BikeRiding2012.csv',
    index_col='Date',
    parse_dates=['Date'],
    dayfirst=True,
)

In [7]:
# Preview the first five rows to get a feel for the data (5 is head()'s default).
data.head(n=5)

Unnamed: 0_level_0,Unnamed: 1,Rachel / Papineau,Berri1,Maisonneuve_2,Maisonneuve_1,Brébeuf,Parc,PierDup,CSC (Côte Sainte-Catherine),Pont_Jacques_Cartier
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-01-01,00:00,16,35,51,38,5.0,26,10,0,27.0
2012-01-02,00:00,43,83,153,68,11.0,53,6,1,21.0
2012-01-03,00:00,58,135,248,104,2.0,89,3,2,15.0
2012-01-04,00:00,61,144,318,116,2.0,111,8,1,19.0
2012-01-05,00:00,95,197,330,124,6.0,97,13,2,16.0


In [8]:
data.index # in this example, the index labels are dates.

DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03', '2012-01-04',
               '2012-01-05', '2012-01-06', '2012-01-07', '2012-01-08',
               '2012-01-09', '2012-01-10',
               ...
               '2012-12-22', '2012-12-23', '2012-12-24', '2012-12-25',
               '2012-12-26', '2012-12-27', '2012-12-28', '2012-12-29',
               '2012-12-30', '2012-12-31'],
              dtype='datetime64[ns]', name='Date', length=366, freq=None)

In [9]:
data.columns  # the column labels of the DataFrame

Index(['Unnamed: 1', 'Rachel / Papineau', 'Berri1', 'Maisonneuve_2',
       'Maisonneuve_1', 'Brébeuf', 'Parc', 'PierDup',
       'CSC (Côte Sainte-Catherine)', 'Pont_Jacques_Cartier'],
      dtype='object')

In [12]:
data.values  # the underlying data as a NumPy array (dtype=object here because the columns have mixed types)

array([['00:00', 16, 35, ..., 10, 0, 27.0],
       ['00:00', 43, 83, ..., 6, 1, 21.0],
       ['00:00', 58, 135, ..., 3, 2, 15.0],
       ..., 
       ['00:00', 0, 27, ..., 0, 0, nan],
       ['00:00', 0, 5, ..., 0, 0, nan],
       ['00:00', 0, 4, ..., 0, 0, nan]], dtype=object)

In [14]:
data.dtypes  # the data type of each column

Unnamed: 1                      object
Rachel / Papineau                int64
Berri1                           int64
Maisonneuve_2                    int64
Maisonneuve_1                    int64
Brébeuf                        float64
Parc                             int64
PierDup                          int64
CSC (Côte Sainte-Catherine)      int64
Pont_Jacques_Cartier           float64
dtype: object

In [15]:
# Count how many columns there are of each dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# dtypes.value_counts() is the documented replacement and works on every version.
# Note: the result is ordered by count (most frequent dtype first).
data.dtypes.value_counts()

float64    2
int64      7
object     1
dtype: int64

In [16]:
# Selecting a single column by label returns a Series
# (attribute access, data.Parc, is an alternative spelling).
ParcData = data['Parc']

In [17]:
ParcData.head()    # preview the first five rows of the Series

Date
2012-01-01     26
2012-01-02     53
2012-01-03     89
2012-01-04    111
2012-01-05     97
Name: Parc, dtype: int64

In [18]:
type(ParcData)  # a single selected column is a pandas Series

pandas.core.series.Series

In [21]:
# Pull out one integer column and one object (string) column so the two
# dtypes can be compared side by side. The variable names are chosen to
# emphasize that difference; they are admittedly not great names.
intSeries = data['Berri1']
objSeries = data['Unnamed: 1']

In [22]:
objSeries.value_counts() # frequency of each distinct value; this column holds only one value.

00:00    366
Name: Unnamed: 1, dtype: int64

In [25]:
intSeries.describe()  # summary statistics: count, mean, std, min, quartiles, max

count     366.000000
mean     2678.234973
std      2149.301945
min         4.000000
25%       433.500000
50%      2326.000000
75%      4642.750000
max      7104.000000
Name: Berri1, dtype: float64

In [26]:
objSeries.count()  # number of non-null entries

366

In [28]:
objSeries.size  # total number of entries, null or not

366

In [31]:
intSeries.isnull()  # boolean Series: True wherever the value is missing

Date
2012-01-01    False
2012-01-02    False
2012-01-03    False
2012-01-04    False
2012-01-05    False
2012-01-06    False
2012-01-07    False
2012-01-08    False
2012-01-09    False
2012-01-10    False
2012-01-11    False
2012-01-12    False
2012-01-13    False
2012-01-14    False
2012-01-15    False
2012-01-16    False
2012-01-17    False
2012-01-18    False
2012-01-19    False
2012-01-20    False
2012-01-21    False
2012-01-22    False
2012-01-23    False
2012-01-24    False
2012-01-25    False
2012-01-26    False
2012-01-27    False
2012-01-28    False
2012-01-29    False
2012-01-30    False
              ...  
2012-12-02    False
2012-12-03    False
2012-12-04    False
2012-12-05    False
2012-12-06    False
2012-12-07    False
2012-12-08    False
2012-12-09    False
2012-12-10    False
2012-12-11    False
2012-12-12    False
2012-12-13    False
2012-12-14    False
2012-12-15    False
2012-12-16    False
2012-12-17    False
2012-12-18    False
2012-12-19    False
2012-12-20    F

In [32]:
intSeries.isnull().sum() # number of null values in the Series (each True counts as 1); in this case there are none.

0