# pandas

# Python Data Analysis Library

pandas is an open source library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. It has two main objects to represent data: Series and DataFrame.

Reference:

http://pandas.pydata.org/



In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Working with Series: a Series is an array-like object.

Series is a one-dimensional labeled array capable of holding any data type. 


#### pd.Series(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)

In [6]:
# Build a Series from a plain Python list of the integers 1 through 5;
# no index is passed, so pandas labels the rows 0..4.
x = pd.Series(list(range(1, 6)))
x

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Basic Operation

In [7]:
x + 100

0    101
1    102
2    103
3    104
4    105
dtype: int64

In [8]:
(x ** 2) + 100

0    101
1    104
2    109
3    116
4    125
dtype: int64

In [9]:
y = x > 2
y

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [10]:
y.any()

True

In [11]:
y.all()

False

## apply()

In [12]:
def ourfunction(x):
    """Scale a number by a factor chosen from its parity.

    Values whose remainder modulo 2 equals 1 are tripled; every other
    value is doubled.
    """
    factor = 3 if x % 2 == 1 else 2
    return x * factor

x.apply(ourfunction)

0     3
1     4
2     9
3     8
4    15
dtype: int64

### Avoid looping over your data

In [13]:
%%timeit

ds = pd.Series(range(10000))

# Element-by-element Python loop: each value is read, transformed and
# written back individually.
for label in ds.index:
    ds[label] = ourfunction(ds[label])

1 loop, best of 3: 132 ms per loop


In [14]:
%%timeit

# Same transformation as the explicit loop, but handed to pandas in a
# single apply() call.
ds = pd.Series(range(10000)).apply(ourfunction)

The slowest run took 5.29 times longer than the fastest. This could mean that an intermediate result is being cached.
100 loops, best of 3: 4.58 ms per loop


In [15]:
x.astype(np.float64)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

## copy()

In [16]:
y = x

In [17]:
y[0]

1

In [18]:
y[0] = 100

In [19]:
y

0    100
1      2
2      3
3      4
4      5
dtype: int64

In [20]:
x

0    100
1      2
2      3
3      4
4      5
dtype: int64

In [21]:
y = x.copy()

In [22]:
x[0]=1

In [23]:
x

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [24]:
y

0    100
1      2
2      3
3      4
4      5
dtype: int64

In [25]:
x.describe()

count    5.000000
mean     3.000000
std      1.581139
min      1.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      5.000000
dtype: float64

# DataFrame
### DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
### Series is the data structure for a single column of a DataFrame.

## pd.DataFrame(self, data=None, index=None, columns=None, dtype=None, copy=False)

In [38]:
# A single column named "col1" holding the integers 1 through 9.
data = list(range(1, 10))
df1 = pd.DataFrame({"col1": data})

In [39]:
df1

Unnamed: 0,col1
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9


# Selecting Data: `df["col1"]`

In [40]:
df1["col1"]

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
Name: col1, dtype: int64

In [41]:
df1["col1"][0]

1

## Adding extra columns

In [42]:
df1["col1_sequared"] = df1["col1"] ** 2
df1

Unnamed: 0,col1,col1_sequared
0,1,1
1,2,4
2,3,9
3,4,16
4,5,25
5,6,36
6,7,49
7,8,64
8,9,81


In [43]:
# Vectorised arithmetic: the scalar 3 is added to every element of the column.
df1["col1_plus3"] = df1["col1"] + 3
# math.factorial handles one scalar at a time, so it is applied element-wise.
# The standard-library function is used directly: `np.math` was only an
# undocumented alias of the `math` module and no longer exists in NumPy 2.0+.
df1["col1_factorial"] = df1["col1"].apply(math.factorial)
df1

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial
0,1,1,4,1
1,2,4,5,2
2,3,9,6,6
3,4,16,7,24
4,5,25,8,120
5,6,36,9,720
6,7,49,10,5040
7,8,64,11,40320
8,9,81,12,362880


In [44]:
# NOTE: despite the column name, this stores the remainder of col1 divided
# by 2, so 1 marks an odd value and 0 marks an even one.
df1["is_even"] = df1["col1"].mod(2)
df1

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial,is_even
0,1,1,4,1,1
1,2,4,5,2,0
2,3,9,6,6,1
3,4,16,7,24,0
4,5,25,8,120,1
5,6,36,9,720,0
6,7,49,10,5040,1
7,8,64,11,40320,0
8,9,81,12,362880,1


# map()

In [45]:
# Translate the 0/1 remainder stored in "is_even" into a readable label.
parity_labels = {0: "even", 1: "odd"}
df1["odd_even"] = df1["is_even"].map(parity_labels)
df1

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial,is_even,odd_even
0,1,1,4,1,1,odd
1,2,4,5,2,0,even
2,3,9,6,6,1,odd
3,4,16,7,24,0,even
4,5,25,8,120,1,odd
5,6,36,9,720,0,even
6,7,49,10,5040,1,odd
7,8,64,11,40320,0,even
8,9,81,12,362880,1,odd


## drop()

In [46]:
# drop() returns a new frame, so the result is assigned back to df1.
# The column is named through the `columns=` keyword: passing the axis
# positionally (`drop("is_even", 1)`) is rejected by pandas 2.0 and later.
df1 = df1.drop(columns="is_even")
df1

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial,odd_even
0,1,1,4,1,odd
1,2,4,5,2,even
2,3,9,6,6,odd
3,4,16,7,24,even
4,5,25,8,120,odd
5,6,36,9,720,even
6,7,49,10,5040,odd
7,8,64,11,40320,even
8,9,81,12,362880,odd


## Multi Column Select

In [47]:
df1[["col1", "odd_even"]]

Unnamed: 0,col1,odd_even
0,1,odd
1,2,even
2,3,odd
3,4,even
4,5,odd
5,6,even
6,7,odd
7,8,even
8,9,odd


## Controlling display options

In [56]:
# Allow up to 60 columns, but cap the number of rows shown at 6.
pd.set_option("display.max_columns", 60)
pd.set_option("display.max_rows", 6)
# Render frames as HTML tables in the notebook.
pd.set_option("display.notebook_repr_html", True)
df1

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial,odd_even
0,1,1,4,1,odd
1,2,4,5,2,even
2,3,9,6,6,odd
...,...,...,...,...,...
6,7,49,10,5040,odd
7,8,64,11,40320,even
8,9,81,12,362880,odd


## Filtering

In [57]:
df1[df1["odd_even"] == "odd"]

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial,odd_even
0,1,1,4,1,odd
2,3,9,6,6,odd
4,5,25,8,120,odd
6,7,49,10,5040,odd
8,9,81,12,362880,odd


## Chaining Filters
 
## OR

In [58]:
df1[(df1.odd_even == "even") | (df1.col1  < 5)]

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial,odd_even
0,1,1,4,1,odd
1,2,4,5,2,even
2,3,9,6,6,odd
3,4,16,7,24,even
5,6,36,9,720,even
7,8,64,11,40320,even


## AND

In [59]:
df1[(df1.odd_even == "even") & (df1.col1 < 5)]

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial,odd_even
1,2,4,5,2,even
3,4,16,7,24,even


## Further Chaining

In [60]:
# Build the row mask once, select the column with a single .loc lookup,
# then keep only the first matching row.
is_small_even = (df1.odd_even == "even") & (df1.col1 < 5)
df1.loc[is_small_even, "col1_plus3"].iloc[:1]

1    5
Name: col1_plus3, dtype: int64

In [61]:
df1.describe()

Unnamed: 0,col1,col1_sequared,col1_plus3,col1_factorial
count,9.000000,9.000000,9.000000,9.000000
mean,5.000000,31.666667,8.000000,45457.000000
std,2.738613,28.080242,2.738613,119758.341137
...,...,...,...,...
50%,5.000000,25.000000,8.000000,120.000000
75%,7.000000,49.000000,10.000000,5040.000000
max,9.000000,81.000000,12.000000,362880.000000


## Reading Data from CSV/TSV Files

In [77]:
titanic_data = pd.read_csv('titanic.csv')

In [78]:
titanic_data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,
1308,3.0,0.0,"Zimmerman, Mr. Leo",male,29.0000,0.0,0.0,315082,7.8750,,S,,,
1309,,,,,,,,,,,,,,


# References:

http://pandas.pydata.org/

http://nbviewer.jupyter.org/github/TwistedHardware/mltutorial/tree/master/notebooks/IPython-Tutorial/