## Pandas - Introduction Part 1

In [1]:
import pandas as pd
import numpy as np

#### Data Structures - Series

In [24]:
"""
Series is a one-dimensional array like structure with homogeneous data
"""
# Build a Series from a NumPy array; with no index given, pandas assigns
# the default integer index 0..n-1.
data = np.array(['a', 'b', 'c', 'd'])
s1 = pd.Series(data)
# print(...) with a single argument prints the same text on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(s1)

0    a
1    b
2    c
3    d
dtype: object


In [25]:
"""
Indexes are assigned to each element in a Series.
By default the indexes are assigned incrementally starting 0
"""
# Show the index of the existing Series s1 as a NumPy array.
print(s1.index.values)
# We can also assign a different set of index values: one label per element.
s1 = pd.Series(data, index=[101, 102, 103, 104])
print(s1)

[0 1 2 3]
101    a
102    b
103    c
104    d
dtype: object


In [27]:
"""
We can also create series from a dictionary data
Here the keys act as indices for the elements in the series
"""
# Series from a dict: the keys become the index labels, the values the data.
data = {'a': 0., 'b': 1., 'c': 2.}
s = pd.Series(data)
print(s)

# We can also make a Series from a single scalar: the value is repeated
# once for every label in the supplied index.
s = pd.Series(5, index=[0, 1, 2, 3])
print(s)

a    0.0
b    1.0
c    2.0
dtype: float64
0    5
1    5
2    5
3    5
dtype: int64


In [39]:
"""
Retrieve data from a series
"""
# Positional (Python-like) and label-based retrieval from a Series.
s = pd.Series(np.arange(100, 106), index=['a', 'b', 'c', 'd', 'e', 'f'])
print(s)
# Positional access goes through .iloc. A bare integer key such as s[0] on a
# label-indexed Series is ambiguous (position vs. label) and is deprecated
# in recent pandas releases.
# Get the first element
print(s.iloc[0])
# Get the first 3 elements
print(s.iloc[0:3])
# Get the last 3 elements
print(s.iloc[-3:])
# Using the labels/index we gave: one label returns a scalar,
# a list of labels returns a Series.
print(s['a'])
print(s[['a', 'b', 'c']])

a    100
b    101
c    102
d    103
e    104
f    105
dtype: int32
100
a    100
b    101
c    102
dtype: int32
d    103
e    104
f    105
dtype: int32
100
a    100
b    101
c    102
dtype: int32


#### Data Structures - Dataframes

In [45]:
"""
DataFrame is a two-dimensional array with heterogeneous data. It is the most commonly used data structure in pandas.
A Dataframe can also be thought of as a container of multiple series
There are multiple ways in which we can create dataframes.
"""
# Four ways of building a DataFrame.

# 1) Using a list of lists: every inner list is one row.
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
# Cast only the numeric column. A frame-wide dtype=float cannot be applied to
# the string column 'Name' and raises on recent pandas versions.
df = pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': float})
print(df)

# 2) Using a dictionary of lists: every key is a column.
# Note: the length of all lists must be the same
dict_data = {"country": ["Brazil", "Russia", "India", "China", "South Africa"],
             "capital": ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
             "area": [8.516, 17.10, 3.286, 9.597, 1.221],
             "population": [200.4, 143.5, 1252, 1357, 52.98]}
brics = pd.DataFrame(dict_data, index=["BR", "RU", "IN", "CH", "SA"])
print(brics)

# 3) Using a list of dictionaries: every dict is one row.
# Note: the dicts may have different keys; missing entries become NaN.
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
print(df)

# 4) Using a dictionary of Series: the Series are aligned on the union of
# their indexes, so 'one' gets NaN for the label 'd' it does not have.
d = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
     'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print(df)

     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0
      area    capital       country  population
BR   8.516   Brasilia        Brazil      200.40
RU  17.100     Moscow        Russia      143.50
IN   3.286  New Dehli         India     1252.00
CH   9.597    Beijing         China     1357.00
SA   1.221   Pretoria  South Africa       52.98
   a   b     c
0  1   2   NaN
1  5  10  20.0
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


In [50]:
"""
Column selection in a dataframe
"""
# Using the column label:
# a single label returns that column as a Series,
# a list of labels returns a DataFrame with just those columns.
print(brics['country'])
print(brics[['country', 'area']])

BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object
         country    area
BR        Brazil   8.516
RU        Russia  17.100
IN         India   3.286
CH         China   9.597
SA  South Africa   1.221


In [53]:
"""
Column addition and deletion in a dataframe
"""
# Start from a two-column frame built out of Series of unequal length.
d = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
     'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
print(df)

# Adding a new column to an existing DataFrame by assigning a Series to a new
# column label; rows whose label is missing from the Series ('d') get NaN.
df['three'] = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(df)

# A new column can also be computed from existing columns.
df['four'] = df['one'] + df['three']
print(df)

# Deleting columns in an existing DataFrame by column label
# using the del statement
del df['one']
print(df)

# using the pop method (it also returns the removed column; unused here)
df.pop('two')
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN
   one  two  three  four
a  1.0    1   10.0  11.0
b  2.0    2   20.0  22.0
c  3.0    3   30.0  33.0
d  NaN    4    NaN   NaN
   two  three  four
a    1   10.0  11.0
b    2   20.0  22.0
c    3   30.0  33.0
d    4    NaN   NaN
   three  four
a   10.0  11.0
b   20.0  22.0
c   30.0  33.0
d    NaN   NaN


In [56]:
"""
Rows can be selected by passing row labels
"""
d = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
     'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
# Select a row by its label
print(df.loc['b'])
# Using integer location
print(df.iloc[2])
# Multiple rows can be selected with a ':' slice; .iloc makes it explicit
# that the slice bounds are positions, not labels.
print(df.iloc[2:4])


one    2.0
two    2.0
Name: b, dtype: float64
one    3.0
two    3.0
Name: c, dtype: float64
   one  two
c  3.0    3
d  NaN    4


In [60]:
"""
Addition and deletion of rows in a dataframe
"""
df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['a', 'b'])

# Stack the rows of df2 under df. pd.concat replaces DataFrame.append, which
# was deprecated and then removed from pandas.
# Note the repeating index labels: each frame keeps its own 0, 1.
df = pd.concat([df, df2])
print(df)

# Drop rows with label 0 (this removes every row carrying that label)
df = df.drop(0)
print(df)

   a  b
0  1  2
1  3  4
0  5  6
1  7  8
   a  b
1  3  4
1  7  8


In [16]:
"""
Another way to create a DataFrame is by importing a csv file using Pandas
"""
# Load the CSV (path is relative to the notebook's working directory)
# and preview its first rows.
ign = pd.read_csv("ign.csv")
print(ign.head())

   Unnamed: 0 score_phrase                                              title  \
0           0      Amazing                            LittleBigPlanet PS Vita   
1           1      Amazing  LittleBigPlanet PS Vita -- Marvel Super Hero E...   
2           2        Great                               Splice: Tree of Life   
3           3        Great                                             NHL 13   
4           4        Great                                             NHL 13   

                                                 url          platform  score  \
0             /games/littlebigplanet-vita/vita-98907  PlayStation Vita    9.0   
1  /games/littlebigplanet-ps-vita-marvel-super-he...  PlayStation Vita    9.0   
2                          /games/splice/ipad-141070              iPad    8.5   
3                      /games/nhl-13/xbox-360-128182          Xbox 360    8.5   
4                           /games/nhl-13/ps3-128181     PlayStation 3    8.5   

        genre editors_choi

### Exploring basics

In [67]:
# Create a dictionary of Series; each key becomes a column
d = {'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
     'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
     'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8])}

# Create a DataFrame
df = pd.DataFrame(d)
print(df)

# Transpose of the DataFrame (rows and columns swapped)
print("Transpose:")
print(df.T)

# The axes of the DataFrame: [row index, column index]
print("Axis:")
print(df.axes)

# The datatype of each column
print("Datatypes:")
print(df.dtypes)

# The shape of the DataFrame as (rows, columns)
print("Shape:")
print(df.shape)

# The size of the DataFrame: rows * columns
print("Total num of elements:")
print(df.size)

# Dump the DataFrame contents as a 2-D NumPy array
print(df.values)

   Age   Name  Rating
0   25    Tom    4.23
1   26  James    3.24
2   25  Ricky    3.98
3   23    Vin    2.56
4   30  Steve    3.20
5   29  Smith    4.60
6   23   Jack    3.80
Transpose:
           0      1      2     3      4      5     6
Age       25     26     25    23     30     29    23
Name     Tom  James  Ricky   Vin  Steve  Smith  Jack
Rating  4.23   3.24   3.98  2.56    3.2    4.6   3.8
Axis:
[RangeIndex(start=0, stop=7, step=1), Index([u'Age', u'Name', u'Rating'], dtype='object')]
Datatypes:
Age         int64
Name       object
Rating    float64
dtype: object
Shape:
(7, 3)
Total num of elements:
21
[[25L 'Tom' 4.23]
 [26L 'James' 3.24]
 [25L 'Ricky' 3.98]
 [23L 'Vin' 2.56]
 [30L 'Steve' 3.2]
 [29L 'Smith' 4.6]
 [23L 'Jack' 3.8]]


In [72]:
"""
Head and tail functions
"""
# View the first and the last n rows of the DataFrame (n=2 here)
print(df.head(2))
print(df.tail(2))

   Age   Name  Rating
0   25    Tom    4.23
1   26  James    3.24
   Age   Name  Rating
5   29  Smith     4.6
6   23   Jack     3.8


In [79]:
"""
Basic functions in a dataframe
"""
# Get a column-wise sum (string columns are concatenated)
print(df.sum())
# Get a row-wise sum. numeric_only=True restricts it to the numeric columns;
# without it recent pandas tries to add numbers to strings and raises.
print(df.sum(axis=1, numeric_only=True))

# Get a column-wise mean over all the rows. A mean is undefined for the
# string column, so it is restricted to the numeric columns as well.
print(df.mean(numeric_only=True))

# Get a summary of all the numerical columns in a DataFrame, just like summary(df) in R
print(df.describe())

# Get a summary of columns selected by datatype ('O' = object columns)
print(df.describe(include=['O']))

Age                                  181
Name      TomJamesRickyVinSteveSmithJack
Rating                             25.61
dtype: object
0    29.23
1    29.24
2    28.98
3    25.56
4    33.20
5    33.60
6    26.80
dtype: float64
Age       25.857143
Rating     3.658571
dtype: float64
             Age    Rating
count   7.000000  7.000000
mean   25.857143  3.658571
std     2.734262  0.698628
min    23.000000  2.560000
25%    24.000000  3.220000
50%    25.000000  3.800000
75%    27.500000  4.105000
max    30.000000  4.600000
         Name
count       7
unique      7
top     Ricky
freq        1


In [82]:
# Renaming columns and index labels
df1 = pd.DataFrame(np.random.randn(6, 3), columns=['col1', 'col2', 'col3'])
print(df1)

print("After renaming the rows and columns:")
# rename() returns a new DataFrame and leaves df1 untouched; labels that are
# not mentioned in the mappings (col3, rows 3-5) keep their names.
print(df1.rename(columns={'col1': 'c1', 'col2': 'c2'},
                 index={0: 'apple', 1: 'banana', 2: 'durian'}))

       col1      col2      col3
0 -0.595097  0.470524  0.279954
1 -0.776857  0.666584 -1.038153
2  1.262017 -1.212780  1.461746
3 -0.821280 -1.319773 -0.285814
4 -1.167121 -0.849286  2.104408
5  0.492654  1.301608 -2.584708
After renaming the rows and columns:
              c1        c2      col3
apple  -0.595097  0.470524  0.279954
banana -0.776857  0.666584 -1.038153
durian  1.262017 -1.212780  1.461746
3      -0.821280 -1.319773 -0.285814
4      -1.167121 -0.849286  2.104408
5       0.492654  1.301608 -2.584708


In [91]:
"""
Iterating through the dataframe using the simple python loops
"""
df = pd.DataFrame(np.random.randn(4, 3), columns=['col1', 'col2', 'col3'])

# Iterating over a DataFrame directly yields its column labels
for col in df:
    print(col)

print("\nIterating over columns")
# Iterating over columns as (label, Series) pairs. items() replaces
# iteritems(), which was deprecated and then removed from pandas.
for key, value in df.items():
    # Pre-formatted so the text is the same on Python 2 and Python 3
    print("{} \n{}".format(key, value))

print("\nIterating over rows")
# Iterating over rows as (row label, Series) pairs
for row_index, row in df.iterrows():
    print("{} \n{}".format(row_index, row))

col1
col2
col3

Iterating over columns
col1 
0   -0.826366
1    1.471914
2   -0.729959
3   -1.010754
Name: col1, dtype: float64
col2 
0   -0.173026
1    0.317008
2    1.141356
3   -0.547795
Name: col2, dtype: float64
col3 
0   -0.371337
1   -0.087639
2   -0.151336
3    0.074584
Name: col3, dtype: float64

Iterating over rows
0 
col1   -0.826366
col2   -0.173026
col3   -0.371337
Name: 0, dtype: float64
1 
col1    1.471914
col2    0.317008
col3   -0.087639
Name: 1, dtype: float64
2 
col1   -0.729959
col2    1.141356
col3   -0.151336
Name: 2, dtype: float64
3 
col1   -1.010754
col2   -0.547795
col3    0.074584
Name: 3, dtype: float64
   col1      col2      col3
0  10.0 -0.173026 -0.371337
1  10.0  0.317008 -0.087639
2  10.0  1.141356 -0.151336
3  10.0 -0.547795  0.074584


In [97]:
"""
Sorting the dataframe
"""
# A frame whose row labels and column labels are both deliberately out of order
unsorted_df = pd.DataFrame(np.random.randn(10, 2),
                           index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
                           columns=['col2', 'col1'])

# Sort by row index (row labels), descending
print("\nBy Row labels")
sorted_df = unsorted_df.sort_index(ascending=False)
print(sorted_df)

# Sort by column labels
print("\nBy Columns labels")
sorted_df = unsorted_df.sort_index(axis=1)
print(sorted_df)

# Sort by the DataFrame values: by col1 first, ties broken by col2
print("\nBy Values")
sorted_df = unsorted_df.sort_values(by=['col1', 'col2'])
print(sorted_df)


By Row labels
       col2      col1
9  0.650655 -0.945977
8 -3.015635  0.993880
7  0.711635 -0.627116
6 -0.892861  0.939077
5  0.015696 -0.115436
4 -1.420958 -0.916836
3  0.534063 -0.059460
2  0.390587  0.124343
1  0.089525 -0.390213
0  0.003064  0.415790

By Columns labels
       col1      col2
1 -0.390213  0.089525
4 -0.916836 -1.420958
6  0.939077 -0.892861
2  0.124343  0.390587
3 -0.059460  0.534063
5 -0.115436  0.015696
9 -0.945977  0.650655
8  0.993880 -3.015635
0  0.415790  0.003064
7 -0.627116  0.711635

By Values
       col2      col1
9  0.650655 -0.945977
4 -1.420958 -0.916836
7  0.711635 -0.627116
1  0.089525 -0.390213
5  0.015696 -0.115436
3  0.534063 -0.059460
2  0.390587  0.124343
0  0.003064  0.415790
6 -0.892861  0.939077
8 -3.015635  0.993880
