## Intro to the DataFrames III Module + Import Dataset

In [1]:
import pandas as pd

In [8]:
bond = pd.read_csv('jamesbond.csv')
bond.head()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


## The .set_index() and .reset_index() Methods

In [9]:
## We can set an index for the dataframe when we read the file: pd.read_csv('jamesbond.csv', index_col='Film')
## .set_index() function is called on a dataframe
bond.set_index(keys = 'Film', inplace = True)
bond.head(3)
## keys parameter in .set_index() can be one string or a list of strings(multi-index)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [10]:
## .reset_index() function move the current index back into a column position(by default) and create the standard index
bond.reset_index().head(3)
bond.reset_index(inplace = True, drop = False)
bond.head(3)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [29]:
bond.reset_index(drop = True).head(3)
## drop = True will drop the old index and create new standard number index

Unnamed: 0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [42]:
bond.set_index('Film', inplace = True)
bond.set_index('Year').head(3)   ## if we set_index to another column, the Year will replace the Film, and Film will be overwritten

Unnamed: 0_level_0,Actor,Director,Box Office,Budget,Bond Actor Salary
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1962,Sean Connery,Terence Young,448.8,7.0,0.6
1963,Sean Connery,Terence Young,543.8,12.6,1.6
1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [49]:
## To solve this, we need to reset the index to be number and then set the index to be Year
bond.set_index('Film', inplace = True)
bond.reset_index(inplace = True)
bond.set_index('Year', inplace = True)
bond.head(3)

Unnamed: 0_level_0,Film,Actor,Director,Box Office,Budget,Bond Actor Salary
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962,Dr. No,Sean Connery,Terence Young,448.8,7.0,0.6
1963,From Russia with Love,Sean Connery,Terence Young,543.8,12.6,1.6
1964,Goldfinger,Sean Connery,Guy Hamilton,820.4,18.6,3.2


## Retrieve Rows by Index Label with .loc[]

In [72]:
bond = pd.read_csv('jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)   ## sorting the index for large dataframes can faster program and more efficient
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [57]:
## .loc[] is called directly on dataframe to extract rows, put the index into the bracket
bond.loc['GoldenEye']
bond.loc['Goldfinger']  ## put a single value extract a series

Year                         1964
Actor                Sean Connery
Director             Guy Hamilton
Box Office                  820.4
Budget                       18.6
Bond Actor Salary             3.2
Name: Goldfinger, dtype: object

In [59]:
## cite the file(index) does not exist
# bond.loc['Sacred Bond']  ## Give an error

In [60]:
## Cite the index with more than one row in dataframe, it will extract all of them
bond.loc['Casino Royale']

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [76]:
## Extract sequential rows, remind: that 'Moonraker' here is inclusive in the output unlike extract in the series/list/arrays
bond.loc['Diamonds Are Forever' : 'Moonraker'].head(3)
bond.loc['GoldenEye': ].head(3)   ## extract from the GoldenEye to the end
bond.loc[: "On Her Majesty's Secret Service"].tail(3)  ## Extract from the very beginning to the Majesty's

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6


In [78]:
bond.loc[[ 'Octopussy', 'Moonraker']]  ## extract multiple rows not in order

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,


In [79]:
##  Last movie does not exist, but it wll included in the output, be careful, like the .get() function
bond.loc[['For Your Eyes Only', 'Live and Let Die', 'Gold Bond']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  from ipykernel import kernelapp as app


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
For Your Eyes Only,1981.0,Roger Moore,John Glen,449.4,60.2,
Live and Let Die,1973.0,Roger Moore,Guy Hamilton,460.3,30.8,
Gold Bond,,,,,,


In [83]:
'Gold Bond' in bond.index

False

## Retrieve Rows by Index Position with .iloc[]

In [88]:
## .iloc[] extract rows by index position
bond = pd.read_csv('jamesbond.csv')
bond.head(3)    ## Here the index label is the same as the index position, thus .loc[] and .iloc[] are the same
bond.loc[15]
bond.iloc[15]

Film                 A View to a Kill
Year                             1985
Actor                     Roger Moore
Director                    John Glen
Box Office                      275.2
Budget                           54.5
Bond Actor Salary                 9.1
Name: 15, dtype: object

In [93]:
bond.iloc[[15, 20]]
bond.iloc[: 4]
bond.iloc[4 : 8]
bond.iloc[20 : ]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
20,The World Is Not Enough,1999,Pierce Brosnan,Michael Apted,439.5,158.3,13.5
21,Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
22,Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
23,Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
24,Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
25,Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


In [101]:
bond = pd.read_csv('jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [102]:
## Even though now we have string index labels, each of these index labels is still assigned an index position
bond.iloc[0]

Year                        1985
Actor                Roger Moore
Director               John Glen
Box Office                 275.2
Budget                      54.5
Bond Actor Salary            9.1
Name: A View to a Kill, dtype: object

In [106]:
bond.iloc[15:20]
bond.iloc[: 8]
bond.iloc[18 : ]
bond.iloc[[5, 10, 15, 20]]
## .iloc[] extracts rows from the dataframe based on the index position/location, regardless of what we are actually 
## using as the index. If we set up a custom index(Film, Year), each index label still gonna be assigned a numeric position.

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,


## The Catch-All .ix[] Method

In [11]:
## .ix[] combines the .loc[] and .iloc[] together
bond = pd.read_csv('jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)
bond.ix['GoldenEye']   ## same as bond.loc['GoldenEye]
bond.ix[['Diamonds Are Forever', 'Moonraker', 'Spectre']]
## bond.ix['A View to a Kill' : 'The World Is Not Enough']  ## The World Is Not Enough is inclusive

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


In [120]:
##  bond.ix['Sacred Bond'] does not exist which gives an error output
##  bond.ix[['Spectre', 'Sacred Bond']]  Error Message
'Spctre' in bond.index
'Sacred Bond' in bond.index
bond.loc[['Spectre', 'Sacred Bond']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spectre,2015.0,Daniel Craig,Sam Mendes,726.7,206.3,
Sacred Bond,,,,,,


In [126]:
bond.ix[10]
bond.ix[10:15]  ## Only show the record until index position of 14
bond.ix[[8, 16 , 24]]
## bond.ix[100]  ## Gives Error Message
## bond.ix[[8, 30]]   ## Still gives an Error Message, unlike passing the trings to .ix[]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
Tomorrow Never Dies,1997,Pierce Brosnan,Roger Spottiswoode,463.2,133.9,10.0


## Second Arguments to .loc[], iloc[], and .ix[] Methods

In [131]:
bond.loc['Moonraker', 'Actor']
## It will find the row with Moonraker index label and then find the column with 'Actor' and extract the value in 'Actor'
bond.loc['Moonraker', 'Director']

'Lewis Gilbert'

In [133]:
## Pull more than one columns
bond.loc['Moonraker', ['Director', 'Actor']]
bond.loc['Moonraker', 'Director' : 'Budget']

Director      Lewis Gilbert
Box Office              535
Budget                 91.5
Name: Moonraker, dtype: object

In [137]:
##  use the .iloc[] to extract value in specific column, we need to find the location of this column because .iloc[] 
## only accepts integers in its arguments
bond.iloc[0, 2]
bond.iloc[14, 2 : 5]
bond.iloc[14, [5, 3, 2]]

Bond Actor Salary          7.8
Box Office               373.8
Director             John Glen
Name: Octopussy, dtype: object

In [141]:
bond.ix[20, 'Budget']
bond.ix[5, 3]
## .ix[] can mix the number and  string
bond.ix['The Man with the Golden Gun', ['Actor', 'Budget']]
bond.ix['The Man with the Golden Gun', : 4]
bond.ix['The Man with the Golden Gun', 2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the document

'Guy Hamilton'

## Set New Values for a Specific Cell or Row

In [15]:
bond.loc['Dr. No']

Year                          1962
Actor                 Sean Connery
Director             Terence Young
Box Office                   448.8
Budget                           7
Bond Actor Salary              0.6
Name: Dr. No, dtype: object

In [19]:
bond.loc['Dr. No', 'Actor']   ## Sean Connery
bond.loc['Dr. No', 'Actor'] = 'Sir Sean Connery'
bond.loc['Dr. No']  ## Now, 'Actor' is changed to 'Sir Sean Connery

Year                             1962
Actor                Sir Sean Connery
Director                Terence Young
Box Office                      448.8
Budget                              7
Bond Actor Salary                 0.6
Name: Dr. No, dtype: object

In [24]:
## Assign multiple values can pass them into a list
bond.loc['Dr. No', ['Box Office', 'Budget', 'Bond Actor Salary']] = [448800000, 7000000, 600000]

Box Office           448.8
Budget                   7
Bond Actor Salary      0.6
Name: Dr. No, dtype: object

### Set Multiple Values in DataFrame(Across Multiple Rows)

In [38]:
bond = pd.read_csv('jamesbond.csv', index_col='Film')
bond.sort_index(inplace = True)
bond.head(2)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3


In [30]:
## Wrong way to change values for multiple rows
bond[bond['Actor'] == 'Sean Connery']['Actor']='Sir Sean Connery'  ##useless
## This gives a warning: A value is trying to be set on a copy of a slice from a DataFrame
## The original values are not updated at all, the reason is due to bond[bond['Actor'=='Sean Connery']] will produce a 
## new DataFrame, what we have done was just modified the second dataframe(not stored), not the original one

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [32]:
df2 = bond[bond['Actor'] == 'Sean Connery']  ## Creates a brand new copy of this slice
df2['Actor'] = 'Sir Sean Connery'  ## operations on this copy will only affect the copy
df2.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sir Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sir Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sir Sean Connery,Terence Young,543.8,12.6,1.6


In [40]:
bond.loc[bond['Actor'] == 'Sean Connery']
## We can pass a boolean list to loc[], though output looks like previous code, this is not a copy of the original dataframe
## It is a view of bond dataframe that have a value of 'Sean Connery', when modify this, it will affect the values in bond

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6


In [41]:
bond.loc[bond['Actor'] == 'Sean Connery', 'Actor'] = 'Sir Sean Connery'  ## this operation affects the original dataframe

In [42]:
bond.head()
## be cautious about whether you are creating a copy or not a copy

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sir Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


### Rename Index Labels or Columns in a DataFrame

In [43]:
bond = pd.read_csv('jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)

In [44]:
## rename() function to rename the columns, df.rename(index = , columns = , inplace = )
## pass a dictionary to arguments, {old_col_name : new_col_name} --  Change column names
bond.rename(columns = {'Year' : 'Release Date', 'Box Office' : 'Revenue'}, inplace = True)
bond.head(1)

Unnamed: 0_level_0,Release Date,Actor,Director,Revenue,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1


In [45]:
## Change index labels
bond.rename(index = {'Dr. No' : 'Doctor No',
                     'GoldenEye' : 'Golden Eye',
                    'The World Is Not Enough' : 'Best Bond Movie Ever'}, inplace = True)
bond.loc['Best Bond Movie Ever']

Release Date                   1999
Actor                Pierce Brosnan
Director              Michael Apted
Revenue                       439.5
Budget                        158.3
Bond Actor Salary              13.5
Name: Best Bond Movie Ever, dtype: object

In [49]:
## Another to assign new value to columns is to use .columns attribute
## The bad side is that we need to rename all the columns, we cannot pick one 
bond.columns[1] = 'Actors'  ## We can not do this, Index does not support mutable operations

TypeError: Index does not support mutable operations

In [50]:
bond.columns = ['Year of Release', 'Actor', 'Director', 'Gross', 'Cost', 'Salary']
bond.head(1)
## We must write the whole list of columns names no matter you want to change or not

Unnamed: 0_level_0,Year of Release,Actor,Director,Gross,Cost,Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1


## Delete Rows or Columns from a DataFrame

In [86]:
bond = pd.read_csv('jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)

In [54]:
## Use .drop() function directly called on dataframe to drop columns/rows, axis=0/'index' -- drop the rows
bond.drop('A View to a Kill', inplace = True)
'A View to a Kill' in bond.index

False

In [74]:
## Remove multiple rows, use inplace = True to overwrite original dataset
bond.drop(['A View to a Kill', 'Die Another Day', 'From Russia with Love'])
bond.drop('Casino Royale').head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [75]:
## Remove multiple columns, we need to change the axis to columns
bond.drop(['Box Office', 'Bond Actor Salary'], axis = 'columns').head(3)

Unnamed: 0_level_0,Year,Actor,Director,Budget
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A View to a Kill,1985,Roger Moore,John Glen,54.5
Casino Royale,2006,Daniel Craig,Martin Campbell,145.3
Casino Royale,1967,David Niven,Ken Hughes,85.0


In [80]:
## another way to remove columns, .pop() function is a permanent function used to remove a single series
## .pop() returns the removed series(which we can store), and also remove that series from original dataframe
actor = bond.pop('Actor')
bond.head(1)

Unnamed: 0_level_0,Year,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A View to a Kill,1985,John Glen,275.2,54.5,9.1


In [81]:
actor

Film
A View to a Kill                      Roger Moore
Casino Royale                        Daniel Craig
Casino Royale                         David Niven
Diamonds Are Forever                 Sean Connery
Die Another Day                    Pierce Brosnan
Dr. No                               Sean Connery
For Your Eyes Only                    Roger Moore
From Russia with Love                Sean Connery
GoldenEye                          Pierce Brosnan
Goldfinger                           Sean Connery
Licence to Kill                    Timothy Dalton
Live and Let Die                      Roger Moore
Moonraker                             Roger Moore
Never Say Never Again                Sean Connery
Octopussy                             Roger Moore
On Her Majesty's Secret Service    George Lazenby
Quantum of Solace                    Daniel Craig
Skyfall                              Daniel Craig
Spectre                              Daniel Craig
The Living Daylights               Timothy Da

In [87]:
## The third way to remove a column, use the built-in function in Python: del
del bond['Director']
bond.head(1)

Unnamed: 0_level_0,Year,Actor,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A View to a Kill,1985,Roger Moore,275.2,54.5,9.1


### Create Random Sample with the .sample() Method

In [2]:
bond = pd.read_csv('jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)

In [6]:
## Extract random sample of columns or rows from our DataFrame
## Extract random rows, .sample() function will return a single random row by default adn not modify our original dataframe
bond.sample()
bond.sample(n = 5)
bond.sample(frac = .25)
bond.sample(n = 3, axis='columns').head(3)
## parameter: n= number of random rows, frac= sampling the proportion of of the dataframe, axis= sampling rows/columns
## axis= 0/'index'  1/'columns'

Unnamed: 0_level_0,Year,Box Office,Budget
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A View to a Kill,1985,275.2,54.5
Casino Royale,2006,581.5,145.3
Casino Royale,1967,315.0,85.0


### The .nsmallest() and .nlargest() Methods

In [10]:
## Extract rows from DataFrame that contains the smallest or largest values in a specific column
bond.sort_values('Box Office', ascending = False).head(3)  ## old method
bond.nlargest(n = 3, columns = 'Box Office')  ## Extract the largest 3 rows by Box Office, descending order
bond.nsmallest(n = 2, columns = 'Box Office')  ## Extract the smallest 2 rows by Box Office, ascending order

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1


In [11]:
bond.nlargest(n = 3, columns = 'Budget')   ## .nlargest() and .nsmallest() functions are more efficient than .sort_values()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5


In [12]:
bond.nsmallest(n = 6, columns = 'Bond Actor Salary')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [13]:
## .nlargest() and .nsmallest() also can be applied to pandas series
bond['Box Office'].nlargest(2)

Film
Skyfall        943.5
Thunderball    848.1
Name: Box Office, dtype: float64

In [14]:
bond['Year'].nsmallest(n = 2)

Film
Dr. No                   1962
From Russia with Love    1963
Name: Year, dtype: int64

### Filtering with the .where() Method

In [15]:
## .where() function will present the rows that meet the condition and the rows that do not match are going to be filed 
## with null values
bond[bond['Actor'] == 'Sean Connery']

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [16]:
bond.where(bond['Actor'] == 'Sean Connery').head()
## we need to pass the boolean series to the .where() function, null values are rows do not meet condition

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,,,,,,
Casino Royale,,,,,,
Casino Royale,,,,,,
Diamonds Are Forever,1971.0,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,,,,,,


In [17]:
bond.where(bond['Box Office'] > 800)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,,,,,,
Casino Royale,,,,,,
Casino Royale,,,,,,
Diamonds Are Forever,,,,,,
Die Another Day,,,,,,
Dr. No,,,,,,
For Your Eyes Only,,,,,,
From Russia with Love,,,,,,
GoldenEye,,,,,,
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [19]:
bond.where((bond['Box Office'] > 800) & (bond['Actor'] == 'Sean Connery'))
## Note: The parentheses of condtions is a must!!!!!

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,,,,,,
Casino Royale,,,,,,
Casino Royale,,,,,,
Diamonds Are Forever,,,,,,
Die Another Day,,,,,,
Dr. No,,,,,,
For Your Eyes Only,,,,,,
From Russia with Love,,,,,,
GoldenEye,,,,,,
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2


### The .query() Method

In [22]:
## Another way to filter data by applying .query() function
## 1. argument should be a string.  2. This function only works when the columns in the DataFrame do not have spaces
## Deal with column name blanks
bond.rename(columns = {'Box Office': 'Box_Office', 'Bond Actor Salary': 'Bond_Actor_Salary'})  ## Old way
bond.columns = [column_name.replace(' ', '_') for column_name in bond.columns]
bond.head(1)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1


In [31]:
bond.query('Actor == "Sean Connery"').head(2)
## In the .query() function, we only need strings of boolean, so we need double quote when use string in boolean string

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6


In [32]:
bond.query("Director == 'Terence Young'")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7


In [35]:
bond.query('Actor != "Roger Moore"').head(1)
bond.query("Box_Office > 600").head(2)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5


In [38]:
## Multiple conditions in .query() function, write the condition like a sentence

bond.query("Actor == 'Roger Moore' or Director == 'John Glen' ")
bond.query("Actor == 'Roger Moore' and Director == 'John Glen' ")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8


In [41]:
bond.query("Actor in ['Timothy Dalton', 'George Lazenby']")
bond.query("Actor not in ['Sean Connery', 'Roger Moore']").head(2)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


### A review of the .apply() Method on Single Columns

In [49]:
bond = pd.read_csv('jamesbond.csv', index_col='Film')
bond.sort_index(inplace = True)

In [48]:
def convert_to_string_and_add_millions(number):
    return str(number)+'MILLIONS!'
bond['Box Office'] = bond['Box Office'].apply(convert_to_string_and_add_millions)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2MILLIONS!,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5MILLIONS!,145.3,3.3


In [50]:
columns = ['Box Office', 'Budget', 'Bond Actor Salary']
for col in columns:
    bond[col] = bond[col].apply(convert_to_string_and_add_millions)
bond.head(3)    ## the Null values also convert to a string and had millions! after that

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2MILLIONS!,54.5MILLIONS!,9.1MILLIONS!
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5MILLIONS!,145.3MILLIONS!,3.3MILLIONS!
Casino Royale,1967,David Niven,Ken Hughes,315.0MILLIONS!,85.0MILLIONS!,nanMILLIONS!


### The .apply() Method with Row Values

In [51]:
bond = pd.read_csv('jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)

In [52]:
## The input of this function will be each row(list/array) in the Dataframe, thus each element will be the column value
def good_movie(row):
    actor = row[1]
    budget = row[4]
    if actor == 'Pierce Brosnan':
        return 'The Best'
    elif actor == 'Roger Moore' and budget > 40:
        return 'Enjoyable'
    else:
        return 'I have no clue'
    
bond.apply(good_movie, axis = 'columns').head(3)  
## axis = 'columns' means that although we apply on each row, but we feed the value to the function(good_movie) column by column

Film
A View to a Kill         Enjoyable
Casino Royale       I have no clue
Casino Royale       I have no clue
dtype: object

### The .copy() Method

In [58]:
## .copy() function creates an exact copy of an existing pandas
directors = bond['Director']
directors.head(3)

Film
A View to a Kill          John Glen
Casino Royale       Martin Campbell
Casino Royale            Ken Hughes
Name: Director, dtype: object

In [60]:
directors['A View to a Kill'] = 'Mister John Glen'   ## Give us warning

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [61]:
directors.head(3)  ## The series is changed!

Film
A View to a Kill    Mister John Glen
Casino Royale        Martin Campbell
Casino Royale             Ken Hughes
Name: Director, dtype: object

In [62]:
bond.head(3)  ## My original dataframe has been affected!!

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,Mister John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [65]:
## The method to avoid affecting the original DataFrame is to make a separate copy and work with the copy
bond = pd.read_csv('jamesbond.csv', index_col='Film')
bond.sort_index(inplace = True)
directors = bond['Director'].copy()
directors.head(3)

Film
A View to a Kill          John Glen
Casino Royale       Martin Campbell
Casino Royale            Ken Hughes
Name: Director, dtype: object

In [67]:
directors['A View to a Kill'] = 'Mister John Glen'

In [68]:
directors.head(3)  ## now directors is a copy(separate object from bond DataFrame)

Film
A View to a Kill    Mister John Glen
Casino Royale        Martin Campbell
Casino Royale             Ken Hughes
Name: Director, dtype: object

In [69]:
bond.head(3)  ## Original DataFrame has not been overwritten

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
