In [2]:
import pandas as pd

In [3]:
# Use the pd.read_csv() method to load a dataframe
# We can load .csv files, but also .zip files which contain .csv files
# Load the file 'data/mini_cast.zip'
df = pd.read_csv('data/mini_cast.zip')


In [4]:
# Take a quick look at the first twenty entries of the dataframe
df.head(20)

Unnamed: 0,title,year,name,type,character,n
0,Demony wojny wedlug Goi,1998,Olaf Lubaszenko,actor,Lt. Czacki,6.0
1,Boone: The Bounty Hunter,2017,Osric Chau,actor,Denny,2.0
2,Lang ist es her...,1931,Renee May,actor,"Reserl, ihre Tochter",2.0
3,Miss Malini,1947,Javar Seetharaman,actor,Director of Kala Mandhiram,
4,American Wrestler: The Wizard,2016,Daniel (XIII) Pacheco,actor,East Petaluma Wrestler,34.0
5,Bad boy dak gung,2000,Blackie Shou Liang Ko,actor,Dark Tung,8.0
6,Excuses!,2003,Mar Colàs,actress,Verònica,6.0
7,Alan & Naomi,1992,Michael Gross,actor,Sol Silverman,3.0
8,10 jours en or,2012,Alain Buron,actor,Chef de rayon supermarché,18.0
9,Guest iin London,2017,Moe Bilal,actor,Funeral Ward Man,


In [5]:
### edTest(test_entries) ###
# The len() function can be used to see the number of rows in your data frame
# find the rows in the original dataframe
entries = len(df)
print(entries)

172145


In [6]:
# .shape gives the total number of rows and columns
# Check the shape of the original dataframe
shape = df.shape
print(shape)

(172145, 6)


In [7]:
# Use .columns to see the columns in the dataframe
cols = df.columns
print(cols)

Index(['title', 'year', 'name', 'type', 'character', 'n'], dtype='object')


In [8]:
# You can change the columns to python native lists using .tolist() function on cols
col_list = cols.tolist()

In [9]:
# Change the final column name from 'n' to 'role_importance'
col_list[-1] = 'role_importance'
print(col_list)

['title', 'year', 'name', 'type', 'character', 'role_importance']


In [10]:
# You can cast the new col_list to the dataframe by setting the df.columns = col_list
df.columns =col_list
df.head(30)

Unnamed: 0,title,year,name,type,character,role_importance
0,Demony wojny wedlug Goi,1998,Olaf Lubaszenko,actor,Lt. Czacki,6.0
1,Boone: The Bounty Hunter,2017,Osric Chau,actor,Denny,2.0
2,Lang ist es her...,1931,Renee May,actor,"Reserl, ihre Tochter",2.0
3,Miss Malini,1947,Javar Seetharaman,actor,Director of Kala Mandhiram,
4,American Wrestler: The Wizard,2016,Daniel (XIII) Pacheco,actor,East Petaluma Wrestler,34.0
5,Bad boy dak gung,2000,Blackie Shou Liang Ko,actor,Dark Tung,8.0
6,Excuses!,2003,Mar Colàs,actress,Verònica,6.0
7,Alan & Naomi,1992,Michael Gross,actor,Sol Silverman,3.0
8,10 jours en or,2012,Alain Buron,actor,Chef de rayon supermarché,18.0
9,Guest iin London,2017,Moe Bilal,actor,Funeral Ward Man,


In [11]:
### edTest(test_cols) ###
# We can subselect particular columns by choosing a sublist
few_cols = ['title','year','name']
df[few_cols]


Unnamed: 0,title,year,name
0,Demony wojny wedlug Goi,1998,Olaf Lubaszenko
1,Boone: The Bounty Hunter,2017,Osric Chau
2,Lang ist es her...,1931,Renee May
3,Miss Malini,1947,Javar Seetharaman
4,American Wrestler: The Wizard,2016,Daniel (XIII) Pacheco
...,...,...,...
172140,The Tournament,2005,John Los
172141,On Air: Storia di un Successo,2016,Luca Alba
172142,R,2010,Bushra Sadaio
172143,Una storia moderna - L'ape regina,1963,Walter Giller


## ⏸ `df[['x']]` vs `df['x']`

What is the difference between the two operations above for a valid dataframe with a column named 'x'.

#### A. `df[['x']]` returns a `pd.DataFrame` object  whereas `df['x']` returns a `pd.Series` object
#### B. `df[['x']]` returns a `pd.Series` object  whereas `df['x']` returns a `pd.DataFrame` object
#### C. `df[['x']]` is an invalid operation
#### D. `df['x']` is an invalid operation

In [12]:
### edTest(test_chow1) ###

# Submit an answer choice as a string below (eg. if you choose option C, put 'C')
answer1 = 'A'

### 🙋🏻 How many roles are undefined ?

In [13]:
# First get the series corresponding to the 'role_importance' column
role_series = df['role_importance']
role_series

0          6.0
1          2.0
2          2.0
3          NaN
4         34.0
          ... 
172140     NaN
172141     4.0
172142    25.0
172143     3.0
172144     5.0
Name: role_importance, Length: 172145, dtype: float64

In [14]:
# Select only the non-NAN values by using .notna() method on 'role_series'
boolean_series = role_series.notna()

In [15]:
# Index the original data..frame with the 'boolean' mask and find the length
num_of_notna_roles = boolean_series
print(num_of_notna_roles)

0          True
1          True
2          True
3         False
4          True
          ...  
172140    False
172141     True
172142     True
172143     True
172144     True
Name: role_importance, Length: 172145, dtype: bool


## Part 2: Series operations

In [16]:
# Let's call another dataset 'data/titles.zip'
# which has a lot more movies but fewer columns
titles = pd.read_csv('data/titles.zip')

# Take a quick look at the dataset
titles.head()


Unnamed: 0,title,year
0,The Hell with Heroes,1968
1,"Raptola, violola y matola",1989
2,Kigeki: Otto urimasu,1968
3,Tanga-Tika,1953
4,Burning an Illusion,1981


In [17]:
# Get a series of the year in which the movies were released

year = df['year']
#print(year)

In [18]:
# Create a boolean series with only the years > 2000
boolean_series =year > 2000
print(boolean_series)

0         False
1          True
2         False
3         False
4          True
          ...  
172140     True
172141     True
172142     True
172143    False
172144     True
Name: year, Length: 172145, dtype: bool


In [35]:
### edTest(test_counts) ###
# Use the .value_counts() method on `boolean_series` to see how many films were released before and after 2000
df[df.year >2000].value_counts()
#print()
# Take the count of the 'True' bools 
movies_after_2000 =99253
print(f'Number of movies released after the year 2000 are {movies_after_2000}')

Number of movies released after the year 2000 are 99253


In [20]:
# We can even filter the dataframe with the boolean series directly
# with the syntax df[boolean_series]
titles_post_2000 = df[boolean_series]
titles_post_2000.head()

Unnamed: 0,title,year,name,type,character,role_importance
1,Boone: The Bounty Hunter,2017,Osric Chau,actor,Denny,2.0
4,American Wrestler: The Wizard,2016,Daniel (XIII) Pacheco,actor,East Petaluma Wrestler,34.0
6,Excuses!,2003,Mar Colàs,actress,Verònica,6.0
8,10 jours en or,2012,Alain Buron,actor,Chef de rayon supermarché,18.0
9,Guest iin London,2017,Moe Bilal,actor,Funeral Ward Man,


### 🙋🏻 How many movies in the decade of 2000 ?

In [21]:
# To answer the above question, we can operate on the year 'Series' directly 
# get the series associated with the 'year' column, floor divide it by 10 and multiply by 10
# equate this with 2000 to get a boolean series of movies in the year 200x
boolean_series =((year//10)*10)==2000


In [22]:
# Similarly, we can filter out a dataframe with only the movies in the decade of 2000
# using the same boolean_series mentioned above
titles_2k = boolean_series
titles_2k.head()

0    False
1    False
2    False
3    False
4    False
Name: year, dtype: bool

## Indexing/Slicing Rows of DataFrames
- Simple ways of selecting all rows and colu (`df[:]`)
- Rows can be accessed via a key or a integer corresponding to the row number. 
- Omitting a value generally means *all values* before or after an item.
- When we retrieve a single or mulitiple rows, the result is a Dataframe.
- Several ways, either directly, with `loc`, or with `iloc`.



In [23]:
# We can use the boolean series along with loc as well
# with syntax df.loc[boolean_series]
titles_2k = df.loc[boolean_series]
titles_2k.head()

Unnamed: 0,title,year,name,type,character,role_importance
5,Bad boy dak gung,2000,Blackie Shou Liang Ko,actor,Dark Tung,8.0
6,Excuses!,2003,Mar Colàs,actress,Verònica,6.0
10,The Girls' Room,2000,Lela Lee,actress,Chloe,10.0
18,The Harvest Project,2008,Edan Armstrong,actor,Male cadaver,33.0
21,Aladin,2009,Peter (IX) Wong,actor,Warehouse Employee,


In [24]:
# loc takes both rows and columns, so we can specify if we only want the title
# with syntax df.loc[boolean_series,['title']]
only_titles =df.loc[boolean_series,['title']]
only_titles.head()

Unnamed: 0,title
5,Bad boy dak gung
6,Excuses!
10,The Girls' Room
18,The Harvest Project
21,Aladin


## ⏸ `.loc()` vs `.iloc()`
Suppose we have the `toy_df`dataframe below.
```python3
	 name	  role
1	Pavlos	Architect
2	Rahul	Instructor
3	Chaitanya   TA
4	Hemani	    TA
5	Viraj 	    TA
6	Abjasree    TA
7	Karthiga    TA
```

Which operation will give you the following output:

```python3
name       Pavlos
role    Architect
Name: 1, dtype: object
```

#### A. `toy_df.iloc[0]`
#### B. `toy_df.loc[0]`
#### C. `toy_df.iloc[1]`
#### D. `toy_df.loc[1]`

In [25]:
toy_df = pd.DataFrame({'name':['Pavlos','Rahul','Chaitanya','Hemani','Viraj', 'Abjasree', 'Karthiga'],'role':['Architect','Instructor','TA','TA','TA','TA','TA']},index = [1,2,3,4,5,6,7])
toy_df.head()


Unnamed: 0,name,role
1,Pavlos,Architect
2,Rahul,Instructor
3,Chaitanya,TA
4,Hemani,TA
5,Viraj,TA


In [26]:
### edTest(test_chow2) ###
# There may be multiple right answers therefore your your answer should be a list comma separated, 
# for example ['a' , 'c' , 'd']
answer2 = ['a','d']