# Basic data discovery with Pandas

Use len, head, tail,COLUMN,notnull,isnull,sort_value,sort expressions

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [3]:
titles = pd.read_csv('titles.csv')
titles.head()

Unnamed: 0,title,year
0,Suicide Kale,2017
1,Kottapeta Rowdy,1980
2,Kaki kuda,1958
3,Ekho: Fall of an Empire,2004
4,Corpo e Alma de Uma Mulher,1983


# working with dataset

In [4]:
# How many movies are listed in the titles dataframe?
len(titles)

244914

In [5]:
# What are the earliest two films listed in the titles dataframe?

titles.sort_values('year').head(2)

Unnamed: 0,title,year
178518,Miss Jerry,1894
181186,The Startled Lover,1898


### to convert years into decades

In [6]:
decades=titles['year'] // 10 *10

In [7]:
decades.head()

0    2010
1    1980
2    1950
3    2000
4    1980
Name: year, dtype: int64

In [8]:
# Filter for years greater than 1980
(titles.year > 1980)[:5]

0     True
1    False
2    False
3     True
4     True
Name: year, dtype: bool

### showing all values that are greater than 1980

In [9]:
years_greater_1980=(titles[titles.year > 1980])

In [10]:
years_greater_1980.head()

Unnamed: 0,title,year
0,Suicide Kale,2017
3,Ekho: Fall of an Empire,2004
4,Corpo e Alma de Uma Mulher,1983
5,Windsor Drive,2015
7,The Night We Never Met,1993


In [11]:
len(years_greater_1980)

157621

In [12]:
# or we can extend our filter further
titles[(titles.year > 1980) & (titles.year <= 1985)].head()

Unnamed: 0,title,year
4,Corpo e Alma de Uma Mulher,1983
8,Subah Subah,1983
30,Pooviriyum Pulari,1982
60,Katha Nayakudu,1984
83,I pompieri,1985


In [13]:
# for movies in 1985

titles[titles.year == 1985]

Unnamed: 0,title,year
83,I pompieri,1985
147,Sparkles Tavern,1985
474,Hey Johney,1985
497,Terminal Choice,1985
745,Prizzi's Honor,1985
793,Do Dilon Ki Dastaan,1985
1112,Young Sherlock Holmes,1985
1339,Mesmerized,1985
1574,Bez konca,1985
1661,Xiao jiang,1985


**We have 2032 movies in 1985**

### Work filters on titles

In [14]:
# For a specific name of movie
# Titles is name of dataframe
# title is name of column within dataframe
titles[titles.title=='Pooviriyum Pulari']

Unnamed: 0,title,year
30,Pooviriyum Pulari,1982


In [15]:
# title is name of column within dataframe
titles[titles.title=='Macbeth']

Unnamed: 0,title,year
4852,Macbeth,1998
32294,Macbeth,2013
34123,Macbeth,2009
68133,Macbeth,2004
69932,Macbeth,1987
86478,Macbeth,2006
90134,Macbeth,2012
91480,Macbeth,2015
94151,Macbeth,2003
108496,Macbeth,1916


**All the movie version made in different years**

In [16]:
titles[titles.title=='Macbeth'].sort_values('year')

Unnamed: 0,title,year
133367,Macbeth,1913
108496,Macbeth,1916
182697,Macbeth,1948
69932,Macbeth,1987
191761,Macbeth,1997
4852,Macbeth,1998
94151,Macbeth,2003
68133,Macbeth,2004
86478,Macbeth,2006
34123,Macbeth,2009


### sorting a series

In [17]:
titles.year.sort_values()

178518    1894
181186    1898
175272    1899
101319    1900
213695    1900
179620    1900
145030    1900
101844    1900
160383    1903
5721      1905
55669     1905
143334    1906
234612    1906
99777     1906
98111     1906
99681     1906
108953    1907
58539     1907
230529    1907
115206    1907
145561    1908
14199     1908
100568    1908
186376    1908
79602     1908
121540    1908
13474     1909
186131    1909
110399    1909
104046    1909
          ... 
130626    2022
107397    2022
52697     2022
200584    2022
96533     2022
144418    2022
215449    2022
80557     2022
194112    2022
138361    2022
151077    2022
104974    2022
30580     2022
136174    2022
110396    2022
81728     2022
530       2023
108306    2023
73282     2023
199953    2023
243681    2023
113250    2024
69407     2024
57822     2024
179622    2024
123060    2024
79042     2024
80392     2025
69061     2026
218450    2115
Name: year, Length: 244914, dtype: int64

In [18]:
cast = pd.read_csv('cast.csv')
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,31.0
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


# Solving excercise 1

### 3.1)-How many movies are listed in the titles dataframe?

In [19]:
titles.head()

Unnamed: 0,title,year
0,Suicide Kale,2017
1,Kottapeta Rowdy,1980
2,Kaki kuda,1958
3,Ekho: Fall of an Empire,2004
4,Corpo e Alma de Uma Mulher,1983


In [20]:
len(titles)

244914

In [21]:
titles.nunique()

title    223828
year        128
dtype: int64

### 3.2)- What are the earliest two films listed in the titles dataframe?

In [22]:
titles.head(2)

Unnamed: 0,title,year
0,Suicide Kale,2017
1,Kottapeta Rowdy,1980


### 3.3)- How many movies have the title "Hamlet"?


In [23]:
titles[titles.title== 'Hamlet']

Unnamed: 0,title,year
12613,Hamlet,1990
18174,Hamlet,1921
24098,Hamlet,1954
24576,Hamlet,2000
29112,Hamlet,1987
42746,Hamlet,1948
51166,Hamlet,2016
70599,Hamlet,1911
74440,Hamlet,1910
149646,Hamlet,1996


In [24]:
#let's give a number as question was how many
len(titles[titles.title == 'Hamlet'])

20

### 3.4)-How many movies are titled "North by Northwest"?

In [25]:
titles[titles.title== 'North by Northwest']

Unnamed: 0,title,year
231037,North by Northwest,1959


In [26]:
len(titles[titles.title == 'North by Northwest'])

1

### 3.5)-When was the first movie titled "Hamlet" made?

In [27]:
titles[titles.title== 'Hamlet'].sort_values('year').head(1)

Unnamed: 0,title,year
74440,Hamlet,1910


### 3.6)-List all of the "Treasure Island" movies from earliest to most recent.

In [28]:
titles[titles.title== 'Treasure Island'].sort_values('year')

Unnamed: 0,title,year
179603,Treasure Island,1918
175660,Treasure Island,1920
241392,Treasure Island,1934
148129,Treasure Island,1950
10689,Treasure Island,1972
5116,Treasure Island,1973
120623,Treasure Island,1985
81596,Treasure Island,1999


### 3.7)-How many movies were made in the year 1950?


In [29]:
len(titles[titles.year== 1950])

1113

### 3.8)-How many movies were made in the year 1960?

In [30]:
len(titles[titles.year== 1960])

1549

### 3.9)-How many movies were made from 1950 through 1959?

In [31]:
len(titles[(titles.year >= 1950) & (titles.year < 1960) ])

12934

### 3.10)- In what years has a movie titled "Batman" been released?

In [32]:
titles[titles.title== 'Batman']

Unnamed: 0,title,year
36291,Batman,1943
208065,Batman,1989


### 3.11)- How many roles were there in the movie "Inception"?

In [33]:
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,31.0
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


In [34]:
len(cast[cast.title== 'Inception'])

78

### 3.12)-How many roles in the movie "Inception" are NOT ranked by an "n" value?

In [35]:
c=cast[cast.title== 'Inception']

In [36]:
c2=c[c.n.isnull()]

In [37]:
len(c2)

27

### 3.13)-But how many roles in the movie "Inception" did receive an "n" value?


In [38]:
c=cast[cast.title== 'Inception']

In [39]:
c2=c[c.n.notnull()]

In [40]:
len(c2)

51

Note: 51+27=78 (verified)

### 3.14)-Display the cast of "North by Northwest" in their correct "n"-value order, ignoring roles that did not earn a numeric "n" value.


In [41]:
c=cast[cast.title== 'North by Northwest']

In [42]:
len(c)

116

In [43]:
c1=c[c.n.notnull()]

In [44]:
len(c1)

16

In [45]:
c2=c1.sort_values('n')

In [46]:
c2

Unnamed: 0,title,year,name,type,character,n
871426,North by Northwest,1959,Cary Grant,actor,Roger O. Thornhill,1.0
3518333,North by Northwest,1959,Eva Marie Saint,actress,Eve Kendall,2.0
1457165,North by Northwest,1959,James Mason,actor,Phillip Vandamm,3.0
3157232,North by Northwest,1959,Jessie Royce Landis,actress,Clara Thornhill,4.0
356707,North by Northwest,1959,Leo G. Carroll,actor,The Professor,5.0
3048272,North by Northwest,1959,Josephine Hutchinson,actress,Mrs. Townsend,6.0
1695173,North by Northwest,1959,Philip Ober,actor,Lester Townsend,7.0
1275337,North by Northwest,1959,Martin Landau,actor,Leonard,8.0
2447081,North by Northwest,1959,Adam Williams,actor,Valerian,9.0
1811387,North by Northwest,1959,Edward Platt,actor,Victor Larrabee,10.0


### 3.15)-Display the entire cast, in "n"-order, of the 1972 film "Sleuth".

In [47]:
c=cast[cast.title== 'Sleuth']

In [48]:
c1=c[c.year==1972]

In [49]:
c1

Unnamed: 0,title,year,name,type,character,n
326773,Sleuth,1972,Michael Caine,actor,Milo Tindle,2.0
373876,Sleuth,1972,Alec Cawthorne,actor,Inspector Doppler,3.0
1448690,Sleuth,1972,Teddy Martin,actor,Police Constable Higgs,6.0
1465483,Sleuth,1972,John (II) Matthews,actor,Detective Sergeant Tarrant,4.0
1705394,Sleuth,1972,Laurence Olivier,actor,Andrew Wyke,1.0
2724553,Sleuth,1972,Eve (III) Channing,actress,Marguerite Wyke,5.0


In [50]:
c1.sort_values('n')

Unnamed: 0,title,year,name,type,character,n
1705394,Sleuth,1972,Laurence Olivier,actor,Andrew Wyke,1.0
326773,Sleuth,1972,Michael Caine,actor,Milo Tindle,2.0
373876,Sleuth,1972,Alec Cawthorne,actor,Inspector Doppler,3.0
1465483,Sleuth,1972,John (II) Matthews,actor,Detective Sergeant Tarrant,4.0
2724553,Sleuth,1972,Eve (III) Channing,actress,Marguerite Wyke,5.0
1448690,Sleuth,1972,Teddy Martin,actor,Police Constable Higgs,6.0


### 3.16)-Now display the entire cast, in "n"-order, of the 2007 version of "Sleuth".

In [51]:
c=cast[cast.title=='Sleuth']

In [52]:
c1=c[c.year==2007]

In [53]:
c1.sort_values('n')

Unnamed: 0,title,year,name,type,character,n
326774,Sleuth,2007,Michael Caine,actor,Andrew,1.0
1293634,Sleuth,2007,Jude Law,actor,Milo,2.0
1805907,Sleuth,2007,Harold Pinter,actor,Man on T.V.,3.0
260242,Sleuth,2007,Kenneth Branagh,actor,Other Man on T.V.,
373877,Sleuth,2007,Alec (II) Cawthorne,actor,Inspector Doppler,
2724552,Sleuth,2007,Eve (II) Channing,actress,Marguerite Wyke,
3370424,Sleuth,2007,Carmel O'Sullivan,actress,Maggie,


### 3.17)-How many roles were credited in the silent 1921 version of Hamlet?

In [54]:
# instead of using two steps, we can use in one & operator
c = c[(c.title == 'Hamlet') & (c.year == 1921)]

In [55]:
c

Unnamed: 0,title,year,name,type,character,n


In [56]:
len(c)

0

### 3.18) How many roles were credited in Branagh’s 1996 Hamlet?

In [57]:
cast[cast.title=='Hamlet']

Unnamed: 0,title,year,name,type,character,n
1731,Hamlet,1996,Riz Abbasi,actor,Attendant to Claudius,1.0
7789,Hamlet,1921,Fritz Achterberg,actor,"Fortinbras,",9.0
11728,Hamlet,2009,Hayden Adams,actor,Laertes,7.0
11729,Hamlet,2009,Hayden Adams,actor,Player,7.0
14493,Hamlet,1913,Eric Adeney,actor,Reynaldo,14.0
16771,Hamlet,2000,Casey Affleck,actor,Fortinbras,15.0
34314,Hamlet,1964,Hugh Alexander,actor,Cornelius,
34315,Hamlet,1964,Hugh Alexander,actor,Second Gravedigger,
34316,Hamlet,1964,Hugh Alexander,actor,English Ambassador,
54690,Hamlet,1913,Richard Andean,actor,Second Player,16.0


In [58]:
c=cast[cast.title=='Hamlet'] 


In [59]:
c1=c[c.year==1996]

In [60]:
len(c1)

54

### 3.19)-How many "Hamlet" roles have been listed in all film credits through history?

In [61]:
cast[cast.character== 'Hamlet']

Unnamed: 0,title,year,name,type,character,n
6286,Sugar,2008,Hamlet Abreu,actor,Hamlet,58.0
97830,Top of the Town,1937,Mischa Auer,actor,Hamlet,6.0
125190,The Sound of Spying,2014,Jonathan Ball,actor,Hamlet,
160794,Suivez cet avion,1989,Fran?ois Beaulieu,actor,Hamlet,26.0
161104,Nous sommes les autres,2017,Marc Beaupr?,actor,Hamlet,
168663,Hamlet (II),2007,William Belchambers,actor,Hamlet,
175927,Un Amleto di meno,1973,Carmelo Bene,actor,Hamlet,
239081,I'll Love You Always,1935,Sven Hugo Borg,actor,Hamlet,
260225,Hamlet,1996,Kenneth Branagh,actor,Hamlet,5.0
274831,Le portrait de Dorian Gray,1977,Sacha Briquet,actor,Hamlet,5.0


In [62]:
len(cast[cast.character== 'Hamlet'])

96

In [63]:
len(cast[cast.character == "Hamlet"].name.unique())

96

We should check for unique values. As some actors might have played twice i.e one is 1st version and one may be on TV or play or some other platform(netflix series). So we will have duplicate values for them

### 3.20)-How many people have played an "Ophelia"?

In [64]:
cast[cast.character=='Ophelia']

Unnamed: 0,title,year,name,type,character,n
248251,The Merry Maids of Madness,2016,Amy Bourque,actor,Ophelia,11.0
410247,Jake Gets Paid,2009,Josie Chisholm,actor,Ophelia,
470859,Inoperable,2017,Crystal Cordero,actor,Ophelia,
866334,"Dogg's Hamlet, Cahoot's Macbeth",2005,Lucas Grabeel,actor,Ophelia,
1364457,11,2014,Catherine (IV) Lord,actor,Ophelia,
1581593,Kronos,2016,Justine Mooritz,actor,Ophelia,
2442518,Baker,2011,Elle Wilcox,actor,Ophelia,
2473563,Queen's Encounter,2011,Hiu Yee Wong,actor,Ophelia,
2542491,Confession,1999,Paulina V. Ahlstrom,actress,Ophelia,
2564975,Hamlet_X,2003,Kathrin Angerer,actress,Ophelia,


In [65]:
len(cast[cast.character=='Ophelia'])

117

In [66]:
len(cast[cast.character == "Ophelia"].name.unique())

115

### 3.21)-How many people have played a role called "The Dude"?

In [67]:
cast[cast.character=='The Dude']

Unnamed: 0,title,year,name,type,character,n
272161,The Big Lebowski,1998,Jeff Bridges,actor,The Dude,1.0
339389,Terms & Conditions,2015,Jordan Cann,actor,The Dude,9.0
498660,Stranger,2000,Scott Crowell,actor,The Dude,
570128,Pizza Man vs. the Dude,2004,Chris DeMarcus,actor,The Dude,
610791,Sweepstakes,1931,Mike Donlin,actor,The Dude,12.0
629473,Self Helpless,2010,Devin The Dude,actor,The Dude,
855976,A Few Brains More,2012,Michael D. Goodwin,actor,The Dude,
1089444,Jay and Silent Bob Strike Back,2001,Matthew (XIX) James,actor,The Dude,37.0
1138933,Explicit Ills,2008,Christopher Kadish,actor,The Dude,27.0
1367230,American Idiots,2013,Jason Loughridge,actor,The Dude,10.0


In [68]:
len(cast[cast.character=='The Dude'])

19

In [69]:
len(cast[cast.character == "The Dude"].name.unique())

19

### 3.22)-How many people have played a role called "The Stranger"?

In [70]:
len(cast[cast.character == "The Stranger"].name.unique())

202

### 3.23)-How many roles has Sidney Poitier played throughout his career?

In [71]:
len(cast[cast.name == "Sidney Poitier"])

43

### 3.24)-How many roles has Judi Dench played?

In [72]:
len(cast[cast.name == "Judi Dench"])

55

### 3.25)-List the supporting roles (having n=2) played by Cary Grant in the 1940s, in order by year.

In [73]:
c=cast

In [74]:
c1=c[c.name=="Cary Grant"]

In [75]:
c1[c1.year // 10 == 194]

Unnamed: 0,title,year,name,type,character,n
871390,Arsenic and Old Lace,1944,Cary Grant,actor,Mortimer Brewster,1.0
871398,Destination Tokyo,1943,Cary Grant,actor,Capt. Cassidy,1.0
871402,Every Girl Should Be Married,1948,Cary Grant,actor,Dr. Madison Brown,1.0
871405,George White's Scandals,1945,Cary Grant,actor,Cary Grant,
871407,His Girl Friday,1940,Cary Grant,actor,Walter Burns,1.0
871411,I Was a Male War Bride,1949,Cary Grant,actor,Captain Henri Rochard,1.0
871421,Mr. Blandings Builds His Dream House,1948,Cary Grant,actor,Jim Blandings,1.0
871422,Mr. Lucky,1943,Cary Grant,actor,Joe Adams -posing as Joe Bascopolous,1.0
871423,My Favorite Wife,1940,Cary Grant,actor,Nick Arden,2.0
871424,Night and Day,1946,Cary Grant,actor,Cole Porter,1.0


In [76]:
c2=c1[c1.year // 10 == 194]

In [77]:
c2[c2.n==2].sort_values('year')

Unnamed: 0,title,year,name,type,character,n
871423,My Favorite Wife,1940,Cary Grant,actor,Nick Arden,2.0
871433,Penny Serenade,1941,Cary Grant,actor,Roger Adams,2.0


### 3.26)-List the leading roles that Cary Grant played in the 1940s in order by year.

In [78]:
c2[c2.n==1].sort_values('year')

Unnamed: 0,title,year,name,type,character,n
871448,The Howards of Virginia,1940,Cary Grant,actor,Matt Howard,1.0
871407,His Girl Friday,1940,Cary Grant,actor,Walter Burns,1.0
871450,The Philadelphia Story,1940,Cary Grant,actor,C. K. Dexter Haven,1.0
871438,Suspicion,1941,Cary Grant,actor,Johnnie Aysgarth,1.0
871452,The Talk of the Town,1942,Cary Grant,actor,Leopold Dilg,1.0
871429,Once Upon a Honeymoon,1942,Cary Grant,actor,Patrick 'Pat' O'Toole,1.0
871398,Destination Tokyo,1943,Cary Grant,actor,Capt. Cassidy,1.0
871422,Mr. Lucky,1943,Cary Grant,actor,Joe Adams -posing as Joe Bascopolous,1.0
871430,Once Upon a Time,1944,Cary Grant,actor,Jerry Flynn,1.0
871390,Arsenic and Old Lace,1944,Cary Grant,actor,Mortimer Brewster,1.0


### 3.27)-How many roles were available for actors in the 1950s?

In [79]:
c=cast

In [80]:
c1=c[c.year // 10 == 195]

In [81]:
c2=c1[c1.type =='actor']

In [82]:
len(c2)

157738

### 3.28)-How many roles were available for actresses in the 1950s?

In [83]:
c = cast
c = c[c.year // 10 == 195]
c = c[c.type == 'actress']
len(c)

57733

### 3.29)-How many leading roles (n=1) were available from the beginning of film history through 1980?

In [84]:
c=cast
c=c[c.year <= 1980]

In [85]:
c=c[c.n==1]

In [86]:
len(c)

65140

### 3.30)-How many non-leading roles were available through from the beginning of film history through 1980?

In [87]:
c=cast
c=c[c.year <= 1980]

In [88]:
c=c[c.n !=1]

In [89]:
len(c)

1117667

### 3.31)-How many roles through 1980 were minor enough that they did not warrant a numeric "n" rank?

In [90]:
c=cast
c=c[c.year <= 1980]

In [91]:
c=c[c.n.isnull()]

In [92]:
len(c)

448347