# Pandas data manipulation and transformation

In [143]:
import numpy as np
import pandas as pd

In [144]:
imdb_movies = pd.read_csv("../datasets/imdb-movies.csv")
imdb_movies.set_index('Rank', inplace=True)
imdb_movies.shape

(1000, 10)

In [145]:
imdb_movies.sample(5)

Unnamed: 0_level_0,Title,Genre,Director,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Actors
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
164,Jack Reacher: Never Go Back,"Action,Adventure,Crime",Edward Zwick,2016,118,6.1,78043,58.4,47.0,"Tom Cruise, Cobie Smulders, Aldis Hodge, Rober..."
291,Busanhaeng,"Action,Drama,Horror",Sang-ho Yeon,2016,118,7.5,58782,2.13,72.0,"Yoo Gong, Soo-an Kim, Yu-mi Jung, Dong-seok Ma"
779,Chalk It Up,Comedy,Hisonni Johnson,2016,90,4.8,499,,,"Maddy Curley, John DeLuca, Nikki SooHoo, Drew ..."
42,Moonlight,Drama,Barry Jenkins,2016,111,7.5,135095,27.85,99.0,"Mahershala Ali, Shariff Earp, Duan Sanderson, ..."
207,Raw (II),"Drama,Horror",Julia Ducournau,2016,99,7.5,5435,0.51,81.0,"Garance Marillier, Ella Rumpf, Rabah Nait Oufe..."


### Question 1: do movies longer tha 2h45 get better scores than movies shorter than 100 mins?

In [146]:
# 2 hrs 45 = 165 mins

# average of scores > 165 mins
# average of scores < 100 mins

long_movies  = sum(  (imdb_movies.loc[:,'Runtime (Minutes)'] > 165) & (imdb_movies['Metascore'] > 0 ) )
short_movies = sum(  (imdb_movies.loc[:,'Runtime (Minutes)'] < 100) & (imdb_movies['Metascore'] > 0 ) )

print("number of movies > 2hrs 45 = ",long_movies)
print("number of movies < 100 mins = ",short_movies)


#imdb_movies.groupby(['Runtime (Minutes)', 'Metascore']).mean()

imdb_movies.loc[imdb_movies['Runtime (Minutes)']<100,'Score vs length'] = 'Less than 100 mins'     #short
imdb_movies.loc[imdb_movies['Runtime (Minutes)']>165,'Score vs length'] = 'Over 2hrs 45 mins'     #long

imdb_movies.groupby(['Score vs length']).mean()['Metascore']


number of movies > 2hrs 45 =  9
number of movies < 100 mins =  227


Score vs length
Less than 100 mins    55.867841
Over 2hrs 45 mins     67.222222
Name: Metascore, dtype: float64

### Question 2: which is the director with highest revenue per minute of runtime?

In [191]:
# Divide revenue by runtime 

#imdb_movies.groupby(['Director', 'Runtime (Minutes)' ]).sum()['Revenue (Millions)']

imdb_movies.loc[imdb_movies['Runtime (Minutes)'] > 0,'Revenue by minute'] = imdb_movies['Revenue (Millions)'] / imdb_movies['Runtime (Minutes)']

#imdb_movies.groupby(['Director']).sum()['Revenue by minute'].max()
                                                                        #row   #descending order
imdb_movies.groupby(['Director']).sum()['Revenue by minute'].sort_values(0, 0, False)


  imdb_movies.groupby(['Director']).sum()['Revenue by minute'].sort_values(0,0, False)


Director
J.J. Abrams          12.841471
David Yates          11.950776
Francis Lawrence     10.243712
Christopher Nolan     9.738683
Michael Bay           9.423046
                       ...    
James Franco          0.000000
James Lapine          0.000000
Jeff Grace            0.000000
Jeffrey G. Hunt       0.000000
Gillies MacKinnon     0.000000
Name: Revenue by minute, Length: 644, dtype: float64

### Question 3: which is the most common movie genre in our dataset between 2007 and 2014?

In [181]:
# filter by year >= 2007 and year <=2014

YEARS = np.arange(2007, 2015)

print(imdb_movies.groupby(['Year']).max()['Genre'].filter(YEARS))

print()
print("Most common genre between 2007 and 2014 is:", imdb_movies.groupby(['Year']).max()['Genre'].filter(YEARS).max() )


Year
2007                   Thriller
2008            Sci-Fi,Thriller
2009    Horror,Mystery,Thriller
2010           Mystery,Thriller
2011    Romance,Sci-Fi,Thriller
2012             Horror,Mystery
2013           Mystery,Thriller
2014                   Thriller
Name: Genre, dtype: object

Most common genre between 2007 and 2014 is: Thriller


  print(imdb_movies.groupby(['Year']).max()['Genre'].filter(YEARS))
  print("Most common genre between 2007 and 2014 is:", imdb_movies.groupby(['Year']).max()['Genre'].filter(YEARS).max() )


### Question 4: which is the actor who has appeared in more movies between 2009 and 2015?

In [349]:
# filter by year => 2009 and year <= 2015

#imdb_movies.groupby(['Year']).max()['Actors'].filter(YEARS)

#uniqueActors = set(sorted(imdb_movies['Actors'].unique()))

#imdb_movies.groupby(['Year']).max()['Actors'].filter(YEARS).max()



#split the Actors column into separate columns (one per actor)
actors_df = imdb_movies['Actors'].str.split(",",expand=True,)
#actors_df

#add year column from original df
actors_df['Year']= imdb_movies['Year']

# rename columns for access
actors_df.columns = ['Actor1', 'Actor2', 'Actor3', 'Actor4', 'Year']

#drop rows if year < 2009 or year > 2015
actors_df = actors_df.drop(actors_df[(actors_df['Year'] < 2009) | (actors_df['Year'] > 2015)].index)

#testing
print("Separated actor columns for testing:")
print()
print(actors_df)

print()
print("Max of 1st Actor col:",actors_df['Actor1'].dropna().max())
print("Max of 2nd Actor col:",actors_df['Actor2'].dropna().max())
print("Max of 3rd Actor col:",actors_df['Actor3'].dropna().max())
print("Max of 4th Actor col:",actors_df['Actor4'].dropna().max())


# Merge the four actor columns into a list, then find the most frequent

print()
print("Now merging the four actor columns into one column...")
merged_list = pd.concat( [ actors_df['Actor1'], actors_df['Actor2'], actors_df['Actor3'], actors_df['Actor4']])
print()
print("Length of the merged actor cols: ",len(merged_list))
print("Most popular actor between years 2009-2015 is: ",merged_list.dropna().max())


Separated actor columns for testing:

                   Actor1                 Actor2                  Actor3  \
Rank                                                                       
1             Chris Pratt             Vin Diesel          Bradley Cooper   
2            Noomi Rapace   Logan Marshall-Green      Michael Fassbender   
27                Prabhas         Rana Daggubati          Anushka Shetty   
37    Matthew McConaughey          Anne Hathaway        Jessica Chastain   
46            Johnny Depp          Penélope Cruz             Ian McShane   
...                   ...                    ...                     ...   
993          Topher Grace             Anna Faris              Dan Fogler   
994        Milla Jovovich             Ali Larter        Wentworth Miller   
995           Thomas Mann          Oliver Cooper   Jonathan Daniel Brown   
996      Chiwetel Ejiofor          Nicole Kidman           Julia Roberts   
999            Adam Pally            T.J. Miller  