# Pandas Foundations

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the movie dataset from the local data directory and preview its first rows.
movie = pd.read_csv('./data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


> A DataFrame has three main components: **_index, columns, and data_** (exposed through the `index`, `columns`, and `values` attributes).

In [3]:
# Pull the three components out of the DataFrame in one go:
# row labels, column labels, and the underlying values as a NumPy array.
index, columns, data = movie.index, movie.columns, movie.values

In [4]:
index

RangeIndex(start=0, stop=4916, step=1)

In [5]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [6]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ..., 
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [7]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915])

In [8]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'], dtype=object)

> The **DataFrame.dtypes** attribute displays each column along with its data type.

In [9]:
movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object

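> The **get_dtype_counts()** method returns the count of each data type. (It was deprecated in pandas 0.25 and removed in 1.0; counting the values of `DataFrame.dtypes` gives the same information.)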

In [10]:
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0.
# Tallying the values of .dtypes gives the same per-dtype counts on every version;
# casting the dtypes to str and sorting by label keeps the alphabetical order.
movie.dtypes.astype(str).value_counts().sort_index()

float64    13
int64       3
object     12
dtype: int64

> Pandas official documentation for dtypes (http://bit.ly/2vxe8ZI)

In [11]:
movie['director_name'].head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [12]:
movie.director_name.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [13]:
type(movie['director_name'])

pandas.core.series.Series

In [14]:
director = movie['director_name']
director.name

'director_name'

> Series to DataFrame: Series.to_frame().

In [15]:
director.to_frame().head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


## Calling Series methods
We can use the **_dir_** function to uncover all the attributes and methods of a Series or a DataFrame.

In [16]:
# Collect every attribute/method name that dir() reports for Series into a set,
# then show how many there are.
series_attr_methods = {name for name in dir(pd.Series)}
len(series_attr_methods)

442

In [17]:
# Same as above, but for DataFrame: the set of names reported by dir().
dataframe_attr_methods = {name for name in dir(pd.DataFrame)}
len(dataframe_attr_methods)

445

In [18]:
# Number of attribute/method names that Series and DataFrame have in common.
len(series_attr_methods.intersection(dataframe_attr_methods))

376

In [19]:
# Select two columns as Series to experiment with:
# one holding strings (object dtype) and one holding floats.
director, actor_1_fb_likes = movie['director_name'], movie['actor_1_facebook_likes']

In [20]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [21]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

> One of the most useful methods for the object data type Series: **value_counts**, which counts all the occurrences of each unique value

In [22]:
director.value_counts().head()

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Spike Lee           16
Name: director_name, dtype: int64

In [23]:
actor_1_fb_likes.value_counts().head()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
Name: actor_1_facebook_likes, dtype: int64

> **size, shape, or len** can be used to count the number of elements in a Series.

In [24]:
director.size

4916

In [25]:
director.shape

(4916,)

In [26]:
len(director)

4916

**count()** method will return the number of non-missing values

In [27]:
director.count()

4814

> Basic statistics method: **min, max, mean, median, std and sum**.  
> To simplify, **describe()** method will return both the summary statistics and quantiles.

In [28]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

> The quantile method exists to calculate an exact quantile of numeric data

In [29]:
actor_1_fb_likes.quantile(.2)

510.0

In [30]:
actor_1_fb_likes.quantile([.1,.2,.3,.4,.5,.6,.7,.8,.9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

> The **_isnull_** method may be used to determine whether each individual value is missing or not.

In [31]:
director.isnull()

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
4886    False
4887    False
4888    False
4889    False
4890    False
4891    False
4892    False
4893    False
4894    False
4895    False
4896    False
4897    False
4898    False
4899    False
4900    False
4901    False
4902    False
4903    False
4904    False
4905    False
4906    False
4907    False
4908    False
4909    False
4910    False
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [32]:
director.isnull()

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
4886    False
4887    False
4888    False
4889    False
4890    False
4891    False
4892    False
4893    False
4894    False
4895    False
4896    False
4897    False
4898    False
4899    False
4900    False
4901    False
4902    False
4903    False
4904    False
4905    False
4906    False
4907    False
4908    False
4909    False
4910    False
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [33]:
actor_1_fb_likes.isnull()
# The complementary check is actor_1_fb_likes.notnull(), which flags the non-missing values.

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
4886    False
4887    False
4888    False
4889    False
4890    False
4891    False
4892    False
4893    False
4894    False
4895    False
4896    False
4897    False
4898    False
4899    False
4900    False
4901    False
4902    False
4903    False
4904    False
4905    False
4906    False
4907    False
4908    False
4909    False
4910    False
4911    False
4912    False
4913    False
4914    False
4915    False
Name: actor_1_facebook_likes, Length: 4916, dtype: bool

In [34]:
actor_1_fb_likes.count()

4909

In [35]:
actor_1_fb_likes.fillna(0).count()

4916

In [36]:
actor_1_fb_likes.dropna().size

4909

> By setting the normalize parameter to True, the relative frequencies are returned

In [37]:
director.value_counts(normalize=True)

Steven Spielberg            0.005401
Woody Allen                 0.004570
Martin Scorsese             0.004155
Clint Eastwood              0.004155
Spike Lee                   0.003324
Ridley Scott                0.003324
Renny Harlin                0.003116
Steven Soderbergh           0.003116
Oliver Stone                0.002908
Tim Burton                  0.002908
Robert Rodriguez            0.002700
Ron Howard                  0.002700
Joel Schumacher             0.002700
Barry Levinson              0.002700
Robert Zemeckis             0.002700
Michael Bay                 0.002493
Tony Scott                  0.002493
Kevin Smith                 0.002493
Brian De Palma              0.002493
Francis Ford Coppola        0.002285
Chris Columbus              0.002285
Shawn Levy                  0.002285
Richard Linklater           0.002285
Sam Raimi                   0.002285
Richard Donner              0.002285
Rob Reiner                  0.002285
Wes Craven                  0.002077
B

> 1: Compare **count** and **size** to determine whether there are missing values.  
> 2: A more direct approach is to use the **hasnans** attribute to check for missing values.

In [38]:
director.hasnans

True

## Working with operators on a Series

> All the operators used in this recipe apply the same operation to each element in the Series. In native Python, this would require a for-loop to iterate through each of the items in the sequence before applying the operation

In [39]:
# Work on just the first five IMDB scores so the effect of each operator is easy to see.
imdb_score = movie['imdb_score']
hd_imdb_score = imdb_score.head(5)
hd_imdb_score

0    7.9
1    7.1
2    6.8
3    8.5
4    7.1
Name: imdb_score, dtype: float64

In [40]:
hd_imdb_score + 1
# Method equivalent: hd_imdb_score.add(1)
# The other arithmetic operators (-, *, /) work element-wise in the same way.

0    8.9
1    8.1
2    7.8
3    9.5
4    8.1
Name: imdb_score, dtype: float64

In [41]:
hd_imdb_score > 7
# All comparison operators (<, >, <=, >=, !=, ==) work element-wise and return a boolean Series.

0     True
1     True
2    False
3     True
4     True
Name: imdb_score, dtype: bool

In [42]:
# Reference table mapping each operator group to its operators and the matching
# Series method names. The rows are kept in a dedicated variable instead of `data`,
# so the `data` array extracted from `movie.values` earlier is not overwritten.
operator_rows = [
    ['Arithmetic', '+, -, *, /, //, %, **', 'add, sub, mul, div, floordiv, mod, pow'],
    ['Comparison', '<, >, <=, >=, ==, !=', 'lt, gt, le, ge, eq, ne']
]
operator_columns = ['Operator Group', 'Operator', 'Series Method Name']
# The first column ('Operator Group') becomes the index of the table.
table_of_operator = (
    pd.DataFrame(operator_rows, columns=operator_columns)
    .set_index(operator_columns[0])
)
table_of_operator

Unnamed: 0_level_0,Operator,Series Method Name
Operator Group,Unnamed: 1_level_1,Unnamed: 2_level_1
Arithmetic,"+, -, *, /, //, %, **","add, sub, mul, div, floordiv, mod, pow"
Comparison,"<, >, <=, >=, ==, !=","lt, gt, le, ge, eq, ne"


## Chaining Series Methods Together

In [43]:
director.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Name: director_name, dtype: int64

In [44]:
actor_1_fb_likes.isnull().sum()
# Summing a boolean Series counts its True values,
# because True is treated as 1 and False as 0.

7

In [45]:
# Replace missing values with 0 first, then convert the floats to integers.
actor_1_fb_likes.fillna(0).astype(int).head(3)

0     1000
1    40000
2    11000
Name: actor_1_facebook_likes, dtype: int64

> `isnull()` returns booleans, so its sum is the total number of missing values, and its mean is the fraction of values that are missing.

In [46]:
actor_1_fb_likes.isnull().mean()

0.0014239218877135883

## Making the index meaningful

> Default: RangeIndex

In [47]:
# Reload the data with the default RangeIndex, then use the movie_title column
# as the index. The result is stored under a new name, movie_2.
movie = pd.read_csv('./data/movie.csv')
movie_2 = movie.set_index('movie_title')
movie_2.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [48]:
# Same outcome in a single step: tell read_csv to use movie_title as the index.
movie = pd.read_csv('./data/movie.csv', index_col='movie_title')
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [49]:
movie.reset_index().head()

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,...,,,,,,,12.0,7.1,,0


## Renaming row and column names

> The **rename** method accepts dictionaries that map the old value to the new value.

In [50]:
movie = pd.read_csv('./data/movie.csv', index_col='movie_title')

In [51]:
# Mappings of old label -> new label; only the labels listed here are renamed.
col_rename = {
    'color': 'Color',
    'director_name': 'Director Name',
    'num_critic_for_reviews': 'Critical Reviews',
}
idx_rename = {'Avatar': 'Ratava', 'Spectre': 'Ertceps'}

# Rename row and column labels in a single call; the result is a separate DataFrame.
movie_renamed = movie.rename(columns=col_rename, index=idx_rename)
movie_renamed.head()

Unnamed: 0_level_0,Color,Director Name,Critical Reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


> The rename DataFrame method allows for both row and column labels to be renamed at the same time with the index and columns parameters. Each of these parameters may be set to a dictionary that maps old labels to their new values.

In [52]:
# Alternative to rename(): convert the labels to plain Python lists,
# edit the lists, and assign them back to the DataFrame.
index, columns = movie.index, movie.columns
index_list, column_list = index.tolist(), columns.tolist()

# The Index object and its list copy are different types.
print(type(index), type(index_list), sep='\n')

# Change only the first row label and the first column label.
index_list[0], column_list[0] = 'Ratava', 'Color'

print(index_list[:5], column_list[:5], sep='\n')

# Assigning the lists replaces the labels on `movie` itself. Because a plain list
# carries no name, the index is no longer labelled 'movie_title' afterwards.
movie.index, movie.columns = index_list, column_list
movie.head()

<class 'pandas.core.indexes.base.Index'>
<class 'list'>
['Ratava', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens']
['Color', 'director_name', 'num_critic_for_reviews', 'duration', 'director_facebook_likes']


Unnamed: 0,Color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## Creating and deleting columns

In [54]:
movie = pd.read_csv('./data/movie.csv')

# A scalar is broadcast to every row of the new column.
movie['has_seed'] = 0

# Total likes of the three actors plus the director; a row is NaN
# as soon as any one of the four inputs is missing.
movie['actor_director_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
    .add(movie['director_facebook_likes'])
)

In [55]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seed,actor_director_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,2791.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,


In [56]:
movie['actor_director_facebook_likes'].isnull().sum()
# In total, the new column has 122 missing values.

122

In [58]:
# Replace the missing totals with 0 so the column can be compared below.
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [60]:
# Boolean column: is the cast total at least as large as the actor + director total?
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'].ge(
    movie['actor_director_facebook_likes']
)

In [61]:
movie['is_cast_likes_more'].all()
# all() returns True only when every value in the column is True.

False

In [62]:
# Remove the combined actor + director column again (axis=1 selects columns).
movie = movie.drop('actor_director_facebook_likes', axis=1)

In [64]:
# Total likes of the three actors only; rows where any input is missing
# come out as NaN and are then replaced with 0.
movie['actor_total_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
    .fillna(0)
)

In [66]:
# Repeat the comparison, this time against the actor-only total,
# and check whether it holds for every row.
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'].ge(
    movie['actor_total_facebook_likes']
)
movie['is_cast_likes_more'].all()

True

In [67]:
# Share of the cast's total likes that comes from the three listed actors.
movie['pct_actor_cast_like'] = movie['actor_total_facebook_likes'].div(
    movie['cast_total_facebook_likes']
)

In [68]:
# Sanity check: the smallest and largest ratio should both fall between 0 and 1,
# since the cast total was shown above to be at least the actor total in every row.
(movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max())

(0.0, 1.0)

In [69]:
movie.set_index('movie_title')['pct_actor_cast_like'].head()

movie_title
Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    0.000000
Name: pct_actor_cast_like, dtype: float64

The **drop** method accepts the name of the row or column to delete. By default it drops rows by index label (`axis=0`, or the string `'index'`). To drop columns instead, set the axis parameter to either `1` or `'columns'`.

In [70]:
# Integer position right after the 'gross' column; used below as the
# insertion point for a new column.
profit_index = movie.columns.get_loc('gross') + 1
profit_index

9

In [71]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seed,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,237000000.0,2009.0,936.0,7.9,1.78,33000,0,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,300000000.0,2007.0,5000.0,7.1,2.35,0,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,245000000.0,2015.0,393.0,6.8,2.35,85000,0,True,11554.0,0.987521
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,True,73000.0,0.683783
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,12.0,7.1,,0,0,True,0.0,0.0


In [72]:
# Insert profit (gross minus budget) directly after the 'gross' column.
# DataFrame.insert modifies `movie` in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to re-run.
if 'profit' not in movie.columns:
    movie.insert(loc=profit_index, column='profit', value=(movie['gross'] - movie['budget']))

In [73]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,profit,...,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seed,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,523505847.0,...,237000000.0,2009.0,936.0,7.9,1.78,33000,0,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,9404152.0,...,300000000.0,2007.0,5000.0,7.1,2.35,0,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,-44925825.0,...,245000000.0,2015.0,393.0,6.8,2.35,85000,0,True,11554.0,0.987521
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,198130642.0,...,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,True,73000.0,0.683783
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,,...,,,12.0,7.1,,0,0,True,0.0,0.0


In [75]:
# Remove the has_seed column from `movie` using the del statement.
del movie['has_seed']

> **insert()** takes three arguments and modifies the DataFrame in place:
> * 1: the integer position of the new column
> * 2: the name of the new column
> * 3: the value(s) of the new column

> An alternative to deleting columns with the drop method is to use the **del** statement.