# cuDF 
**cuDF is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.**

# Load libraries

In [1]:
import os

import pandas as pd
import numpy as np

import cupy as cp
import cudf as cd

import s3fs

# Import data from csv

### **movies_pdf** is our Pandas DF

In [40]:
%%timeit 
movies_pdf = pd.read_csv("s3://bsql/data/rapids_intro/movies.csv")

132 ms ± 8.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### **movies_cdf** is our cuDF

In [42]:
%%timeit 
movies_cdf = cd.read_csv("s3://bsql/data/rapids_intro/movies.csv", storage_options = {'anon': True})

124 ms ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Gather dataset statistics

In [4]:
print(movies_pdf.shape)
print(movies_pdf.ndim)
print(len(movies_pdf))

(4916, 28)
2
4916


In [5]:
print(movies_cdf.shape)
print(movies_cdf.ndim)
print(len(movies_cdf))

(4916, 28)
2
4916


# Explore Data

In [6]:
movies_pdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [7]:
movies_cdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [8]:
movies_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4897 non-null   object 
 1   director_name              4814 non-null   object 
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object 
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object 
 10  actor_1_name               4909 non-null   object 
 11  movie_title                4916 non-null   object 
 12  num_voted_users            4916 non-null   int64  
 13  cast_total_facebook_likes  4916 non-null   int64

In [9]:
movies_cdf.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   color                      4897 non-null   object
 1   director_name              4814 non-null   object
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object
 10  actor_1_name               4909 non-null   object
 11  movie_title                4916 non-null   object
 12  num_voted_users            4916 non-null   int64
 13  cast_total_facebook_likes  4916 non-null   int64
 14  acto

In [10]:
movies_pdf.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [11]:
movies_cdf.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

# Select subsets of the dataframe

### Select fields that are continuous data only

In [12]:
movies_pdf.select_dtypes(include="number").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [13]:
movies_cdf.select_dtypes(include="number").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


### Select fields that are floats only

In [14]:
movies_pdf.select_dtypes(include="float").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35
4,,,131.0,,131.0,,0.0,,,,12.0,7.1,


In [15]:
movies_cdf.select_dtypes(include="float").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35
4,,,131.0,,131.0,,0.0,,,,12.0,7.1,


### Select fields that are discrete values

In [16]:
movies_pdf.select_dtypes(include="object").head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,


In [17]:
movies_cdf.select_dtypes(include="object").head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,


# Data Analysis

### Summary statistics for all continuous data fields

In [43]:
%%timeit 
movies_pdf.select_dtypes(include="number").describe()

26.3 ms ± 596 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [45]:
%%timeit 
movies_cdf.select_dtypes(include="number").describe()

208 ms ± 4.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Summary statistics for all discrete value fields

In [46]:
%%timeit 
movies_pdf.select_dtypes(include="object").describe()

30.1 ms ± 1.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [47]:
%%timeit 
movies_cdf.select_dtypes(include="object").describe()

306 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Transpose the cuDF describe results

(this can be done in Pandas too)

In [52]:
%%timeit 
movies_pdf.select_dtypes(include="number").describe().T

26.5 ms ± 558 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [53]:
%%timeit 
movies_cdf.select_dtypes(include="number").describe().T

210 ms ± 3.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Covaraiance calculation of two continuous variables

In [54]:
%%timeit
movies_pdf.movie_facebook_likes.cov(movies_pdf.actor_3_facebook_likes)

195 µs ± 4.23 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [55]:
%%timeit
movies_cdf.movie_facebook_likes.cov(movies_cdf.actor_3_facebook_likes)

12.6 ms ± 113 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Pearson correlation of two continuous variables

In [56]:
%%timeit
movies_pdf.movie_facebook_likes.corr(movies_pdf.actor_3_facebook_likes)

259 µs ± 14.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [57]:
%%timeit
movies_cdf.movie_facebook_likes.corr(movies_cdf.actor_3_facebook_likes)

12.7 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Groupby
Analyze the gross amounts generated by the two main actors

In [60]:
%%timeit
movies_pdf[['actor_1_name','actor_2_name','gross']].groupby(['actor_1_name','actor_2_name']).sum()

7.01 ms ± 82.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [59]:
%%timeit
movies_cdf[['actor_1_name','actor_2_name','gross']].groupby(['actor_1_name','actor_2_name']).sum()

8.16 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Data Preparation

Genres has multiple values that are combinations of several genres.  For example: `Action|Adventure|Comedy|Fantasy|Sci-Fi`

In [29]:
print('There are ' + str(len(movies_pdf.genres.unique())) + ' genre combinations in the genres field')
print('Examples:\n', movies_pdf.genres.unique()[:10])

There are 914 genre combinations in the genres field
Examples:
 ['Action|Adventure|Fantasy|Sci-Fi' 'Action|Adventure|Fantasy'
 'Action|Adventure|Thriller' 'Action|Thriller' 'Documentary'
 'Action|Adventure|Sci-Fi' 'Action|Adventure|Romance'
 'Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance'
 'Adventure|Family|Fantasy|Mystery' 'Action|Adventure']


## Splitting the Genre column using Pandas

In [30]:
genres_pdf = movies_pdf.join(movies_pdf.genres.str.split('|', expand=True).add_prefix('genre_'))

## Splitting the Genre column using cuDF
cuDF does not have the add_prefix() option when splitting a column.  Here is one way to rename your columns in cuDF.

In [31]:
genre_fields = len(movies_cdf.genres.str.split('|', expand=True).columns)
print('There will be ' + str(genre_fields) + ' new columns that will be added into our dataframe\n')
genres_cdf = movies_cdf.join(movies_cdf.genres.str.split('|', expand=True))
genres_cdf.head()

There will be 8 new columns that will be added into our dataframe



Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,aspect_ratio,movie_facebook_likes,0,1,2,3,4,5,6,7
2880,Color,Mike McCoy,216.0,110.0,16.0,11.0,Jason Cottle,93.0,70011073.0,Action|Adventure|Drama|Thriller|War,...,2.35,15000,Action,Adventure,Drama,Thriller,War,,,
2896,Color,Brian Henson,75.0,89.0,53.0,84.0,Jerry Nelson,227.0,27281507.0,Comedy|Drama|Family|Fantasy|Musical,...,1.85,0,Comedy,Drama,Family,Fantasy,Musical,,,
2888,Color,Tamra Davis,111.0,93.0,33.0,135.0,Katherine Boecher,1000.0,37188667.0,Comedy|Drama,...,1.85,0,Comedy,Drama,,,,,,
2904,Color,,68.0,55.0,,898.0,Kelvin Taylor,1000.0,,Action|Adventure|Biography|Drama|History,...,16.0,31000,Action,Adventure,Biography,Drama,History,,,
2881,Color,Anne Fletcher,107.0,104.0,98.0,920.0,Alyson Stoner,17000.0,65269010.0,Crime|Drama|Music|Romance,...,2.35,0,Crime,Drama,Music,Romance,,,,


**The new column names are assigned numbers and are not strings**

In [32]:
col_numbers = genres_cdf.columns[-genre_fields:].to_list()
print(col_numbers)

[0, 1, 2, 3, 4, 5, 6, 7]


**Need to convert them to strings and give them a prefix and convert both lists into a dictionary to rename the new columns in the cuDF**

In [33]:
new_col_names = ['genre_' + str(x) for x in col_numbers] 
print(new_col_names)

new_col_dict = dict(zip(col_numbers, new_col_names))
print(new_col_dict)

['genre_0', 'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6', 'genre_7']
{0: 'genre_0', 1: 'genre_1', 2: 'genre_2', 3: 'genre_3', 4: 'genre_4', 5: 'genre_5', 6: 'genre_6', 7: 'genre_7'}


In [34]:
genres_cdf = genres_cdf.rename(columns=new_col_dict)

In [35]:
genres_cdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,aspect_ratio,movie_facebook_likes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
2880,Color,Mike McCoy,216.0,110.0,16.0,11.0,Jason Cottle,93.0,70011073.0,Action|Adventure|Drama|Thriller|War,...,2.35,15000,Action,Adventure,Drama,Thriller,War,,,
2896,Color,Brian Henson,75.0,89.0,53.0,84.0,Jerry Nelson,227.0,27281507.0,Comedy|Drama|Family|Fantasy|Musical,...,1.85,0,Comedy,Drama,Family,Fantasy,Musical,,,
2888,Color,Tamra Davis,111.0,93.0,33.0,135.0,Katherine Boecher,1000.0,37188667.0,Comedy|Drama,...,1.85,0,Comedy,Drama,,,,,,
2904,Color,,68.0,55.0,,898.0,Kelvin Taylor,1000.0,,Action|Adventure|Biography|Drama|History,...,16.0,31000,Action,Adventure,Biography,Drama,History,,,
2881,Color,Anne Fletcher,107.0,104.0,98.0,920.0,Alyson Stoner,17000.0,65269010.0,Crime|Drama|Music|Romance,...,2.35,0,Crime,Drama,Music,Romance,,,,


# One Hot Encoding

## OHE using Pandas
Applied on the genre_0 column

In [61]:
%%timeit
pd_ohe = pd.get_dummies(genres_pdf.genre_0, prefix='genre_0')
df = pd.concat([genres_pdf, pd_ohe], axis=1)
df.head()

4.62 ms ± 209 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## OHE using cuDF
Applied on the genre_0 column

In [62]:
%%timeit
cdf = cd.get_dummies(genres_cdf, prefix='genre_0', columns=['genre_0'])
cdf.head()

77.8 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
