# Understanding Space Utilization

Data objects can take up a lot of space.

In [1]:
import pandas as pd

In [2]:
#Load the IMDB ratings dataset from a remote CSV (requires network access)
movies = pd.read_csv('http://bit.ly/imdbratings')

### IMPORTANT: OBJECT column types take up a LOT of space.

In [3]:
#The default memory usage reported by info() is only a lower bound (note the "+"):
#for object columns it counts the 8-byte pointers, not the strings they point to
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979 entries, 0 to 978
Data columns (total 6 columns):
star_rating       979 non-null float64
title             979 non-null object
content_rating    976 non-null object
genre             979 non-null object
duration          979 non-null int64
actors_list       979 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 46.0+ KB


In [4]:
#Specifying memory_usage='deep' makes info() inspect the referenced objects
#themselves, so it reports the true memory usage
movies.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979 entries, 0 to 978
Data columns (total 6 columns):
star_rating       979 non-null float64
title             979 non-null object
content_rating    976 non-null object
genre             979 non-null object
duration          979 non-null int64
actors_list       979 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 313.4 KB


In [6]:
#Per-column memory usage in bytes.
#Notice the difference in size for columns ['star_rating','duration'] compared to the rest:
#they are much smaller because they are not object types
movies.memory_usage(deep=True)

Index                 80
star_rating         7832
title              70967
content_rating     62698
genre              61963
duration            7832
actors_list       109533
dtype: int64

In [8]:
#Total bytes across the index and all columns (320905 bytes is about 313.4 KB)
movies.memory_usage(deep=True).sum()

320905

## Use Categories to store categories
Let's store our Strings as integers to save a lot of space

Categories store the underlying strings as integers with a lookup

In [9]:
#Preview the first five rows; genre and content_rating repeat a small set of values
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [10]:
#Convert genre from a string (object) column to the category dtype,
#which stores small integer codes plus a lookup table of the distinct strings
movies['genre'] = movies['genre'].astype('category')

In [11]:
#Notice that genre is now a category
movies.dtypes

star_rating        float64
title               object
content_rating      object
genre             category
duration             int64
actors_list         object
dtype: object

In [12]:
#Note the new "Categories" line at the bottom of the output
movies['genre'].head()

0     Crime
1     Crime
2     Crime
3    Action
4     Crime
Name: genre, dtype: category
Categories (16, object): [Action, Adventure, Animation, Biography, ..., Mystery, Sci-Fi, Thriller, Western]

In [13]:
#To see the underlying integer codes that each genre value was converted to
movies['genre'].cat.codes.head()

0    5
1    5
2    5
3    0
4    5
dtype: int8

In [14]:
#Now let's check whether the size has changed
#We see that genre has gone from about 62KB all the way down to 2KB
movies.memory_usage(deep=True)

Index                 80
star_rating         7832
title              70967
content_rating     62698
genre               2003
duration            7832
actors_list       109533
dtype: int64

In [15]:
#And we can see that the total dataset size has gone from 313KB down to 255KB
movies.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979 entries, 0 to 978
Data columns (total 6 columns):
star_rating       979 non-null float64
title             979 non-null object
content_rating    976 non-null object
genre             979 non-null category
duration          979 non-null int64
actors_list       979 non-null object
dtypes: category(1), float64(1), int64(1), object(3)
memory usage: 254.8 KB


In [16]:
#Converting content_rating to a category yields a similarly large saving
movies['content_rating'] = movies['content_rating'].astype('category')

In [17]:
#content_rating has dropped from 62698 bytes to 1739 bytes
movies.memory_usage(deep=True)

Index                 80
star_rating         7832
title              70967
content_rating      1739
genre               2003
duration            7832
actors_list       109533
dtype: int64

In [18]:
#This gets us down to 195KB, for a total space savings of nearly 40%
movies.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979 entries, 0 to 978
Data columns (total 6 columns):
star_rating       979 non-null float64
title             979 non-null object
content_rating    976 non-null category
genre             979 non-null category
duration          979 non-null int64
actors_list       979 non-null object
dtypes: category(2), float64(1), int64(1), object(2)
memory usage: 195.3 KB


# Random - Create a Dataframe From a Dict

In [29]:
#Build a small DataFrame from a dictionary: each key becomes a column name
#and each list supplies that column's values
df = pd.DataFrame(
    {
        'ID': [100, 101, 102, 104],
        'quality': ['good', 'very good', 'good', 'excellent'],
    }
)
df

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,104,excellent


## Create a logical ordering with a category so that sorting sorts in a logical manner

In [30]:
#Strings get sorted in alphabetical order, which is not the logical order for quality levels
df.sort_values('quality')

Unnamed: 0,ID,quality
3,104,excellent
0,100,good
2,102,good
1,101,very good


In [31]:

#Cast quality to an ordered categorical so that good < very good < excellent.
#The ordering is declared through a CategoricalDtype: passing categories=/ordered=
#directly to astype('category', ...) was deprecated in pandas 0.21 and no longer works.
df['quality'] = df['quality'].astype(
    pd.api.types.CategoricalDtype(categories=['good','very good', 'excellent'], ordered=True)
)

In [32]:
#Note the declared ordering in the output: good < very good < excellent
df.quality

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [35]:
#Note that the sort now follows the category order, as you would expect
df.sort_values('quality')

Unnamed: 0,ID,quality
0,100,good
2,102,good
1,101,very good
3,104,excellent


In [36]:
#Now you can perform boolean comparison operations against the ordered categories,
#e.g. keep only the rows whose quality ranks above 'good'
df.loc[df['quality'] > 'good', :]

Unnamed: 0,ID,quality
1,101,very good
3,104,excellent
