## How do I filter rows of a pandas DataFrame by column value?

In [1]:
import pandas as pd

In [2]:
#create dataframe: read the ratings CSV behind the URL below (needs network access)
movies_path = 'http://bit.ly/imdbratings'
movies = pd.read_csv(movies_path)

In [3]:
#preview the first two rows to see which columns were loaded
movies.head(2)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"


In [4]:
#(number of rows, number of columns) of the dataframe
movies.shape

(979, 6)

### How to build a filter by yourself

In [5]:
#Filter 1: DIY-Filter
#one plain Python bool per row: True when the duration is at least 200
booleans = [bool(runtime >= 200) for runtime in movies.duration]

In [6]:
#peek at the first five flags
booleans[:5]

[False, False, True, False, False]

In [7]:
#the DIY filter is still a plain Python list, not a pandas object
type(booleans)

list

In [8]:
#one flag per row: the length should equal the row count reported by movies.shape
len(booleans)

979

In [23]:
#create a boolean Series from the list; it is built on movies' own index so the
#filter lines up with the rows even if that index is not the default 0..n-1
is_long = pd.Series(booleans, index=movies.index)
type(is_long)

pandas.core.series.Series

In [11]:
#pass filter-series "is_long" to movies: only the rows flagged True are kept
movies[is_long].head(3)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."


### How a filter works

In [12]:
#the comparison runs element-wise and returns a boolean Series;
#this is the expression used inside the brackets in the next cell
is_long = movies.duration >= 200
is_long.head(3)

0    False
1    False
2     True
Name: duration, dtype: bool

In [13]:
#the inner comparison returns the boolean Series shown in the previous cell;
#indexing movies with it keeps only the rows flagged True, then .genre picks that column
movies[movies.duration >= 200].genre

2          Crime
7      Adventure
17         Drama
78         Crime
85     Adventure
142    Adventure
157        Drama
204    Adventure
445    Adventure
476        Drama
630    Biography
767       Action
Name: genre, dtype: object

In [14]:
#IMPORTANT: PANDAS
#loc #filter #select #pandas

#Best practice: give .loc the row filter and the column label in a single call;
#the chained form in the cell above can cause problems (e.g. when assigning values)
movies.loc[movies.duration >= 200, 'genre']

2          Crime
7      Adventure
17         Drama
78         Crime
85     Adventure
142    Adventure
157        Drama
204    Adventure
445    Adventure
476        Drama
630    Biography
767       Action
Name: genre, dtype: object

#### How do I filter with multiple conditions?

In [15]:
#IMPORTANT: PANDAS
#filter #conditions #multipleconditions #select #pandas

#OR-filtering: a row is kept when at least one condition is True;
#each condition needs its own parentheses because | binds tighter than ==
movies[
    (movies.genre == 'Action')
    | (movies.genre == 'Drama')
    | (movies.genre == 'Western')
].head(3)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."


In [16]:
#IMPORTANT: PANDAS
#filter #isin #conditions #multipleconditions #select #pandas
#isin flags every row whose genre equals any entry of the list
movies[movies.genre.isin(['Action', 'Drama', 'Western'])].head(3)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."


In [17]:
#IMPORTANT: PANDAS
#filter #isin #conditions #tilde #multipleconditions #select #pandas

#the tilde (~) inverts a boolean Series element-wise, so this keeps every row
#whose genre is NOT in the list (plain Python "not" does not work on a Series)
movies[~movies.genre.isin(['Action', 'Drama', 'Western'])].head(3)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."


### How to filter a DataFrame by largest categories?

In [18]:
#number of movies per genre; value_counts sorts by count in descending order by default
counts = movies.genre.value_counts()
counts.head(3)

Drama     278
Comedy    156
Action    136
Name: genre, dtype: int64

In [19]:
#get largest genres: the three genres with the highest counts
counts.nlargest(3)

Drama     278
Comedy    156
Action    136
Name: genre, dtype: int64

In [20]:
#IMPORTANT: PANDAS
#filter #conditions #multipleconditions #select #pandas
#the index of the counts Series holds the genre labels themselves
counts.nlargest(3).index

Index(['Drama', 'Comedy', 'Action'], dtype='object')

In [21]:
#IMPORTANT: PANDAS
#filter #isin #conditions #multipleconditions #select #pandas
#keep only the movies whose genre is one of the three most frequent genres
movies[movies.genre.isin(counts.nlargest(3).index)].head(3)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
