In [2]:
# Import the dependancies
import pandas as pd
import numpy as np

In [3]:
# USE CONDITIONAL SELECTION
# We set up decision making code through conditionals.
# A CONDITION is a Boolean expression, `True` or `False`, that we write in syntax.
# CONDITIONAL SELECTION works in a similar way but with data.
# If the data meets the criteria that the conditional expression specifies, the code selects it.
# If the data doesn't meet the criteria, the code doesn't select it.
# With conditional selection, analysts can focus on subsets of data that are more important for their analyses.

In [10]:
# USE BOOLEAN SELECTION
# To help analysts conditionally filter data, Pandas is designed for the `iloc` and `loc` functions to accept boolean filters.
# BOOLEAN FILTER, as it's called, is a Python list or Pandas Series of True and False values.
# For example:
boolean_filter = [True, True, False, False, True]

# I will create a DataFrame called aapl_returns containing five random values.
# Then pass the boolean_filter list to the `loc` function:
aapl_returns = pd.DataFrame({'AAPL': np.random.randn(5)})
display(aapl_returns)

aapl_boolean = aapl_returns.loc[boolean_filter]
display(aapl_boolean)

# As you can see, the Boolean filter inside `iloc` and `loc` functions keeps the rows and columns that have the `True` values.
# The `False` values are then dropped.
# The function uses the index position of the Boolean Filter to determine whether to keep or drop the data existing in that specific position.
# In order for the Boolean Filter to work across the whole dataset, the number of elements in the filter must match the number of rows in the DataFrame.
# If the number of rows exceeds the number of elements in the Boolean, the function assumes that the Boolean value is `False` and drops those rows.

Unnamed: 0,AAPL
0,-0.396157
1,0.808389
2,-0.651348
3,2.967533
4,-1.037365


Unnamed: 0,AAPL
0,-0.396157
1,0.808389
4,-1.037365


In [11]:
# We can also use Boolean Filters to filter out columns of data:
daily_returns = pd.DataFrame({
    'AAPL': np.random.randn(5),
    'GOOG': np.random.randn(5),
    'MSFT': np.random.randn(5)}
)
display(daily_returns)

boolean_filters = [True, True, False]
boolean_returns = daily_returns.loc[:, boolean_filters]
display(boolean_returns)

Unnamed: 0,AAPL,GOOG,MSFT
0,-2.329426,0.1915,-2.58377
1,-0.006551,-0.471023,-0.053344
2,0.964058,-0.228542,0.931296
3,0.275024,-1.391248,0.848593
4,-1.484261,0.545066,-0.498635


Unnamed: 0,AAPL,GOOG
0,-2.329426,0.1915
1,-0.006551,-0.471023
2,0.964058,-0.228542
3,0.275024,-1.391248
4,-1.484261,0.545066


In [12]:
# USE CONDITIONAL STATEMENTS
# Can you imagine writing a Boolean Filter for 1 million rows of data? Hell on Earth!
# For a task of that size, we can use CONDITIONAL STATEMENTS.
# This creates a Boolean Filter from conditional statements using the values of the DataFrame.
# Let's have quick refresher on the Python Comparison Operators:
    # 1. == | x == y | Checks for equality between two elements. Translates to `is equal to`
    # 2. > | x > y | Checks that the first element is greater than the second element.
    # 3. >= | x >= y | Checks that the first element is greater than or equal to the second.
    # 4. < | x < y | Checks that the first element is less than the second element.
    # 5. <= | x <= y | Checks that the first element is less than or equal to the second.
# Suppose you're given a DataFrame containing 20 return values for Apple Stock.
# You're asked to provide only those data points that have positive returns, meaning those that are greater than zero.
# To get this information, you can use the conditional statement as follows:

In [18]:
# Create a returns DataFrame for Apple
aapl_returns = pd.DataFrame({'AAPL': np.random.randn(10) + 1})

# Create a conditional statement
# The condition evaluates `True` if the aapl_returns[''] are greater than 0 
# The condition evaluates `False` otherwise
display(aapl_returns.head())
display(aapl_returns["AAPL"] > 0)

Unnamed: 0,AAPL
0,1.240222
1,-1.05653
2,1.782359
3,1.56516
4,1.582211


0     True
1    False
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
Name: AAPL, dtype: bool

In [17]:
# The key takeway is that the code labeled all DataFrame values less than zero as false - just as our conditional instructed.
# Now let's use this Boolean Filter to keep the positive values and drop the negative values:
boolean_filter = aapl_returns['AAPL'] > 0

display(aapl_returns.loc[boolean_filter])

Unnamed: 0,AAPL
0,2.137651
1,3.032831
4,1.452007
5,2.345593
6,2.322372
7,1.722145
8,0.422976
9,1.437627


In [23]:
# What happens if you want to look at only returns that are between certain values?
# Say, greater than zero, but less than or equal to 2%?
# We can do this by combining multiple conditional statements with Python logical operators.
# Let's get a quick refresher on Python logical operators:
    # 1. Operator: and | Bitwise Operator: & | Ex: x and y | True if both x and y are True, otherwise False.
    # 2. Operator: or | Bitwise Operator: ( | ) | Ex: x or y | True if either x or y is True, otherwise False.
    # 3. Operator: Not | Bitwise Operator: ~ | Ex: not y | True if x is False, False if x is True.
# First, we use a descriptive variable to identify the original conditional statement, which filters out negative return values.
# Then, we use a variable to identify the conditional statement that removes the return values that are greater than 2%:

# Filter out all negative values from the DataFrame.
filter_low = aapl_returns['AAPL'] > 0 
aapl_low = aapl_returns.loc[filter_low]

# Display results
aapl_low

Unnamed: 0,AAPL
0,1.240222
2,1.782359
3,1.56516
4,1.582211
5,2.117555
6,1.483084
7,1.774228
8,1.459065
9,0.858777


In [24]:
# Filter all values less than 2%
filter_high = aapl_returns['AAPL'] <= 2
aapl_high = aapl_returns.loc[filter_high]

# Display results
aapl_high

Unnamed: 0,AAPL
0,1.240222
1,-1.05653
2,1.782359
3,1.56516
4,1.582211
6,1.483084
7,1.774228
8,1.459065
9,0.858777


In [25]:
# Now put these together by using the ampersand (&) operator to combine both filters.
# Create a variable that combines both filters
aapl_low_high = aapl_returns.loc[filter_low & filter_high]

# Display the results
aapl_low_high

Unnamed: 0,AAPL
0,1.240222
2,1.782359
3,1.56516
4,1.582211
6,1.483084
7,1.774228
8,1.459065
9,0.858777
