In [None]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# OS access
import os

# Custom module
import util

# Lift pandas' display limits so frames are rendered with every row and column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
def get_quartet_data():

    '''
    Returns the Anscombe's Quartet data as a dataframe.

    If 'quartet.csv' exists in the current working directory it is read and returned. Otherwise
    the csv is downloaded from the gist url, cached locally as 'quartet.csv', and the downloaded
    dataframe is returned.
    '''

    file = 'quartet.csv'

    # A cached copy exists, so no download is needed.
    if os.path.isfile(file):
        return pd.read_csv(file)

    url = "https://gist.githubusercontent.com/ryanorsinger/6218f5731f3df7cb4771ff3879fdeaa3/raw/88cb6bed276e2236c33df011bd753b6c73832c30/quartet.csv"

    df = pd.read_csv(url)

    # index=False keeps the row index out of the cache. Without it, reading the cached file
    # back yields an extra 'Unnamed: 0' column that the freshly downloaded frame does not have.
    df.to_csv(file, index=False)

    return df

In [None]:
# Load the quartet data (local csv if present, otherwise the url) and preview the first three rows.
df = get_quartet_data()
df.head(3)

In [None]:
%matplotlib inline

sns.relplot(x='x', y='y', col='dataset', data=df).fig.suptitle("Anscombe's Quartet")
df.groupby("dataset").describe()

1. Using lemonade.csv dataset and focusing on continuous variables:
- Use the IQR Range Rule and the upper and lower bounds to identify the lower outliers of each column of lemonade.csv, using the multiplier of 1.5. Do these lower outliers make sense? Which outliers should be kept?
- Use the IQR Range Rule and the upper and lower bounds to identify the upper outliers of each column of lemonade.csv, using the multiplier of 1.5. Do these upper outliers make sense? Which outliers should be kept?
- Using the multiplier of 3, IQR Range Rule, and the lower bounds, identify the outliers below the lower bound in each column of lemonade.csv. Do these lower outliers make sense? Which outliers should be kept?
- Using the multiplier of 3, IQR Range Rule, and the upper bounds, identify the outliers above the upper_bound in each column of lemonade.csv. Do these upper outliers make sense? Which outliers should be kept?

In [None]:
def get_lower_and_upper_bounds(series, multiplier = 1.5):
    '''
    Computes the InterQuartile Range fences of a series and prints them rounded to one decimal.

    The fences sit `multiplier` IQRs above the third quartile and below the first quartile.
    Default multiplier is 1.5.

    Note the return order: despite the function's name, the tuple returned is (upper, lower).
    '''
    first_quartile, third_quartile = series.quantile([.25, .75])

    # Distance from each quartile out to its fence.
    fence_width = multiplier * (third_quartile - first_quartile)

    upper = third_quartile + fence_width
    lower = first_quartile - fence_width

    print(f'{series.name}\'s Upper bound is {round(upper, 1)}, and Lower bound is {round(lower, 1)}')
    return upper, lower

In [None]:
# Local cache for the lemonade data. A relative path is used so the notebook does not depend
# on one user's home directory, so the cache can be written wherever the notebook runs.
csv = "lemonade.csv"

if os.path.isfile(csv):
    # Cached copy exists: read it instead of downloading.
    df = pd.read_csv(csv)

else:
    url = "https://gist.githubusercontent.com/ryanorsinger/19bc7eccd6279661bd13307026628ace/raw/e4b5d6787015a4782f96cad6d1d62a8bdbac54c7/lemonade.csv"

    df = pd.read_csv(url)

    # Write to `csv`, the path checked above, so the next run finds the cache.
    # index=False keeps the row index out of the file, so re-reading it does not
    # add an 'Unnamed: 0' column.
    df.to_csv(csv, index=False)

df.head(3)

In [None]:
# Show the dtype of each column in the lemonade frame.
df.dtypes

In [None]:
# Draw pandas' histograms for the lemonade frame on one large (24 x 18) figure.
df.hist(figsize=(24, 18))
plt.show()

In [None]:
# Short aliases for the lemonade columns analysed in the cells below.
Temperature = df.Temperature
Rainfall = df.Rainfall
Flyers = df.Flyers
Price = df.Price
Sales = df.Sales

# Z-score: how many standard deviations each temperature lies from the mean.
# The arithmetic already produces a Series, so no pd.Series(...) wrapper is needed.
zscores = (Temperature - Temperature.mean()) / Temperature.std()

# Observations two standard deviations or more from the mean.
temp_outliers_2sigma = Temperature[zscores.abs() >= 2]

# Observations three standard deviations or more from the mean.
temp_outliers_3sigma = Temperature[zscores.abs() >= 3]

# A cell renders only its last bare expression, so the 2-sigma result would otherwise be
# computed and silently dropped. Display both selections explicitly.
display(temp_outliers_2sigma, temp_outliers_3sigma)


In [None]:
# Compute the default (1.5 x IQR) bounds for every column in one pass.
# Each call returns (upper, lower), which is unpacked into that column's pair of names.
(
    (upper_temp, lower_temp),
    (upper_rain, lower_rain),
    (upper_fly, lower_fly),
    (upper_price, lower_price),
    (upper_sales, lower_sales),
) = [
    get_lower_and_upper_bounds(column)
    for column in (Temperature, Rainfall, Flyers, Price, Sales)
]

In [None]:
# Evaluate the bounds for each column again (which prints them again) and show the five
# (upper, lower) pairs as the cell output.
tuple(
    get_lower_and_upper_bounds(column)
    for column in (Temperature, Rainfall, Flyers, Price, Sales)
)

In [None]:
# Grid of pairwise plots across the columns of the lemonade frame.
sns.pairplot(df)

In [None]:
# Temperatures that fall below Temperature's lower bound.
below_lower_bound = Temperature < lower_temp
low_outliers = Temperature[below_lower_bound]
low_outliers

In [None]:
# Temperatures that rise above Temperature's upper bound.
above_upper_bound = Temperature > upper_temp
hi_outliers = Temperature[above_upper_bound]
hi_outliers

In [None]:
# Rainfall values below Rainfall's own lower bound (lower_rain, not the Temperature bound).
low_outliers = Rainfall[Rainfall < lower_rain]
low_outliers

In [None]:
# Rainfall values above Rainfall's own upper bound (upper_rain, not the Temperature bound).
hi_outliers = Rainfall[Rainfall > upper_rain]
hi_outliers

In [None]:
# Flyers values below Flyers' own lower bound (lower_fly, not the Temperature bound).
low_outliers = Flyers[Flyers < lower_fly]
low_outliers

In [None]:
# Flyers values above Flyers' own upper bound (upper_fly, not the Temperature bound).
hi_outliers = Flyers[Flyers > upper_fly]
hi_outliers

In [None]:
# Price values below Price's own lower bound (lower_price, not the Temperature bound).
low_outliers = Price[Price < lower_price]
low_outliers

In [None]:
# Price values above Price's own upper bound (upper_price, not the Temperature bound).
hi_outliers = Price[Price > upper_price]
hi_outliers

In [None]:
# Sales values below Sales' own lower bound (lower_sales, not the Temperature bound).
low_outliers = Sales[Sales < lower_sales]
low_outliers

In [None]:
# Sales values above Sales' own upper bound (upper_sales, not the Temperature bound).
hi_outliers = Sales[Sales > upper_sales]
hi_outliers

2. Identify if any columns in lemonade.csv are normally distributed. For normally distributed columns:

- Use a 2 sigma decision rule to isolate the outliers.

- Do these make sense?
- Should certain outliers be kept or removed?

In [None]:
# List the temperatures from highest to lowest.
Temperature.sort_values(ascending=False)

3. Now use a 3 sigma decision rule to isolate the outliers in the normally distributed columns from lemonade.csv