# SLU6 - Intermediate statistics: Exercise notebook

In [None]:
import pandas as pd 
import numpy as np 
import math 

In this notebook you will practice the following: 

    - Covariance 
    - Pearson correlation
    - Spearman correlation
    - Correlation matrix
    - Spurious correlations

# Exercise 1: implement covariance 

Here you will implement covariance by completing the following function. 

Don't worry too much about generalizing, this will be an extremely naïve implementation, just to get your hands dirty! 

Quick reminder of the formula: 
$$ cov = \frac{\sum{(X - X_{avg})(Y - Y_{avg})}}{n-1} $$

#### Complete here: 

In [None]:
def covariance_by_hand(s1, s2):
    """
    Naive, step-by-step implementation of covariance (exercise scaffold).

    The commented steps below walk through the formula
    sum((X - X_avg) * (Y - Y_avg)) / (n - 1). Replace each
    `raise NotImplementedError()` placeholder with your own code.

    Args:
        s1 (pd.Series): a pandas series
        s2 (pd.Series): a pandas series (the same index and length as s1)

    Returns:
        covariance (float): the covariance between s1 and s2

    Raises:
        NotImplementedError: while any placeholder below is still in place

    """
    
    # Step 1: get the mean of each series, save them as s1_avg and s2_avg (~2 lines)
    # s1_avg = ...
    # s2_avg = ...
    # YOUR CODE HERE
    raise NotImplementedError()

    
    # Step 2: for each series, subtract its mean from every point (~2 lines)
    # s1_minus_s1_avg = ...
    # s2_minus_s2_avg = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Step 3: get the element-wise product of the two centered series (~1 line)
    # element_wise_product = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Step 4: sum the element-wise products to get the numerator of the equation (~1 line)
    # element_wise_product_sum = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Step 5: divide the numerator by (number of elements - 1) (~2 lines)
    # clue: use s1 to get the number of elements
    # n = ...
    # covariance = ...
    # YOUR CODE HERE
    raise NotImplementedError()

    return covariance 

In [None]:
# Two small sample series used to try out covariance_by_hand; the printed
# value is rounded to two decimals by the '%0.02f' format.
a = pd.Series([1, 5, 7, 10, 25])
b = pd.Series([15, 30, 28, 45, 50])
print('Covariance by hand between a and b: %0.02f' % covariance_by_hand(a, b))

Expected output:   

    Covariance by hand between a and b: 112.05

In [None]:
# Checks for exercise 1: compare covariance_by_hand against reference values
# for two pairs of series, using math.isclose with its default tolerances.
# The series are redefined here so the cell does not depend on earlier cells.
a = pd.Series([1, 5, 7, 10, 25])
b = pd.Series([15, 30, 28, 45, 50])
c = pd.Series([22, 55, 23, 15, 92])
assert math.isclose(covariance_by_hand(a, b), 112.05)
assert math.isclose(covariance_by_hand(a, c), 230.2)

# Exercise 2: implement pearson correlation

Correlation is simply normalized covariance! 

$$ correlation = \frac{covariance(X, Y)}{\sqrt{Var(X) * Var(Y)}} $$

#### Complete here: 

In [None]:
def pearson_correlation_by_hand(s1, s2): 
    """
    Naive implementation of Pearson correlation (exercise scaffold).

    The commented steps below walk through the formula
    covariance(X, Y) / sqrt(Var(X) * Var(Y)). Replace each
    `raise NotImplementedError()` placeholder with your own code.

    Args:
        s1 (pd.Series): a pandas series
        s2 (pd.Series): a pandas series (the same index and length as s1)

    Returns:
        correlation (float): the correlation between s1 and s2

    Raises:
        NotImplementedError: while any placeholder below is still in place

    """
    
    # Step 1: get the variance of s1 and s2 (~2 lines)
    # var_s1 = ...
    # var_s2 = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Step 2: calculate the bottom half (denominator) of the correlation formula (~2 lines)
    # clue: to do the square root, use np.sqrt(var_s1_times_var_s2)
    # var_s1_times_var_s2 = ...
    # bottom_half_of_correlation_formula = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Step 3: finally, calculate the correlation
    # clue: remember, you've already implemented the top half (the covariance)
    # in exercise one, you can use that now
    # correlation = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    return correlation

In [None]:
# Reuses the `a` and `b` series defined in the earlier cells; the result is
# rounded to two decimals by the '%0.02f' format.
print('Correlation by hand between a and b: %0.02f' % pearson_correlation_by_hand(a, b))

Expected output:   

    Correlation by hand between a and b: 0.87

# Exercise 3: using the pandas versions 

In [None]:
def summarize_relationship(s1, s2): 
    """
    Summarize the relationship between two series using pandas (exercise scaffold).

    Replace each `raise NotImplementedError()` placeholder with a single
    pandas call, as described by the comment above it.

    Args:
        s1 (pd.Series): a pandas series
        s2 (pd.Series): a pandas series (the same index and length as s1)

    Returns:
        summary (dict): dictionary with the keys 'covariance', 'pearson'
            and 'spearman', holding the covariance, the Pearson correlation
            and the Spearman correlation between s1 and s2

    Raises:
        NotImplementedError: while any placeholder below is still in place

    """
    # using pandas, compute the covariance between s1 and s2 (1 line)
    # covariance = ...
    # YOUR CODE HERE
    raise NotImplementedError()

    # using pandas, compute the Pearson correlation between s1 and s2 (1 line)
    # pearson_corr = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # using pandas, compute the Spearman correlation between s1 and s2 (1 line)
    # spearman_corr = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # bundle the three statistics under fixed keys for the cells that follow
    return {'covariance': covariance, 'pearson': pearson_corr, 'spearman': spearman_corr}

In [None]:
# Sample data: `c` is just `b` rescaled by a constant factor, so we can see
# which of the statistics react to a change of scale.
a = pd.Series([1, 5, 7, 10, 25])
b = pd.Series([15, 30, 28, 45, 50])
c = b * 7

# Report the three statistics for (a, b) first and (a, c) second.
for name, series in (('b', b), ('c', c)):
    summary = summarize_relationship(a, series)
    print(
        'Between a and %s, the covariance is %0.0f, '
        'the pearson correlation is %0.02f and the spearman '
        'correlation is %0.02f'
        % (name, summary['covariance'], summary['pearson'], summary['spearman'])
    )

Expected output: 
   
    Between a and b, the covariance is 112, the pearson correlation is 0.87 and the spearman correlation is 0.90
    Between a and c, the covariance is 784, the pearson correlation is 0.87 and the spearman correlation is 0.90

In [None]:
# Checks for exercise 3: `a` and `c` come from the previous cell (c = b * 7).
# Each statistic is compared to its reference value with an absolute
# tolerance of 0.01.
summary_a_c = summarize_relationship(a, c)
assert math.isclose(summary_a_c['covariance'], 784.35, abs_tol=0.01)
assert math.isclose(summary_a_c['pearson'],  0.86626, abs_tol=0.01)
assert math.isclose(summary_a_c['spearman'],  0.9, abs_tol=0.01)

Side question (optional): can you remember why the correlation is the same, given that `c = b * 7`? 

# Exercise 4: time for some fun 

The 'affairs' dataset is a data science classic. It comes from a questionnaire about US marriages, and attempts to understand which factors are correlated with couples having affairs. 

Your job is to analyze this dataset using whatever tools you wish, and complete the following analysis. 

_Hint: the correlation matrix and the heatmap visualization might come in handy_

In [None]:
# This time you will have more freedom to play around.
# We've loaded the dataset for you, but from here on you are on your own!
# Your answers go into the `analysis` dict; the print cell that follows reads
# the four keys shown in the commented templates below, so keep them exactly
# as written. Replace each `raise NotImplementedError()` with your own code.
data = pd.read_csv('data/affairs.csv')
analysis = {}

# Explore the dataset
# YOUR CODE HERE
raise NotImplementedError()

# Complete the following questions:
# what is the name of the variable with the highest correlation with the number of affairs?
# analysis['highest_correlation_with_affair'] = ...
# YOUR CODE HERE
raise NotImplementedError()

# what is the name of the variable with the lowest (most negative) correlation with the number of affairs?
# analysis['lowest_correlation_with_affair'] = ...
# YOUR CODE HERE
raise NotImplementedError()

# what is the correlation between having a child and having an affair?
# (a number: it is printed with the '%0.02f' format in the next cell)
# analysis['corr_between_child_and_affair'] = ...
# YOUR CODE HERE
raise NotImplementedError()

# can you think of a confounding variable that might suggest children don't actually cause affairs?
# analysis['possible confounding variable'] = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Fill the report template with the four answers stored in `analysis`, looked
# up in the order the placeholders appear; a missing answer raises KeyError.
print(
    'The variable that correlates most positively with having an affair is '
    '"%s". \n\nThe variable "%s" seems to correlate negatively with the number of affairs, \nbut '
    'there is no way to establish a direction of causality. \n\nWe did find '
    'a correlation between the presence of a child and an affair of "%0.02f", '
    '\nhowever this might be partially or entirely explained '
    'by the presence confounding variable "%s"'
    % tuple(
        analysis[key]
        for key in (
            'highest_correlation_with_affair',
            'lowest_correlation_with_affair',
            'corr_between_child_and_affair',
            'possible confounding variable',
        )
    )
)

Expected output 

    No expected output on this one, see if it passes the test output ;) 

#### Test output (don't change code here) 

# If you have some free time (optional) 

If you have free time, try exploring the housing dataset from the learning notebook further. 

In [None]:
# Load the housing dataset for the optional free exploration; the path is
# relative to the notebook's working directory.
housing_data = pd.read_csv('data/HousingData.csv')