# SLU 05 - Covariance and Correlation: Exercise notebook

In [None]:
import pandas as pd 
import numpy as np 
import math 
import utils 

from IPython.display import Image
from matplotlib import pyplot as plt
from utils import get_house_prices_and_rooms, plot_house_prices_and_rooms
from utils import get_car_prices_year, get_car_prices_mileage, plot_car_prices_and_mileage,\
                check_currency_change, plot_some_correlations, plot_correlation_bargraph

# this is for grading without showing the answers 
import hashlib
def hash_answer(answer):
    """Return the SHA-256 hex digest of ``str(answer)``.

    The grading cells compare this digest with a stored one, so the expected
    answer never appears in plain text in the notebook.
    """
    text = str(answer)
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()

In this notebook, you will practice the following: 

    - Covariance 
    - Pearson correlation
    - Spearman correlation
    - Correlation matrix
    - Spurious correlations

----

# Exercise 1:  covariance and correlation with Pandas
Here you will apply what you learned in the Learning Notebook, calculating covariance and correlation on a sample dataset.

We're going to use a dataset of used car values in the USA for this exercise. Let's begin by taking a quick look at the dataset:

In [None]:
# Load the used-car listings, using the CSV's "index" column as the row index
data = pd.read_csv('data/USA_cars_datasets.csv', index_col="index")
# Last expression of the cell: displays the first rows of the DataFrame
data.head()

We'll begin by checking if the car's year of make is related to its price listing.

Edit the function below so that when given the two features, it returns their ***covariance***, ***Pearson correlation*** and ***Spearman correlation*** (in this order).

In [None]:
def check_if_related(prices, year):
    """Exercise stub: measure how two features are related.

    Once implemented, this should return the covariance, the Pearson
    correlation and the Spearman correlation of the two inputs, in that order.

    Args:
        prices: first feature (presumably a pandas Series of car prices;
            confirm against utils.get_car_prices_year).
        year: second feature (presumably a pandas Series with each car's
            year of make; confirm against utils.get_car_prices_year).

    Raises:
        NotImplementedError: always, until the placeholder below is replaced.
    """
    # covariance = ...
    # pearson_corr = ...
    # spearman_corr = ...
    
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Only reachable once the raise above has been replaced by code that
    # defines the three names returned here.
    return covariance, pearson_corr, spearman_corr

In [None]:
prices, year = get_car_prices_year()

# Run the function under test once and reuse its result for every check,
# instead of recomputing all three statistics for each assertion.
related_stats = check_if_related(prices, year)
assert math.isclose(related_stats[0], 17446.83, abs_tol=0.01), "The covariance seems to be wrong."
assert math.isclose(related_stats[1], 0.4182, abs_tol=0.0001), "The Pearson correlation seems to be wrong."
assert math.isclose(related_stats[2], 0.4768, abs_tol=0.0001), "The Spearman correlation seems to be wrong."
print("Well done! Everything seems in order! The correlation between price and year does not seem very significant.")

# Exercise 2: Using other units of measurement
Now for a thought exercise. These car prices are in USD. Let's assume we want to know the prices in EUR.

The exchange rate at the time of writing is 1.00 USD = 0.85 EUR, meaning that the EUR is a **larger** unit than the one in this dataset.

If we extract the covariance and Pearson/Spearman correlations again, but this time in EUR, which of these statements is true?
>**A.** The covariance, Pearson correlation and Spearman correlation will decrease.
>
>**B.** The covariance will decrease, but the Pearson correlation and Spearman correlation will increase.
>
>**C.** The covariance will remain the same, but the Pearson correlation and Spearman correlation will both decrease.
>
>**D.** The covariance will decrease, but Pearson correlation and Spearman correlation will remain the same.

Write the letter corresponding to your chosen answer as a text string into the variable ***ex2_answer*** below.

In [None]:
# Replace the placeholder below with an assignment that follows the commented
# template; the grading cell lower-cases the letter before checking it.
# ex2_answer = "Z"
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# The expected letter is stored only as a SHA-256 digest; the submitted answer
# is lower-cased before hashing, so "D" and "d" are treated alike.
assert hash_answer(ex2_answer.lower()) == "18ac3e7343f016890c510e93f935261169d9e3f565436429830faf0934f4f8e4", "Wrong choice. Remember that correlation does not depend on units."
print("Good job!")
# Helper from utils; presumably shows the effect of the currency change on
# these statistics (confirm in utils).
check_currency_change(prices, year)

# Exercise 3: Predict correlation "by eye"

Let's take a look at a plot representing a correlation between three pairs of features. Assume we don't know anything else about the dataset besides this correlation graph.

In [None]:
# Helper from utils: draws the plot that the question below refers to
plot_some_correlations()

What can we infer from this graph?

>**A.** The orange series has the highest correlation.
>
>**B.** The blue series has the lowest correlation.
>
>**C.** All three series' correlations are positive.
>
>**D.** Nothing can be inferred from just the graph.

Write the letter corresponding to your chosen answer as a text string into the variable ***ex3_answer*** below.

In [None]:
# Replace the placeholder below with an assignment that follows the commented
# template; the grading cell lower-cases the letter before checking it.
# ex3_answer = "Z"
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# The expected letter is stored only as a SHA-256 digest; the submitted answer
# is lower-cased before hashing.
assert hash_answer(ex3_answer.lower()) == "2e7d2c03a9507ae265ecf5b5356885a53393a2029d241394997265a1a25aefc6", "Wrong choice. Remember that you can't infer the amount of correlation from the degree of the slope, it can only help with estimating positive and negative correlations."
print("Nice!")
# Draw the same plot again, this time asking the helper for a legend
plot_some_correlations(legend=True)

----

# Exercise 4: Back to the cars - enter an outlier 
We are going back to analyze some car prices. 

In one case, there will be no outliers. In the other, there will be a single outlier (maybe someone famous drove one of them). 

In [None]:
# Helper from utils; presumably plots price against mileage for both cases
# described above (with and without the outlier) — confirm in utils.
plot_car_prices_and_mileage()

# 4.1: Compare methods
Adjust the function below to calculate the correlations between the **mileage** of the car and the listed **price**. Have it return the Pearson correlation and the Spearman correlation, in that order.

In [None]:
def calculate_correlations_with_pandas(price, mileage):
    """Exercise stub: correlate a car's listed price with its mileage.

    Once implemented, this should return the Pearson correlation and the
    Spearman correlation between the two inputs, in that order.

    Args:
        price: listed prices (presumably a pandas Series; confirm against
            utils.get_car_prices_mileage).
        mileage: mileage of the same cars (presumably a pandas Series;
            confirm against utils.get_car_prices_mileage).

    Raises:
        NotImplementedError: always, until the placeholder below is replaced.
    """
    
    # pearson_corr = ...
    # spearman_corr = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Only reachable once the raise above has been replaced by code that
    # defines the two names returned here.
    return pearson_corr, spearman_corr

In [None]:
# One mileage series and two price series: one without and one with outliers
mileage, car_prices_normal, car_prices_with_outliers = get_car_prices_mileage()

# Correlations against the same mileage, first for the prices without outliers...
pearson_corr_normal, spearman_corr_normal = calculate_correlations_with_pandas(car_prices_normal, 
                                                                               mileage)
# ...then for the prices that include the outlier
pearson_corr_outlier, spearman_corr_outlier = calculate_correlations_with_pandas(car_prices_with_outliers, 
                                                                                 mileage)

# quick plot to see what happens
plot_correlation_bargraph(pearson_corr_normal, pearson_corr_outlier, 
                          spearman_corr_normal, spearman_corr_outlier )

In [None]:
# Pearson is expected to move by about 0.118 (within 0.01) between the two
# datasets, while Spearman should change by no more than 0.01.
assert math.isclose(abs(pearson_corr_normal - pearson_corr_outlier), 0.118, abs_tol=.01), "The Pearson correlation seems to be off. You should see some change between the two datasets."
assert math.isclose(abs(spearman_corr_normal - spearman_corr_outlier), 0, abs_tol=.01), "The Spearman correlation seems to be off. You should see almost no change between the two datasets."
print("So far, so good!")

# 4.2: Choose best method

So, unsurprisingly, the more miles a car has, the lower the cost: a negative correlation. But, as you can see, having outliers may hugely affect your analysis. Then, when dealing with a dataset **with outliers**, which correlation methodology should you use?

>**A.** Pearson.
>
>**B.** Spearman.

Write the letter corresponding to your chosen answer as a text string into the variable ***ex4_answer*** below.

In [None]:
# Replace the placeholder below with an assignment that follows the commented
# template; the grading cell lower-cases the letter before checking it.
# ex4_answer = "Z"
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# The expected letter is stored only as a SHA-256 digest; the submitted answer
# is lower-cased before hashing.
assert hash_answer(ex4_answer.lower()) == "3e23e8160039594a33894f6564e1b1348bbd7a0088d42c4acb73eeaed59c009d", "Wrong choice. Look at the changes in the correlations in the bar graphs."
print("Nice! Now you can avoid the data disruption caused by famous people's cars!")

# Exercise 5: Stocks! 

Time to predict the stock market and get rich.

Explore the dataset provided below:

- Hint 1: you can use `display(<name of df>)` to force it to pretty print.
- Hint 2: use the heatmap of the correlation matrix that we used in the learning notebooks.
- Hint 3: you may want to import something to help with the visualization.
- Hint 4: you can either paste the answers or use a purely programmatic solution.
- Hint 5: when we say lowest and highest we mean the signed value, not the magnitude (e.g. -0.9 is lower than 0.1).

In [None]:
# Load the stock price data that the questions below are about
stocks = pd.read_csv('data/Stock_prices.csv')

# Explore the data here (for example with a correlation matrix or heatmap),
# replacing the placeholder below.
# YOUR CODE HERE
raise NotImplementedError()

Now answer the following questions:
> __Q1__: What is the pair of stocks with the most negative Pearson correlation?  
>
> __Q2__: What is the stock with the most negative Spearman correlation with Ulta Beauty? 
>
> __Q3__: What is the Pearson correlation between QUALCOMM Inc. and Marathon Oil Corp.? 
>
> __Q4__: Observe the top 4 Pearson correlation pairs, and then look at the general correlation matrix. Is there a confounding variable that may explain why these companies are so correlated?

In [None]:
# Complete the following questions
# (replace each placeholder with an assignment that follows the commented template)

# Q1: What is the pair of stocks with the most negative Pearson correlation? 
# (pass the answer as a list of two strings; you can just type it in, no fancy Pandas needed)
# stock_pair_with_lowest_pearson_corr = ... 
# YOUR CODE HERE
raise NotImplementedError()

# Q2: What is the stock with the most negative Spearman correlation with Ulta Beauty? 
# (the answer is a string)
# lowest_rank_spearman_corr_with_ulta_beauty = ... 
# YOUR CODE HERE
raise NotImplementedError()

# Q3: What is the Pearson correlation between QUALCOMM Inc. and Marathon Oil Corp.? 
# (the answer is a float; use two decimal places)
# pearson_corr_between_qualcomm_and_marathon_oil = ... 
# YOUR CODE HERE
raise NotImplementedError()

# Q4: Observe the top 4 Pearson correlation pairs, and then look at the general correlation matrix. 
# Is there a confounding variable that may explain why these companies are so correlated? 
# (the answer is a string)
# possible_confounding_variable = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Q1: the pair must hold exactly two names; without this check any extra
# elements in the list would be silently ignored by the comparisons below.
assert len(stock_pair_with_lowest_pearson_corr) == 2, "The answer to Q1 should be a list with exactly two stocks."
# Sort once so the order in which the two stocks were given does not matter,
# then lower-case each name before hashing.
sorted_pair = [name.lower() for name in sorted(stock_pair_with_lowest_pearson_corr)]
assert hash_answer(sorted_pair[0]) == "6b075863de5a5c233441c451b63db6542c55ccabe6aca117f3262effbbbbcb68", "That is not the pair with the lowest pearson correlation."
assert hash_answer(sorted_pair[1]) == "252d948a7b16b67a910f2450a81e78760335c0128c05b53af2060de73bb4dc37", "That is not the pair with the lowest pearson correlation."

# Q2-Q4: text answers are lower-cased and compared by digest; the numeric
# answer is accepted within 0.01 of the expected value.
assert hash_answer(lowest_rank_spearman_corr_with_ulta_beauty.lower()) == "6b075863de5a5c233441c451b63db6542c55ccabe6aca117f3262effbbbbcb68", "Wrong lowest spearman correlation with Ulta Beauty."
assert math.isclose(pearson_corr_between_qualcomm_and_marathon_oil, 0.8918, abs_tol=0.01), "Wrong pearson correlation value between QUALCOMM Inc. and Marathon Oil Corp."
assert hash_answer(possible_confounding_variable.lower()) == "cb9b59e3f375975dd68cad5a89f1c080d42a070fee6f7b1ee191cdc63e54b366", "Look at what product those 4 companies deal in."
print("Well done, you may not go bankrupt yet! No promises on getting rich though.")

----

# Exercise 6: lots of stocks
You were hired by a hedge fund, because money. 

On the first day, your boss, Greedy McRiskyface, asked you to select one stock pair. He wants the two stocks to be as uncorrelated as possible (use Pearson), so that he can short one and long the other. 

The answer should be (1) the two stocks, as a list and (2) their Pearson correlation, as a float. 

In [None]:
# Load the stock data for this exercise through the course helper module
stock_data = utils.get_stocks_data_2()

In [None]:
# Define both variables, replacing the placeholder below: the pair is a list
# of two stocks and the value is their Pearson correlation as a float.
# most_uncorrelated_pair = ... 
# most_uncorrelated_pair_value = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Only the shape and types of the answer are checked here; every assertion
# carries a hint so a failure explains what is expected.
assert isinstance(most_uncorrelated_pair, list), "The stock pair should be passed as a list."
assert len(most_uncorrelated_pair) == 2, "The stock pair should contain exactly two stocks."
assert isinstance(most_uncorrelated_pair_value, float), "The Pearson correlation of the pair should be a float."
utils.dirty_little_secret()