<div style="font-size:18pt; padding-top:20px; text-align:center">СЕМИНАР. <b>Корреляция и </b> <span style="font-weight:bold; color:green">NumPy/SciPy</span></div><hr>
<div style="text-align:right;">Папулин С.Ю. <span style="font-style: italic;font-weight: bold;">(papulin.study@yandex.ru)</span></div>

<a name="0"></a>
<div><span style="font-size:14pt; font-weight:bold">Содержание</span>
    <ol>
        <li><a href="#1">Корреляция Пирсона</a></li>
        <li><a href="#2">Примеры</a></li>
        <li><a href="#3">Источники</a></li>
    </ol>
</div>

<p><b>Подключение библиотек</b></p>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
%matplotlib inline

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">1. Корреляция Пирсона</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<p>Коэффициент корреляции Пирсона характеризует степень (силу и направление) линейной зависимости между двумя переменными</p>

$$\rho=\frac{cov(x,y)}{\sigma_x\sigma_y}$$


$$r=\frac{\sum_{k=1}^{N}(x_k - \mu_x)(y_k - \mu_y)}{\sqrt{\sum_{k=1}^{N}(x_k - \mu_x)^2\sum_{k=1}^{N}(y_k - \mu_y)^2}}$$

$$-1 \le r \le 1$$

<p>Матричная запись</p>

$$R_{i,j}=\frac{\sum_{k=1}^{N}(X_{k,i} - \mu_{x_i})(X_{k,j} - \mu_{x_j})}{\sqrt{\sum_{k=1}^{N}(X_{k,i} - \mu_{x_i})^2\sum_{k=1}^{N}(X_{k,j} - \mu_{x_j})^2}}$$

In [None]:
# Evenly spaced grid of 30 sample points on [-10, 10]; the common argument of the demo functions.
x = np.linspace(start=-10, stop=10, num=30)

In [None]:
def f1(x):
    """Return ``x`` squared (element-wise for NumPy arrays)."""
    return x**2


def f2(x):
    """Return ``0.2 * x**2``, i.e. ``x`` squared scaled by a positive constant."""
    return 0.2*x**2

In [None]:
# Two side-by-side panels comparing f1 and f2.
fig = plt.figure(1, figsize=(12, 4))

# Left panel: both functions drawn against the common argument x.
plt.subplot(1, 2, 1)
for func, legend_text in ((f1, "$f_1(x)$"), (f2, "$f_2(x)$")):
    plt.plot(x, func(x), "-o", label=legend_text)
plt.xlabel("x")
plt.ylabel("$f_1(x), f_2(x)$")
plt.legend()
plt.grid(True)

# Right panel: f2 plotted against f1; points on a straight line mean a linear relation.
plt.subplot(1, 2, 2)
plt.scatter(f1(x), f2(x))
plt.xlabel("$f_1(x)$")
plt.ylabel("$f_2(x)$")
plt.grid(True)

plt.show()

In [None]:
# f2 is f1 multiplied by a positive constant (0.2), so the two samples are exactly
# linearly related and the coefficient is expected to be 1 up to floating-point rounding.
pearson_coef, pvalue = stats.pearsonr(f1(x), f2(x))
pearson_coef

In [None]:
def f3(x):
    """Return ``-0.2 * x**2``, i.e. ``x`` squared scaled by a negative constant."""
    return -0.2*x**2

In [None]:
# Two side-by-side panels comparing f1 and f3.
fig = plt.figure(1, figsize=(12, 4))

# Left panel: both functions drawn against the common argument x.
plt.subplot(1, 2, 1)
for func, legend_text in ((f1, "$f_1(x)$"), (f3, "$f_3(x)$")):
    plt.plot(x, func(x), "-o", label=legend_text)
plt.xlabel("x")
plt.ylabel("$f_1(x), f_3(x)$")
plt.legend()
plt.grid(True)

# Right panel: f3 plotted against f1.
plt.subplot(1, 2, 2)
plt.scatter(f1(x), f3(x))
plt.xlabel("$f_1(x)$")
plt.ylabel("$f_3(x)$")
plt.grid(True)

plt.show()

In [None]:
# f3 is f1 multiplied by a negative constant (-0.2), so the relation is exactly linear
# with a negative slope; the coefficient is expected to be -1 up to rounding.
pearson_coef, pvalue = stats.pearsonr(f1(x), f3(x))
pearson_coef

In [None]:
def f4(x):
    """Return ``x**2 + 10*sin(x)`` (element-wise for NumPy arrays)."""
    return x**2 + 10*np.sin(x)


def f5(x):
    """Return ``-x + 0.1*x**3 + 10*cos(x)`` (element-wise for NumPy arrays)."""
    return -x + 0.1*x**3 + 10*np.cos(x)

In [None]:
# Two side-by-side panels comparing f4 and f5.
fig = plt.figure(1, figsize=(12, 4))

# Left panel: both functions drawn against the common argument x.
plt.subplot(1, 2, 1)
for func, legend_text in ((f4, "$f_4(x)$"), (f5, "$f_5(x)$")):
    plt.plot(x, func(x), "-o", label=legend_text)
plt.xlabel("x")
plt.ylabel("$f_4(x), f_5(x)$")
plt.legend()
plt.grid(True)

# Right panel: f5 plotted against f4.
plt.subplot(1, 2, 2)
plt.scatter(f4(x), f5(x))
plt.xlabel("$f_4(x)$")
plt.ylabel("$f_5(x)$")
plt.grid(True)

plt.show()

In [None]:
# Pearson coefficient (and its p-value) between the f4 and f5 samples taken on the grid x.
# Neither function is a scaled copy of the other, so the value is not known in advance.
pearson_coef, pvalue = stats.pearsonr(f4(x), f5(x))
pearson_coef

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">2. Примеры</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
# Input CSV files, given relative to the notebook's working directory.
BREND_FILE = "../data/BZ_010201_160207.csv"
GDP_FILE = "../data/GDP.csv"
USDRUB_FILE = "../data/USDRUB_010201_160207.csv"
# This constant used to repeat the USDRUB path, so the "EURRUB" series was a copy of
# USDRUB (and correlated with it at exactly 1).
# NOTE(review): file name assumed to follow the USDRUB naming pattern - confirm it exists.
EURRUB_FILE = "../data/EURRUB_010201_160207.csv"

<p>Загрузка данных</p>

In [None]:
# Load the oil price series: only file columns 2 and 7 are read, named "Date" and "Price".
# The first line of the file is skipped (presumably a header row - verify against the file).
# "Date" becomes the index and is parsed as dates.
df_brend = pd.read_csv(
    BREND_FILE,
    sep=";",
    header=None,
    skiprows=1,
    parse_dates=True,
    usecols=[2, 7],
    names=["Date", "Price"],
    index_col=0,
)
# Transposed only to get a compact, wide display.
df_brend.T

In [None]:
# Load the GDP series: file columns 0 and 1 are read as "Date" and "GDP", with no rows skipped.
df_gdp = pd.read_csv(
    GDP_FILE,
    sep=";",
    encoding="iso-8859-1",
    header=None,
    parse_dates=True,
    usecols=[0, 1],
    names=["Date", "GDP"],
    index_col=0,
)
# Keep only the calendar year of each date, then order the rows chronologically.
df_gdp.index = df_gdp.index.year
df_gdp = df_gdp.sort_index()
# Transposed only to get a compact, wide display.
df_gdp.T

In [None]:
# Load the USD/RUB series: only file columns 2 and 7 are read, named "Date" and "USDRUB".
# The first line of the file is skipped (presumably a header row - verify against the file).
df_usdrub = pd.read_csv(
    USDRUB_FILE,
    sep=";",
    header=None,
    skiprows=1,
    parse_dates=True,
    usecols=[2, 7],
    names=["Date", "USDRUB"],
    index_col=0,
)
# Transposed only to get a compact, wide display.
df_usdrub.T

In [None]:
# Load the series from EURRUB_FILE: only file columns 2 and 7 are read, named "Date" and "EURRUB".
# The first line of the file is skipped (presumably a header row - verify against the file).
df_eurrub = pd.read_csv(
    EURRUB_FILE,
    sep=";",
    header=None,
    skiprows=1,
    parse_dates=True,
    usecols=[2, 7],
    names=["Date", "EURRUB"],
    index_col=0,
)
# Transposed only to get a compact, wide display.
df_eurrub.T

In [None]:
# Register pandas' date converters with matplotlib so that the date-indexed frames
# can be passed straight to plt.plot in the cells that follow.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
# One entry per cell of the 2x2 grid, row by row:
# (panel title, frame to draw, whether every index value gets its own x tick).
panels = [
    ("Brend Oil Price by month", df_brend, False),
    ("Gross Domestic Product by year", df_gdp, True),
    ("USDRUB by month", df_usdrub, False),
    ("EURRUB by month", df_eurrub, False),
]

plt.figure(1, figsize=[14, 8])

for position, (title, frame, tick_every_value) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(title)
    plt.plot(frame.index, frame)
    if tick_every_value:
        # The GDP index holds plain years, so each year is labelled (vertically to fit).
        plt.xticks(frame.index, rotation="vertical")
    plt.grid(True)

plt.tight_layout()
plt.show()

<p>Вывод данных по году</p>

In [None]:
# Select the rows whose index falls in 2001 by passing the year as a string
# (relies on the index having been parsed as dates when the file was read).
df_brend.loc["2001"]

<p>Среднее значение за год</p>

In [None]:
# Column-wise mean over the rows selected for the year 2008.
df_brend.loc["2008"].mean()

<p>Средние значения с 2001 по 2016</p>

In [None]:
# Average the price within each calendar year: one output row per year found in the index.
brend_year_key = df_brend.index.year
df_brend__year = df_brend.groupby(brend_year_key).mean()
df_brend__year.T

In [None]:
# Average the USDRUB rate within each calendar year: one output row per year found in the index.
usdrub_year_key = df_usdrub.index.year
df_usdrub__year = df_usdrub.groupby(usdrub_year_key).mean()
df_usdrub__year.T

In [None]:
# Average the EURRUB rate within each calendar year: one output row per year found in the index.
eurrub_year_key = df_eurrub.index.year
df_eurrub__year = df_eurrub.groupby(eurrub_year_key).mean()
df_eurrub__year.T

In [None]:
# One entry per cell of the 2x2 grid, row by row:
# (panel title, yearly frame to draw, whether every index value gets its own x tick).
panels = [
    ("Brend Oil Price by year", df_brend__year, False),
    ("Gross Domestic Product by year", df_gdp, True),
    ("USDRUB by year", df_usdrub__year, False),
    ("EURRUB by year", df_eurrub__year, False),
]

plt.figure(1, figsize=[14, 8])

for position, (title, frame, tick_every_value) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(title)
    # "o-" marks every yearly value and joins the markers with a line.
    plt.plot(frame.index, frame, "o-")
    if tick_every_value:
        plt.xticks(frame.index, rotation="vertical")
    plt.grid(True)

plt.tight_layout()
plt.show()

<p>Выбор рассматриваемого диапазона</p>

In [None]:
# First and last calendar year (both inclusive) of the window analysed below.
st, end = 2002, 2012

In [None]:
# Cut every yearly series to the [st, end] window (label-based, both ends included)
# and collect them under the column names used in the rest of the analysis.
d = dict(
    Brend=df_brend__year.loc[st:end, "Price"],
    GDP=df_gdp.loc[st:end, "GDP"],
    USDRUB=df_usdrub__year.loc[st:end, "USDRUB"],
    EURRUB=df_eurrub__year.loc[st:end, "EURRUB"],
)

# The series are aligned on their year index when the frame is assembled.
df_all = pd.DataFrame(d)
df_all

<p>Корреляция</p>

In [None]:
# pandas: pairwise correlation matrix of the columns of df_all.
# `method` accepts "pearson", "kendall" or "spearman".
df_corr = df_all.corr(method="pearson")
df_corr

In [None]:
# NumPy: correlation matrix of the columns of df_all.
# rowvar=False tells corrcoef that each column (not each row) is a variable.
np.corrcoef(df_all, rowvar=False)

In [None]:
# SciPy: Pearson coefficient (with its p-value) for one pair of columns, GDP and Brend.
pearson_coef, pvalue = stats.pearsonr(df_all["GDP"], df_all["Brend"])
pearson_coef

In [None]:
# Heat map of the correlation matrix with one labelled tick per column of df_all.
labels = df_all.columns.to_list()

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
cax = ax.matshow(df_all.corr())
fig.colorbar(cax)

# Pin one tick to the centre of every matrix cell before naming the ticks.
# Setting tick labels without fixing the tick positions (the former `[""] + labels`
# trick) depends on where the automatic locator happens to put its ticks, which
# triggers a matplotlib warning and can attach names to the wrong cells.
tick_positions = np.arange(len(labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

plt.show()

In [None]:
# Three panels in one row: each series over the years, then one series against the other.
# Every entry is (panel title, x values, y values, matplotlib format string).
panels = [
    ("Brend Oil Price by year", df_all.index, df_all.Brend, "o-"),
    ("Gross Domestic Product by year", df_all.index, df_all.GDP, "o-"),
    ("Brend vs GDP", df_all.Brend, df_all.GDP, "o"),
]

plt.figure(1, figsize=[15, 4])

for position, (title, xs, ys, fmt) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(title)
    plt.plot(xs, ys, fmt)
    plt.grid(True)

plt.show()

In [None]:
# Three panels in one row: each series over the years, then one series against the other.
# Every entry is (panel title, x values, y values, matplotlib format string).
panels = [
    ("Brend Oil Price by year", df_all.index, df_all.Brend, "o-"),
    ("EURRUB by year", df_all.index, df_all.EURRUB, "o-"),
    ("Brend vs EURRUB", df_all.Brend, df_all.EURRUB, "o"),
]

plt.figure(1, figsize=[15, 4])

for position, (title, xs, ys, fmt) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(title)
    plt.plot(xs, ys, fmt)
    plt.grid(True)

plt.show()

In [None]:
# Three panels in one row: each series over the years, then one series against the other.
# Every entry is (panel title, x values, y values, matplotlib format string).
panels = [
    ("Brend Oil Price by year", df_all.index, df_all.Brend, "o-"),
    ("USDRUB by year", df_all.index, df_all.USDRUB, "o-"),
    ("Brend vs USDRUB", df_all.Brend, df_all.USDRUB, "o"),
]

plt.figure(1, figsize=[15, 4])

for position, (title, xs, ys, fmt) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(title)
    plt.plot(xs, ys, fmt)
    plt.grid(True)

plt.show()

In [None]:
# Three panels in one row: each series over the years, then one series against the other.
# Every entry is (panel title, x values, y values, matplotlib format string).
panels = [
    ("Gross Domestic Product by year", df_all.index, df_all.GDP, "o-"),
    ("USDRUB by year", df_all.index, df_all.USDRUB, "o-"),
    ("GDP vs USDRUB", df_all.GDP, df_all.USDRUB, "o"),
]

plt.figure(1, figsize=[15, 4])

for position, (title, xs, ys, fmt) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(title)
    plt.plot(xs, ys, fmt)
    if position == len(panels):
        # Only the last panel has GDP values on its x axis; its tick labels are turned vertical.
        plt.xticks(rotation='vertical')
    plt.grid(True)

plt.show()

In [None]:
# Stand-alone GDP chart over every year present in df_gdp.
gdp_years = df_gdp.index
plt.plot(gdp_years, df_gdp, "o-")
plt.title("GDP(Year) in Russia", size=18)
plt.xlabel("Year", size=14)
plt.ylabel("GDP, BN RUB", size=14)
# One vertical tick label per year.
plt.xticks(gdp_years, rotation="vertical")
plt.grid(True)
plt.show()

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">3. Источники</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<a href="http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.corr.html">pandas.DataFrame.corr</a><br>
<a href="http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.corrcoef.html">numpy.corrcoef</a><br>
<a href="http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html">scipy.stats.pearsonr</a><br>

<a href="https://msdn.microsoft.com/ru-ru/library/azure/dn905819.aspx">Вычисления линейной корреляции</a><br>