In [2]:
import numpy as np
import pandas as pd

%precision 3

'%.3f'

In [3]:
# Load the score table; the 'student number' column becomes the row index.
df = pd.read_csv(
    'Data/ch2_scores_em.csv',
    index_col='student number',
)

In [4]:
# Keep only the first ten rows so the worked examples stay small.
en_scores = np.array(df['english'].iloc[:10])
ma_scores = np.array(df['mathematics'].iloc[:10])

# Re-label those ten rows 'A'..'J' under an index named 'student'.
scores_df = pd.DataFrame(
    {'english': en_scores, 'mathematics': ma_scores},
    index=pd.Index(list('ABCDEFGHIJ'), name='student'),
)

scores_df

Unnamed: 0_level_0,english,mathematics
student,Unnamed: 1_level_1,Unnamed: 2_level_1
A,42,65
B,69,80
C,56,63
D,41,63
E,57,76
F,48,60
G,65,81
H,49,66
I,65,78
J,58,82


In [None]:
# Covariance = the mean of the products of the two variables' deviations.
# Variance: the mean of one variable's squared deviations; never negative.
# Covariance: the mean of (x deviation * y deviation); it can be negative.

summary_df = scores_df.copy()
# Deviation of every score from its own subject mean (x - x_bar, y - y_bar).
for subject in ('english', 'mathematics'):
    summary_df[f'{subject}_deviation'] = (
        summary_df[subject] - summary_df[subject].mean()
    )
# Row-wise product of the two deviations; averaging this column gives the covariance.
summary_df['product of deviations'] = summary_df['english_deviation'].mul(
    summary_df['mathematics_deviation']
)

summary_df

In [None]:
# Covariance matrix of the two score arrays (indexed by row and column, one per
# variable). ddof=0 is passed so the normalisation matches np.var(..., ddof=0).
cov_mat = np.cov(np.stack([en_scores, ma_scores]), ddof=0)
cov_mat

In [None]:
# The diagonal of the covariance matrix holds the variances:
# element [0, 0] is the variance of English, [1, 1] the variance of mathematics.
cov_mat[0][0], cov_mat[1][1]

In [None]:
# Cross-check: the variances can also be computed directly with the same ddof=0.
en_scores.var(ddof=0), ma_scores.var(ddof=0)

In [None]:

# The two off-diagonal elements both hold the English/mathematics covariance.
cov_mat[0][1], cov_mat[1][0]

In [None]:
# Correlation coefficient: the covariance divided by the product of the two
# standard deviations, which makes it independent of the variables' units.

np.cov(en_scores, ma_scores, ddof=0)[0, 1] / (
    en_scores.std() * ma_scores.std()
)

In [None]:

# Correlation matrix of the two score arrays, computed by NumPy.
np.corrcoef(x=en_scores, y=ma_scores)

In [None]:
# Same correlation matrix via pandas; 'pearson' is the default method, spelled out here.
scores_df.corr(method='pearson')

In [None]:
# 산점도

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Use every row of df here, not just the first-ten slice taken earlier.
english_scores = df['english'].to_numpy(copy=True)
math_scores = df['mathematics'].to_numpy(copy=True)

fig, ax = plt.subplots(figsize=(8, 8))

# Scatter plot: English on the x axis, mathematics on the y axis.
ax.scatter(english_scores, math_scores)
ax.set(xlabel='english', ylabel='mathematics')

plt.show()

In [None]:
# Regression line

# Least-squares fit of a degree-1 polynomial. np.polyfit returns coefficients
# highest power first, so poly_fit[0] is the slope (Beta_1) and poly_fit[1]
# is the intercept (Beta_0).
poly_fit = np.polyfit(english_scores, math_scores, 1)
# Callable that returns Beta_0 + Beta_1 * x.
poly_1d = np.poly1d(poly_fit)
# x coordinates spanning the observed English scores, used to draw the line.
xs = np.linspace(english_scores.min(), english_scores.max())
# y coordinates of the fitted line at xs.
ys = poly_1d(xs)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.set_xlabel('english')
ax.set_ylabel('mathematics')
ax.scatter(english_scores, math_scores, label='score')
# The '+' format flag always prints the slope's sign, so a negative slope is
# rendered as 'a-bx' rather than the malformed 'a+-bx'; positive slopes still
# read 'a+bx' exactly as before.
ax.plot(xs, ys, color='gray',
        label=f'{poly_fit[1]:.2f}{poly_fit[0]:+.2f}x')

# Show the legend in the upper-left corner.
ax.legend(loc='upper left')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# 2-D histogram: 9 bins over 35-80 on the English axis,
# 8 bins over 55-95 on the mathematics axis.
c = ax.hist2d(english_scores, math_scores,
              bins=[9, 8], range=[(35, 80), (55, 95)])
# Elements 1 and 2 of the returned tuple are used as tick positions;
# element 3 is handed to the colour bar.
x_edges, y_edges, mesh = c[1], c[2], c[3]
ax.set(xlabel='english', ylabel='mathematics')
ax.set_xticks(x_edges)
ax.set_yticks(y_edges)
# Display the colour bar.
fig.colorbar(mesh, ax=ax)

plt.show()

In [None]:
# Anscombe's quartet

# Load the NumPy array stored in .npy format.
# NOTE(review): the original path starts with what looks like a project
# directory prefix, which is inconsistent with the 'Data/...' path used to load
# the CSV at the top of the notebook. Try the original path first so existing
# setups are unaffected, then fall back to the 'Data/' location.
try:
    anscombe_data = np.load('202410821/StatisticalAnalysis/Data/ch3_anscombe.npy')
except FileNotFoundError:
    anscombe_data = np.load('Data/ch3_anscombe.npy')
print(anscombe_data.shape)
anscombe_data[0]

In [None]:
# Summary statistics for each dataset in anscombe_data, formatted to two decimals.
# NOTE(review): the row labels keep the original 'varience' spelling because
# they are runtime keys that other code may look up; rename them to 'variance'
# once it is confirmed nothing depends on the current labels.
stats_df = pd.DataFrame(index=['X_mean', 'X_varience',
                               'Y_mean', 'Y_varience',
                               'X&Y_correlation', 'X&Y_regression'])
for i, data in enumerate(anscombe_data):
    # Column 0 is treated as x and column 1 as y.
    dataX = data[:, 0]
    dataY = data[:, 1]
    # Degree-1 fit: poly_fit[0] is the slope, poly_fit[1] the intercept.
    poly_fit = np.polyfit(dataX, dataY, 1)
    stats_df[f'data{i+1}'] =\
        [f'{np.mean(dataX):.2f}',
         f'{np.var(dataX):.2f}',
         f'{np.mean(dataY):.2f}',
         f'{np.var(dataY):.2f}',
         f'{np.corrcoef(dataX, dataY)[0, 1]:.2f}',
         # '+' flag prints the slope's own sign, so a negative slope reads
         # 'a-bx' instead of 'a+-bx'; positive slopes are unchanged.
         f'{poly_fit[1]:.2f}{poly_fit[0]:+.2f}x']

stats_df

In [None]:
# Create a 2 x 2 grid of axes that share both the x and y scales.
fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

xs = np.linspace(0, 30, 100)
for i, data in enumerate(anscombe_data):
    x_vals, y_vals = data[:, 0], data[:, 1]
    # Fit a straight line to this dataset and evaluate it on xs.
    poly_fit = np.polyfit(x_vals, y_vals, 1)
    poly_1d = np.poly1d(poly_fit)
    ys = poly_1d(xs)
    # Select the panel for this dataset (row-major position i in the grid).
    ax = axes.flat[i]
    # Fix the axis limits and give the panel its title.
    ax.set(xlim=(4, 20), ylim=(3, 13), title=f'data{i+1}')
    ax.scatter(x_vals, y_vals)
    ax.plot(xs, ys, color='gray')

# Tighten the spacing between the panels.
plt.tight_layout()

plt.show()