# 통계분석

## 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

data=pd.read_csv('D:\Machine Learning\ADP-master\data/bike_marketing.csv')

# type별로 컬럼 분류
col_int = ['company_num', 'marketing_total', 'employees']
col_float = ['google_adwords', 'facebook', 'twitter', 'revenues']
col_cat = ['pop_density']

col_num = col_int + col_float
data[col_cat]= data[col_cat].astype('str')
data[col_int] = data[col_int].astype('int', errors='ignore')
data[col_float] = data[col_float].astype('float')

data

Unnamed: 0,company_num,google_adwords,facebook,twitter,marketing_total,revenues,employees,pop_density
0,1,65.66,47.86,52.46,166,39.26,5,High
1,2,39.10,55.20,77.40,172,38.90,7,Medium
2,3,174.81,52.01,68.01,295,49.51,11,Medium
3,4,34.36,61.96,86.86,183,40.56,7,High
4,5,78.21,40.91,30.41,150,40.21,9,Low
...,...,...,...,...,...,...,...,...
177,178,10000.00,23.72,12.32,344,46.02,9,Low
178,179,25000.00,24.26,34.36,102,39.16,6,Medium
179,180,59000.00,53.96,17.46,136,43.96,9,High
180,181,4000.00,28.18,22.88,148,42.38,8,Low


## 기술통계량

In [2]:
def DA(data):
    da = data.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])
    da = da.T
    df1 = data.isna().sum() # 결측값
    df1.name = 'missing'
    df2 = data.median() # 중앙값
    df2.name = 'median'
    df3 = np.var(data) # 분산
    df3.name = 'variance'
    df4 = data.skew() # 왜도
    df4.name = 'skewness'
    df5 = data.kurtosis() # 첨도
    df5.name = 'kurtosis'
    da = pd.concat([da,df1,df2,df3,df4,df5], axis=1) # 모두 합침
    da['total'] = da['count'] + da['missing'] # 전체 데이터 수
    col_nm = da.columns.tolist()
    order = ['total','count','missing','mean','median','std','variance','skewness','kurtosis','min',
             '5%','10%','25%','50%','75%','90%','95%','max']
    col_nm_new=[]
    for i in order:
        col_nm_new.append(i)
    da = da[col_nm_new]
    da = da.round(2) 
    return da

DA(data[col_num])

Unnamed: 0,total,count,missing,mean,median,std,variance,skewness,kurtosis,min,5%,10%,25%,50%,75%,90%,95%,max
company_num,182.0,182.0,0,91.5,91.5,52.68,2760.25,0.0,-1.2,1.0,10.05,19.1,46.25,91.5,136.75,163.9,172.95,182.0
marketing_total,182.0,182.0,0,244.03,249.0,96.57,9275.3,0.06,-0.94,54.0,101.05,118.1,158.25,249.0,323.75,357.0,399.95,481.0
employees,182.0,182.0,0,7.92,8.0,2.37,5.57,-0.24,-0.67,3.0,4.0,4.1,6.0,8.0,10.0,10.9,12.0,12.0
google_adwords,182.0,182.0,0,1081.41,185.69,6495.36,41957859.32,8.42,72.79,23.65,35.16,45.01,98.25,185.69,252.47,306.7,1156.05,60000.0
facebook,182.0,182.0,0,34.01,33.66,15.23,230.68,0.04,-1.31,8.0,11.52,13.91,19.53,33.66,47.86,53.96,56.65,62.17
twitter,182.0,182.0,0,38.93,34.34,22.06,484.13,0.91,0.68,5.89,12.32,14.48,21.04,34.34,52.78,70.01,77.83,122.19
revenues,182.0,182.0,0,44.72,44.09,5.82,33.65,0.31,-0.5,30.45,36.59,38.01,40.39,44.09,48.91,53.06,54.83,58.38


In [3]:
da_cat = pd.DataFrame()
for i in  col_cat:
    a = data[i].value_counts(dropna =False).to_frame().sort_index().rename(columns={i:'count'}).reset_index()
    a['col_nm'] = i
    a = a.rename(columns = {'index':'class'})
    a = a[['col_nm','class','count']]
    b = data[i].value_counts(dropna =False, normalize = True).to_frame().sort_index().rename(columns={i:'ratio'}).reset_index()
    b = b['ratio'].to_frame()
    a = pd.concat([a,b],axis=1)
    da_cat = pd.concat([da_cat,a], axis = 0)
da_cat = da_cat.reset_index(drop=True)
da_cat

Unnamed: 0,col_nm,class,count,ratio
0,pop_density,High,54,0.296703
1,pop_density,Low,72,0.395604
2,pop_density,Medium,56,0.307692


## 상관분석

In [4]:
data.corr()

Unnamed: 0,company_num,google_adwords,facebook,twitter,marketing_total,revenues,employees
company_num,1.0,0.237863,-0.113343,-0.10733,-0.011841,-0.032752,0.032857
google_adwords,0.237863,1.0,0.096509,0.049002,-0.005958,0.060903,0.066488
facebook,-0.113343,0.096509,1.0,0.353226,0.312744,0.586599,0.429439
twitter,-0.10733,0.049002,0.353226,1.0,0.386476,0.284592,0.229192
marketing_total,-0.011841,-0.005958,0.312744,0.386476,1.0,0.850996,0.715197
revenues,-0.032752,0.060903,0.586599,0.284592,0.850996,1.0,0.772302
employees,0.032857,0.066488,0.429439,0.229192,0.715197,0.772302,1.0


In [5]:
import scipy.stats as spst
print('스피어만 상관계수: ', spst.spearmanr(data['twitter'], data['revenues']).correlation) 
print('켄달의 타우: ', spst.kendalltau(data['twitter'], data['revenues']).correlation)
print('상관계수의 통계적 검증: \n', spst.pearsonr(data['twitter'], data['revenues'])) #상관계수, p-value 출력

스피어만 상관계수:  0.244715787420965
켄달의 타우:  0.16536007414758183
상관계수의 통계적 검증: 
 (0.28459186902273936, 9.871568693501685e-05)


## t-test

In [6]:
import scipy.stats as spst

dat_M = [117, 108, 105, 89, 101, 93, 96, 108, 108, 94, 93, 112, 92, 91, 100, 96, 120, 86, 96, 95]
dat_F = [121, 101, 102, 114, 103, 105, 101, 131, 96, 109, 109, 113, 115, 94, 108, 96, 110, 112, 120, 100]
dat_K = [100, 102, 103, 90, 88, 102, 100, 120, 94, 100, 124, 184, 89, 94, 100, 95, 120, 140, 137, 102]

m = spst.ttest_ind(dat_M, dat_F, equal_var=False) #equal_var : 등분산여부
t=m.statistic
print('독립표본 t-test:', spst.ttest_ind(dat_M, dat_F, equal_var=False))
print('Cohens d:', abs(t) / np.sqrt((len(dat_M)+len(dat_F)-2)))
print('Pearsons R:', np.sqrt((t**2) / ((t**2) + (len(dat_M)+len(dat_F)-2))))
print('대응표본 t-test:', spst.ttest_rel(dat_M, dat_F))
print('ANOVA: ', spst.f_oneway(dat_M, dat_F, dat_K))

독립표본 t-test: Ttest_indResult(statistic=-2.670573872669349, pvalue=0.01108318824471652)
Cohens d: 0.4332242888591059
Pearsons R: 0.39752319599996255
대응표본 t-test: Ttest_relResult(statistic=-2.9868874599588247, pvalue=0.007578486289181322)
ANOVA:  F_onewayResult(statistic=2.0965208952151277, pvalue=0.13225898896377966)


## 카이제곱 검정

In [7]:
Data=pd.crosstab(data['twitter'], data['facebook'])
chi_result=spst.chi2_contingency(Data)
chi_result #카이제곱통계량, p-value, 자유도, 기대빈도분할표

(30394.000000000007,
 2.526580124903946e-14,
 28557,
 array([[0.00549451, 0.00549451, 0.00549451, ..., 0.00549451, 0.00549451,
         0.00549451],
        [0.00549451, 0.00549451, 0.00549451, ..., 0.00549451, 0.00549451,
         0.00549451],
        [0.00549451, 0.00549451, 0.00549451, ..., 0.00549451, 0.00549451,
         0.00549451],
        ...,
        [0.00549451, 0.00549451, 0.00549451, ..., 0.00549451, 0.00549451,
         0.00549451],
        [0.00549451, 0.00549451, 0.00549451, ..., 0.00549451, 0.00549451,
         0.00549451],
        [0.00549451, 0.00549451, 0.00549451, ..., 0.00549451, 0.00549451,
         0.00549451]]))

In [8]:
data=np.array([[8,12], [1,5]])
data=pd.DataFrame(data, columns=['Atlantic', 'Indian'])
data.index=['whales', 'sharks']
oddsratio, pvalue=spst.fisher_exact(data) #피셔의 정확성 검정 (카이제곱 대신)
print('oddsratio: {0}, pvalue: {1}'.format (oddsratio, pvalue))

oddsratio: 3.3333333333333335, pvalue: 0.37975068409851087


In [9]:
data=pd.DataFrame([[40,6], [2,11]], columns=['After: Good', 'After: Bad'], index=['Before: Good', 'Before: Bad'])
data=np.array(data)
from statsmodels.stats.contingency_tables import mcnemar #맥네마의 검정 : 같은 샘플의 전후 상태를 교차표로 만들어 검정 
result=mcnemar(data, exact=True) #result=mcnemar(data, exact=False, correction=True) : 샘플사이즈가 25개 이상일 때
print(result)

pvalue      0.2890624999999999
statistic   2.0


## 판별분석 

In [10]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
x=np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
y=np.array([1,1,1,2,2,2])
clf=QuadraticDiscriminantAnalysis()
clf.fit(x,y)
clf.predict([[-0.8, -1]])

array([1])

## 대응분석 : 2개 이상의 범주 군 사이의 상관성을 분석하는 기법

In [11]:
X=[[0,0,1],[1,0,0],[2,2,2],[3,5,4]]
Y=[[0,1,-0.2], [0,9,1.1], [6,2,5.9], [11,9,12.3]]

from sklearn.cross_decomposition import CCA
cca=CCA(n_components=1)
cca.fit(X,Y)
X_c, Y_c=cca.transform(X,Y)
X_test=[[2,4,5]]
Y_test=[[0.4, 5, 5]]
cca.predict(X_test, Y_test)

array([[10.91678023,  6.9609436 , 11.81740118]])

## 다차원척도법 MDS

In [12]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

data=pd.read_csv('D:\Machine Learning\ADP-master\data/bike_marketing.csv')

# type별로 컬럼 분류
col_int = ['company_num', 'marketing_total', 'employees']
col_float = ['google_adwords', 'facebook', 'twitter', 'revenues']
col_cat = ['pop_density']

col_num = col_int + col_float
data[col_cat]= data[col_cat].astype('str')
data[col_int] = data[col_int].astype('int', errors='ignore')
data[col_float] = data[col_float].astype('float')

X=data[col_num]
X

Unnamed: 0,company_num,marketing_total,employees,google_adwords,facebook,twitter,revenues
0,1,166,5,65.66,47.86,52.46,39.26
1,2,172,7,39.10,55.20,77.40,38.90
2,3,295,11,174.81,52.01,68.01,49.51
3,4,183,7,34.36,61.96,86.86,40.56
4,5,150,9,78.21,40.91,30.41,40.21
...,...,...,...,...,...,...,...
177,178,344,9,10000.00,23.72,12.32,46.02
178,179,102,6,25000.00,24.26,34.36,39.16
179,180,136,9,59000.00,53.96,17.46,43.96
180,181,148,8,4000.00,28.18,22.88,42.38


In [13]:
from sklearn.manifold import MDS
mds=MDS(n_components=2) #계량적    *참고: nmds=MDS(n_components=2, metric=False) : 비계량적
X_transformed=mds.fit_transform(X[:100])
X_transformed.shape

(100, 2)