In [17]:
import numpy as np
import numpy.random as random
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy as sp
import statsmodels.api as sm

import sklearn
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
import requests,zipfile
import io

import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show floating-point values with three decimal places in IPython's pretty-printed output.
%precision 3


# Lift pandas' display limits so frames are shown without row or column truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Download the bank marketing archive, unpack it into the working directory,
# and load the full dataset into `bank`.
zip_file_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip'

# The whole body is read into memory below, so streaming buys nothing.
# The timeout keeps the cell from hanging on a dead connection, and
# raise_for_status() stops an HTTP error page from reaching ZipFile, where it
# would surface as a confusing BadZipFile instead of the real cause.
r = requests.get(zip_file_url, timeout=60)
r.raise_for_status()

# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

# The CSV is semicolon-delimited.
bank = pd.read_csv('bank-full.csv', sep=';')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [18]:
# Column labels of the loaded dataset.
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [19]:
# Inspect the dtype of every column.
bank.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [20]:
# Split the dataset into its integer-typed and object-typed (string) columns.
bank_int = bank.select_dtypes(include=[int])
bank_object = bank.select_dtypes(include=[object])

# Show which columns landed in each subset.
for dtype_subset in (bank_int, bank_object):
    print(dtype_subset.columns)

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')
Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'y'],
      dtype='object')


In [21]:
# Logistic regression of the outcome `y` on every integer column.

# Binary target: 1 where y == 'yes', 0 otherwise.
y = bank['y'].eq('yes').astype('int64')

# Design matrix: the integer-typed predictors plus a constant (intercept) column.
X = sm.add_constant(bank_int)

# Fit the model and print the coefficient table.
model = sm.Logit(y, X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.293206
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                45211
Model:                          Logit   Df Residuals:                    45203
Method:                           MLE   Df Model:                            7
Date:                Wed, 22 Jan 2025   Pseudo R-squ.:                  0.1875
Time:                        00:32:12   Log-Likelihood:                -13256.
converged:                       True   LL-Null:                       -16315.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.4699      0.077    -45.099      0.000      -3.621      -3.319
age            0.0080      0.

In [22]:
# Contingency table of counts: one row per job, one column per outcome (no / yes).
contingency_job = pd.crosstab(index=bank['job'], columns=bank['y'])
contingency_job

y,no,yes
job,Unnamed: 1_level_1,Unnamed: 2_level_1
admin.,4540,631
blue-collar,9024,708
entrepreneur,1364,123
housemaid,1131,109
management,8157,1301
retired,1748,516
self-employed,1392,187
services,3785,369
student,669,269
technician,6757,840


In [23]:
# カイ二乗検定（職業と定期預金契約）
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_job)
print("職業と定期預金契約（y）のカイ二乗検定結果:")
print(f"カイ二乗統計量: {chi2_stat}")
print(f"p値: {p_value}")
print(f"自由度: {dof}")
print(f"期待値: \n{expected}\n")

職業と定期預金契約（y）のカイ二乗検定結果:
カイ二乗統計量: 836.1054877471965
p値: 3.337121944935502e-172
自由度: 11
期待値: 
[[4566.072  604.928]
 [8593.504 1138.496]
 [1313.044  173.956]
 [1094.939  145.061]
 [8351.558 1106.442]
 [1999.146  264.854]
 [1394.281  184.719]
 [3668.045  485.955]
 [ 828.268  109.732]
 [6708.266  888.734]
 [1150.569  152.431]
 [ 254.308   33.692]]



In [24]:
# 数値型データの標準化
numerical_transformer = StandardScaler()

# カテゴリ型データのOne-Hotエンコーディング
categorical_transformer = OneHotEncoder(drop='first')

# 前処理パイプラインの構築
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, bank_int),
        ('cat', categorical_transformer, bank_object)
    ])


In [25]:
# k-meansクラスタリングを行うパイプラインの作成
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(n_clusters=4, random_state=42))  # ここでクラスタ数を指定（4は仮の例）
])

In [26]:
# Fit the clustering pipeline, label each row, and plot the clusters in 2-D.

# Run preprocessing + k-means.
pipeline.fit(bank)

# Reuse the preprocessor the pipeline has just fitted instead of fitting it a
# second time, and transform before the 'cluster' column is added to `bank`
# so the input columns match those seen during fit.
cluster_features = pipeline.named_steps['preprocessor'].transform(bank)
if hasattr(cluster_features, 'toarray'):
    # The preprocessor may return a sparse matrix; convert so PCA gets dense input.
    cluster_features = cluster_features.toarray()

# Cluster label of every row.
bank['cluster'] = pipeline.named_steps['kmeans'].labels_

# Project onto the first two principal components for visualisation;
# random_state keeps the projection reproducible if a randomized solver is used.
pca = PCA(n_components=2, random_state=42)
principal_components = pca.fit_transform(cluster_features)

# Scatter plot of the projection, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=principal_components[:, 0],
    y=principal_components[:, 1],
    hue=bank['cluster'],
    palette='viridis',
    ax=ax,
)
ax.set_title("K-means Clustering Results (PCA)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed