In [12]:
import numpy as np
import numpy.random as random
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy as sp
import statsmodels.api as sm

import sklearn
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind
import requests,zipfile
import io
%matplotlib inline
%precision 3


#行の表示数の上限を撤廃
pd.set_option('display.max_rows', None)

#列の表示数の上限を撤廃
pd.set_option('display.max_columns', None)

zip_file_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip'

r = requests.get(zip_file_url, stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

bank = pd.read_csv('bank-full.csv', sep=';')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [13]:
print(bank.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None


In [14]:
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [15]:
# グループに分ける
yes_group = bank[bank["y"] == "yes"]["balance"]
no_group = bank[bank["y"] == "no"]["balance"]

# t検定
t_stat, p_value = ttest_ind(yes_group, no_group, equal_var=False)  # Welchのt検定

# 結果の表示
print("t統計量:", t_stat)
print("p値:", p_value)

# 有意差の解釈
alpha = 0.05
if p_value < alpha:
    print("帰無仮説を棄却します: 2つのグループ間で平均値に有意な差があります。")
else:
    print("帰無仮説を棄却できません: 2つのグループ間で平均値に有意な差はありません。")


t統計量: 9.933545392962255
p値: 4.3837327771001536e-23
帰無仮説を棄却します: 2つのグループ間で平均値に有意な差があります。


In [16]:
bank["age"].unique()

array([58, 44, 33, 47, 35, 28, 42, 43, 41, 29, 53, 57, 51, 45, 60, 56, 32,
       25, 40, 39, 52, 46, 36, 49, 59, 37, 50, 54, 55, 48, 24, 38, 31, 30,
       27, 34, 23, 26, 61, 22, 21, 20, 66, 62, 83, 75, 67, 70, 65, 68, 64,
       69, 72, 71, 19, 76, 85, 63, 90, 82, 73, 74, 78, 80, 94, 79, 77, 86,
       95, 81, 18, 89, 84, 87, 92, 93, 88], dtype=int64)

In [17]:
bank["balance"].unique()

array([ 2143,    29,     2, ...,  8205, 14204, 16353], dtype=int64)

In [19]:


# 説明変数（例: 'age', 'balance', 'duration'）
X = bank[['age', 'balance', 'duration']]

# 定数項を追加
X = sm.add_constant(X)

# 目的変数（y）
y = bank['y'].apply(lambda x: 1 if x == 'yes' else 0)  # 'yes' = 1, 'no' = 0

# ロジスティック回帰モデルの適用
model = sm.Logit(y, X).fit()

# 結果の表示
print(model.summary())


Optimization terminated successfully.
         Current function value: 0.302937
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                45211
Model:                          Logit   Df Residuals:                    45207
Method:                           MLE   Df Model:                            3
Date:                Sun, 19 Jan 2025   Pseudo R-squ.:                  0.1605
Time:                        17:38:13   Log-Likelihood:                -13696.
converged:                       True   LL-Null:                       -16315.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.5539      0.066    -53.471      0.000      -3.684      -3.424
age            0.0073      0.

In [20]:
# モデルの概要を文字列として保存
summary_text = model.summary().as_text()

# プロット用にテキストを分割して表示
fig, ax = plt.subplots(figsize=(10, 8))
ax.axis('off')
ax.text(0, 1, summary_text, fontsize=10, va='top', family='monospace')

# ファイルパス
file_path = r"C:\Users\sk062\OneDrive\デスクトップ\Imagefile\model_summary.jpeg"

# 画像を保存
plt.savefig(file_path, bbox_inches='tight', dpi=300)
plt.close()

file_path


'C:\\Users\\sk062\\OneDrive\\デスクトップ\\Imagefile\\model_summary.jpeg'