In [None]:
# Initialize Otter
# Creates the Otter notebook object for "code.ipynb" and binds it to `grader`.
# NOTE(review): presumably later cells call methods on `grader` to run checks;
# none are visible in this part of the notebook — confirm before renaming.
import otter
grader = otter.Notebook("code.ipynb")

# 1 用户存款行为预测

### 数据集

银行营销数据集

#### 背景介绍

金融机构希望为下一次营销活动找到最佳的改进策略，使未来的营销活动取得更好的效果。为此，数据分析师需要分析该银行最近开展的营销活动，并根据以往活动的情况预测用户是否会响应之后的营销活动。

#### 特征介绍

##### 个人信息相关
1 - age : 年龄 (数值列)

2 - job : 职业的种类(类别列: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')

3 - marital : 婚姻状况 (类别列: 'divorced','married','single','unknown')

4 - education : 受教育情况(类别列: 
'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')

5 - default: 是否信用违约 (类别列: 'no','yes','unknown')

6 - balance: 余额 (数值列)

7 - housing: 是否有住房贷款(类别列: 'no','yes','unknown')

8 - loan: 是否有个人贷款? (类别列: 'no','yes','unknown')



##### 与当前营销活动的最后一次联系：
9 - contact: 联系时使用的通信方式 (类别列: 'cellular','telephone')

10 - month: 最后一次联系的月份 (类别列: 'jan', 'feb', 'mar', ..., 'nov', 'dec')

11 - day: 最后一次联系在星期几？ (类别列: 'mon','tue','wed','thu','fri')

12 - duration: 最后一次联系的时长, 单位秒 (数值列). 

##### 其他的一些特征：
13 - campaign: 本次活动期间与该客户的联系次数 (数值列)

14 - pdays: 上次活动中联系客户后经过的天数 (数值列; 999 表示近期无联系)

15 - previous: 本次活动之前与该客户的联系次数 (数值列)

16 - poutcome: 上一次营销活动的结果 (类别列: 'failure','nonexistent','success')




### 目标

##### 预测用户是否会办理定期存款

17 - deposit: 是否会存款 (类别列: 'yes','no')


- 数据来源：https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset

## 1.1 环境导入和数据准备

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
# Fixed seed value; later cells pass it to np.random.seed for reproducibility.
rng_seed = 44

In [None]:
# Load the bank-marketing table; the path is relative to the working directory.
df=pd.read_csv("bank.csv")
# Print a summary of the frame (columns, dtypes, non-null counts) as a sanity check.
df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Features are every column except the last one; the last column is the label.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

print(len(x))

# One-hot encode the feature frame (a copy is passed, so `x` itself is untouched).
x_simple_prepared = pd.get_dummies(x.copy())

# Replace the label values with integer class codes.
lbe = LabelEncoder()
y = lbe.fit_transform(y)

## 1.2 模型的训练和评估

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

rng_seed = 44
np.random.seed(rng_seed)

# Hold out 20% of the one-hot encoded data for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x_simple_prepared,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Fit both baseline classifiers on the same (unscaled) training split.
svc_model = SVC(random_state=0, probability=True)
lr_model = LogisticRegression(random_state=0, max_iter=10000)
svc_model.fit(x_train, y_train)
lr_model.fit(x_train, y_train)

# Score each classifier on the held-out split.
y_svc_pred = svc_model.predict(x_test)
y_lr_pred = lr_model.predict(x_test)
svc_accuracy = accuracy_score(y_test, y_svc_pred)
lr_accuracy = accuracy_score(y_test, y_lr_pred)

print('svc accuracy:', svc_accuracy)
print('lr accuracy:', lr_accuracy)

# Keep the baseline SVC accuracy under its own name.
svc_simple_score = svc_accuracy


## 1.3 数据预处理和再训练

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif


rng_seed = 44
np.random.seed(rng_seed)

# Upper bound on the number of features SelectKBest keeps. It is capped below
# by the real column count so the cell does not depend on the exact number of
# one-hot columns produced by get_dummies.
N_SELECTED_FEATURES = 51

# Drop rows that contain missing values before encoding anything.
df = df.dropna()

# Separate the label (last column) and encode it as integer class codes.
y = df.iloc[:, -1]
lbe = LabelEncoder()
y_encoded = lbe.fit_transform(y)

# One-hot encode the feature columns.
x = df.iloc[:, :-1]
x_encoded = pd.get_dummies(x.copy())

# Split BEFORE fitting any transformer: fitting the scaler / selector on the
# full data would let test-set statistics and labels leak into training and
# make the reported accuracies optimistic.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_encoded, y_encoded, train_size=0.9, test_size=0.1, random_state=0
)

# Standardize using mean/variance estimated on the training split only.
scaler = preprocessing.StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_raw)
x_test_scaled = scaler.transform(x_test_raw)

# Feature selection, also fitted on the training split only.
k_best = SelectKBest(
    mutual_info_classif,
    k=min(N_SELECTED_FEATURES, x_train_scaled.shape[1]),
)
x_train_my = k_best.fit_transform(x_train_scaled, y_train)
x_test = k_best.transform(x_test_scaled)

# Whole-dataset versions, kept under their original names for any later cell
# that reads them. They reuse the train-fitted transformers, so nothing is
# re-fitted on test rows.
x_normalized = scaler.transform(x_encoded)
x_feature_engineered = k_best.transform(x_normalized)

# Baseline LR input: plain one-hot features, no scaling or selection.
# The same random_state and sizes give the same row partition as above.
x_simple_encoded = x_encoded.copy()
x_train_simple, x_test_simple, y_train_simple, y_test_simple = train_test_split(
    x_simple_encoded, y, train_size=0.9, test_size=0.1, random_state=0
)

# SVC on the preprocessed features.
svc_model = SVC(random_state=0, probability=True)
svc_model.fit(x_train_my, y_train)
y_svc_pred = svc_model.predict(x_test)
svc_score = accuracy_score(y_test, y_svc_pred)

# Baseline LR on the unscaled one-hot features.
lr_model_simple = LogisticRegression(random_state=0, max_iter=10000)
lr_model_simple.fit(x_train_simple, y_train_simple)
y_lr_pred_simple = lr_model_simple.predict(x_test_simple)
lr_simple_score = accuracy_score(y_test_simple, y_lr_pred_simple)

# LR on the preprocessed features.
lr_model = LogisticRegression(random_state=0, max_iter=10000)
lr_model.fit(x_train_my, y_train)
y_lr_pred = lr_model.predict(x_test)
lr_score = accuracy_score(y_test, y_lr_pred)

print('svc accuracy:', svc_score)
print('lr_simple accuracy:', lr_simple_score)
print('lr accuracy:', lr_score)


# 2 逻辑回归分类器的实现

## 2.1 确定优化目标

In [None]:
# Re-seed NumPy's global RNG with the fixed seed at the start of this section.
rng_seed = 44
np.random.seed(rng_seed)
# Sigmoid activation function
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Scalar for scalar input, otherwise an ndarray with the shape of ``z``,
    with every value in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1], so it cannot overflow no matter how large |z|
    # is; the naive exp(-z) overflows for large negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) — the same function.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]
# Binary cross-entropy loss
def logistic_loss(y, y_hat, eps=1e-15):
    """Mean binary cross-entropy between labels ``y`` and probabilities ``y_hat``.

    Parameters
    ----------
    y : array-like
        Target values (0 or 1).
    y_hat : array-like
        Predicted probabilities, same shape as ``y``.
    eps : float, optional
        ``y_hat`` is clipped to ``[eps, 1 - eps]`` before taking logarithms so
        that a saturated prediction (exactly 0 or 1) yields a large finite
        loss instead of ``inf`` / ``nan``.

    Returns
    -------
    float
        ``-mean(y * log(y_hat) + (1 - y) * log(1 - y_hat))``.
    """
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
# Overall loss of the linear model
def loss_function(X, y, weights):
    """Return ``logistic_loss(y, sigmoid(X · weights))``.

    ``X`` is the feature matrix, ``y`` the targets and ``weights`` the
    coefficient vector; the linear scores are passed through ``sigmoid`` and
    compared against ``y`` with ``logistic_loss``.
    """
    return logistic_loss(y, sigmoid(np.dot(X, weights)))


## 2.2 计算优化目标的梯度


$\nabla_{\theta} J(\theta) = \frac{1}{m} X^T (\hat{y} - y)$


In [None]:
# Sigmoid activation function
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Scalar for scalar input, otherwise an ndarray with the shape of ``z``,
    with every value in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1], so it cannot overflow no matter how large |z|
    # is; the naive exp(-z) overflows for large negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) — the same function.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

# Gradient of the loss with respect to the weights
def gradient(X, y, weights):
    """Return ``(1 / m) * X.T · (sigmoid(X · weights) - y)`` with ``m = y.size``.

    ``X`` is the feature matrix, ``y`` the targets (must expose ``.size``) and
    ``weights`` the coefficient vector.
    """
    n_samples = y.size
    # Difference between predicted probabilities and targets.
    residual = sigmoid(np.dot(X, weights)) - y
    return (1 / n_samples) * np.dot(X.T, residual)

## 2.3 使用随机梯度下降（SGD）进行优化

权重更新的数学表示为：

$\theta = \theta - \alpha \nabla_{\theta} J(\theta)$

In [None]:
def stochastic_gradient_descent(X, y, weights, learning_rate=0.01, num_iterations=100):
    """Fit the weights with single-sample stochastic gradient descent.

    Each outer iteration performs ``m = y.size`` updates; every update draws
    one sample index uniformly at random (with replacement) from NumPy's
    global RNG, so seed ``np.random`` beforehand for reproducible results.

    Parameters
    ----------
    X : array-like, shape (m, n_features)
        Feature matrix.
    y : array-like, shape (m,)
        Targets.
    weights : array-like, shape (n_features,)
        Initial weights. The input is copied, so the caller's array is left
        unmodified.
    learning_rate : float
        Step size applied to each single-sample gradient.
    num_iterations : int
        Number of outer iterations (passes of ``m`` random draws).

    Returns
    -------
    weights : ndarray of float64
        The updated weights.
    loss_history : list of float
        Loss on the full ``(X, y)`` after every single update, i.e.
        ``num_iterations * m`` entries. Note that this costs one full pass
        over the data per update.
    """
    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y)
    # Work on a float copy: the in-place update below must neither alter the
    # caller's array nor fail when integer weights are passed in.
    weights = np.array(weights, dtype=np.float64)

    m = y.size
    loss_history = []

    for _ in range(num_iterations):
        for _ in range(m):
            # Pick one random sample and keep it 2-D (1, n_features).
            idx = np.random.randint(m)
            X_i = X[idx, :].reshape(1, -1)
            y_i = y[idx]

            # Gradient step on that single sample.
            weights -= learning_rate * gradient(X_i, y_i, weights)

            # Record the full-data loss for monitoring.
            loss_history.append(loss_function(X, y, weights))

    return weights, loss_history
# rng_seed=44
# np.random.seed(rng_seed)
# num_features = 6
# num_samples = 138
# dummy_weights = np.random.rand(num_features)
# dummy_X = np.random.rand(num_samples, num_features)
# dummy_y = np.random.randint(0, 2, num_samples)
# initial_weights = np.zeros(num_features)

# final_weights, loss_history = stochastic_gradient_descent(dummy_X, dummy_y, initial_weights)

## 2.4 训练与评测


In [None]:
# Rebuild the baseline x_train / x_test from the one-hot encoded features.
x_simple_prepared = pd.get_dummies(x.copy())

# Encode the labels as integer class codes.
lbe = LabelEncoder()
y = lbe.fit_transform(y)

# Split into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x_simple_prepared, y, train_size=0.8, test_size=1-0.8, random_state=0
)

# Convert to plain float64 matrices; the cast maps True -> 1.0 and
# False -> 0.0 and leaves the other numeric entries as they are.
x_train = x_train.values.astype(np.float64)
x_test = x_test.values.astype(np.float64)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Re-seed NumPy's global RNG before the random sampling done later in this cell.
np.random.seed(rng_seed)

def predict(X, weights):
    """Predict 0/1 labels for the rows of ``X`` with a logistic-regression model.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix.
    weights : array-like, shape (n_features,)
        Model weights.

    Returns
    -------
    ndarray of int
        1 where the predicted probability ``sigmoid(X · weights)`` is at
        least 0.5, otherwise 0.
    """
    logits = np.dot(X, weights)
    # sigmoid(z) >= 0.5 exactly when z >= 0, so the 0.5 probability cut-off is
    # a zero cut-off on the raw score. Comparing the raw score against 0.5
    # would instead require a probability of about 0.62.
    return np.where(logits >= 0, 1, 0)

num_features = x_train.shape[1]

# Standardize x_train and x_test with statistics from the training split only.
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
# A column that is constant in the training split has std == 0; dividing by it
# would fill the column with NaN/inf and poison every weight. Use 1 instead,
# which leaves such a column at (value - mean).
std = np.where(std == 0, 1.0, std)

x_train = (x_train - mean) / std
x_test = (x_test - mean) / std

# Start from all-zero weights.
weights = np.zeros(num_features)

learning_rate = 0.01
num_iterations = 10

final_weights, loss_history = stochastic_gradient_descent(x_train, y_train, weights, learning_rate, num_iterations)

predictions = predict(x_test, final_weights)

accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
# Show the per-class metrics; previously the report was computed but never displayed.
print(report)

accuracy
