Decision Tree Method --- analysis-of-student-mental-health 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the student mental-health survey into a DataFrame.
df = pd.read_csv(r'C:\Users\isabe\Downloads\Student Mental health.csv')

# The submission timestamp is not used as a feature.
df = df.drop(columns='Timestamp')

# Columns that receive integer codes from LabelEncoder. The encoder is refit
# on every column, so each column gets its own independent coding.
binary_columns = ['Choose your gender', 'Marital status', 'Do you have Depression?',
                  'Do you have Anxiety?', 'Do you have Panic attack?', 'Did you seek any specialist for a treatment?']
label_encoder = LabelEncoder()
df[binary_columns] = df[binary_columns].apply(label_encoder.fit_transform)

# Whatever is still of object dtype is treated as categorical and one-hot encoded.
categorical_columns = list(df.select_dtypes(include=['object']).columns)
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Split the encoded frame into the target column and the feature matrix.
target_name = 'Do you have Depression?'
y = df_encoded[target_name]
X = df_encoded.drop(columns=target_name)

# Hold out 30% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Depth-limited decision tree; max_depth=4 acts as the pruning control.
dtree = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred = dtree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the accuracy followed by the per-class report.
print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")


Accuracy: 0.7419354838709677
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.90      0.82        20
           1       0.71      0.45      0.56        11

    accuracy                           0.74        31
   macro avg       0.73      0.68      0.69        31
weighted avg       0.74      0.74      0.72        31



In [None]:
# Bar chart of the fitted decision tree's feature importances, ordered from
# most to least important.
import numpy as np
# pyplot is imported here because this is the first cell that plots; without
# this import a fresh-kernel "Run All" stops with NameError on `plt`.
import matplotlib.pyplot as plt

importances = dtree.feature_importances_

# Positions of the features sorted by descending importance, and the matching
# column names for the x-axis tick labels.
indices = np.argsort(importances)[::-1]
names = [X.columns[i] for i in indices]

# Draw the bar chart.
plt.figure(figsize=(15, 7))
plt.title("Feature Importance")
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), names, rotation=90)
plt.ylabel("Importance")
plt.tight_layout()  # keep the rotated tick labels inside the figure
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Refit the depth-4 classifier on the training split so the drawing below
# always shows the classification tree.
dtree = DecisionTreeClassifier(max_depth=4, random_state=42)
dtree.fit(X_train, y_train)

# Visualise the fitted tree.
plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
# NOTE(review): class_names assumes class 0 means "No" and class 1 means "Yes" - confirm against the label encoding.
plot_tree(dtree, filled=True, feature_names=list(X.columns), class_names=['No Depression', 'Depression'], fontsize=10)
plt.title('Decision Tree for Predicting Depression')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Draw one histogram per column of `df`, each in its own figure titled with
# the column name.
for column in df.columns:
    fig, ax = plt.subplots()          # fresh figure and axes for this column
    df[column].hist(bins=20, ax=ax)   # 20 bins; adjust as needed
    ax.set_title(column)
    plt.show()


In [None]:
# Dataset size summary for `df`: rows are instances; the attribute count is
# the number of columns minus one (the column reserved for the target, if any).
number_of_instances, column_count = df.shape
number_of_attributes = column_count - 1

print(f"number_of_attributes: {number_of_attributes}")
print(f"number_of_instances: {number_of_instances}")

Neural networks --- analysis-of-student-mental-health

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the raw survey. `data` is deliberately left unmodified by this cell:
# the following cells index it by the original column names and expect the
# original 'Yes'/'No' labels, so all preprocessing here happens on a copy.
data = pd.read_csv(r'C:\Users\isabe\Downloads\Student Mental health.csv')
prepared = data.copy()

# Fill missing 'Age' values with the column mean.
if 'Age' in prepared.columns:
    imputer = SimpleImputer(strategy='mean')
    prepared['Age'] = imputer.fit_transform(prepared[['Age']]).ravel()

# One-hot encode the categorical columns with a single encoder. CGPA is
# included (the next preprocessing cell also treats it as categorical) so that
# no un-encoded column is left in the feature matrix.
categorical_features = ['Choose your gender', 'What is your course?', 'Your current year of Study',
                        'What is your CGPA?', 'Marital status']
present_features = [feature for feature in categorical_features if feature in prepared.columns]
if present_features:
    encoder = OneHotEncoder()  # no sparse=False argument; densify with .toarray() instead
    encoded = encoder.fit_transform(prepared[present_features]).toarray()
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(present_features),
        index=prepared.index,  # align on the same row labels when concatenating
    )
    prepared = pd.concat([prepared.drop(columns=present_features), encoded_df], axis=1)

# Standardise 'Age'.
if 'Age' in prepared.columns:
    scaler = StandardScaler()
    prepared['Age'] = scaler.fit_transform(prepared[['Age']]).ravel()

# Map the target to 1/0.
prepared['Do you have Depression?'] = prepared['Do you have Depression?'].map({'Yes': 1, 'No': 0})

# Drop the timestamp, the target and the other questionnaire answers from the features.
features = prepared.drop(['Timestamp', 'Do you have Depression?', 'Do you have Anxiety?', 'Do you have Panic attack?', 'Did you seek any specialist for a treatment?'], axis=1)
target = prepared['Do you have Depression?']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

# Reload the raw survey so this cell is self-contained and re-runnable: it
# needs the original categorical columns and the original 'Yes'/'No' labels,
# regardless of what earlier cells (or a previous run of this one) did to `data`.
data = pd.read_csv(r'C:\Users\isabe\Downloads\Student Mental health.csv')

# Fill missing 'Age' values with the column mean.
imputer = SimpleImputer(strategy='mean')
data[['Age']] = imputer.fit_transform(data[['Age']])

# One-hot encode the categorical columns.
categorical_features = ['Choose your gender', 'What is your course?', 'Your current year of Study', 'What is your CGPA?', 'Marital status']
encoder = OneHotEncoder()  # no sparse=False argument; densify with .toarray() instead
categorical_encoded = encoder.fit_transform(data[categorical_features]).toarray()
categorical_encoded_df = pd.DataFrame(
    categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_features),
    index=data.index,  # align on the same row labels when concatenating
)
data = pd.concat([data, categorical_encoded_df], axis=1).drop(categorical_features, axis=1)

# Standardise 'Age'.
scaler = StandardScaler()
data[['Age']] = scaler.fit_transform(data[['Age']])

# Features exclude the timestamp, the target and the other questionnaire answers.
X = data.drop(['Timestamp', 'Do you have Depression?', 'Do you have Anxiety?', 'Do you have Panic attack?', 'Did you seek any specialist for a treatment?'], axis=1)
# The target is kept as the raw 'Yes'/'No' answers; the modelling cells
# convert it to 1/0 themselves.
y = data['Do you have Depression?']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import pandas as pd  # used below for the dtype check on the labels

# Fully connected binary classifier: two ReLU hidden layers and a sigmoid output.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile for binary classification.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Convert 'Yes'/'No' labels to 1/0. Only non-numeric labels are mapped, so
# re-running this cell (or receiving labels that are already 1/0) does not
# turn every label into NaN.
label_map = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(y_train):
    y_train = y_train.map(label_map)
if not pd.api.types.is_numeric_dtype(y_test):
    y_test = y_test.map(label_map)
# Fail loudly instead of training on NaN labels.
if y_train.isna().any() or y_test.isna().any():
    raise ValueError("Target contains values other than 'Yes'/'No'")

# Train, holding out 20% of the training rows for validation.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate on the test split.
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)


In [None]:
import matplotlib.pyplot as plt

# Training vs. validation curves from the Keras `history` object: one figure
# for accuracy, then one for loss.
curve_specs = (
    ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
)
for train_key, val_key, title, ylabel in curve_specs:
    fig, ax = plt.subplots()
    ax.plot(history.history[train_key], label='train')
    ax.plot(history.history[val_key], label='validation')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    # The explicit legend entries take precedence over the per-line labels.
    ax.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Convert features and labels to NumPy arrays. np.asarray accepts DataFrames,
# Series and arrays alike, so this cell can be re-run without failing on
# `.values` / `.map` of an ndarray.
X = np.asarray(X, dtype=float)
y = np.asarray(y)
# Map 'Yes'/'No' to 1/0 only when the labels are not numeric already; an
# unexpected label raises KeyError instead of silently becoming NaN.
if y.dtype.kind not in 'biuf':
    label_map = {'Yes': 1, 'No': 0}
    y = np.array([label_map[label] for label in y], dtype='int64')

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train, test in kfold.split(X, y):
    # A fresh network per fold so no weights carry over between folds.
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # X and y are NumPy arrays here, so the fold indices are used directly.
    model.fit(X[train], y[train], epochs=10, batch_size=32, verbose=0)

    # evaluate returns [loss, accuracy] for this fold.
    scores.append(model.evaluate(X[test], y[test], verbose=0))

# Column-wise mean over folds: index 0 is the loss, index 1 the accuracy.
average_score = np.mean(scores, axis=0)
print(f'Average accuracy: {average_score[1]}, Average loss: {average_score[0]}')


Decision Tree Method --- house-price-prediction data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the house-price data set.
df = pd.read_csv(r'C:\Users\isabe\Downloads\house-price-prediction data.csv')

# Remove the columns the author identified as non-numeric.
df = df.drop(columns=['date', 'street', 'city', 'statezip', 'country'])

# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Depth-limited regression tree; max_depth=4 acts as the pruning control.
dtree = DecisionTreeRegressor(max_depth=4, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = dtree.predict(X_test)

# Score the predictions and print both metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature-importance chart for the fitted regression tree.
feature_importances = dtree.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances')
plt.show()

# Actual vs. predicted prices on the test split.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)  # 45-degree reference line
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs. Predicted Values')
plt.show()

# Decision-tree diagram. plot_tree draws with matplotlib, so graphviz and
# dtreeviz are not needed for this cell.
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
plot_tree(dtree, feature_names=list(features), filled=True)
plt.title('Decision Tree')
plt.show()

# Distribution of the residuals (actual minus predicted).
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.title('Distribution of Residuals')
plt.xlabel('Residuals')
plt.show()


In [None]:
# Dataset size summary for `df`: one instance per row; the attribute count is
# the number of columns minus one (the column reserved for the target, if any).
number_of_instances = len(df)
number_of_attributes = len(df.columns) - 1

print("number_of_attributes: {}".format(number_of_attributes))
print("number_of_instances: {}".format(number_of_instances))

Neural networks --- house-price-prediction data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the house-price data set.
df = pd.read_csv(r'C:\Users\isabe\Downloads\house-price-prediction data.csv')

# Remove the columns the author identified as non-numeric.
df = df.drop(columns=['date', 'street', 'city', 'statezip', 'country'])

# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training rows only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Two ReLU hidden layers and a single linear output unit (regression).
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
])

# Optimise mean squared error with Adam.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train, holding out 10% of the training rows for validation.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1)

# The only compiled quantity is the loss, so evaluate returns the test MSE.
mse = model.evaluate(X_test, y_test)
print("Mean Squared Error:", mse)


In [None]:
from sklearn.model_selection import KFold
import numpy as np

# K-fold cross-validation settings.
k = 5
kf = KFold(n_splits=k, random_state=42, shuffle=True)

# Per-fold bookkeeping.
fold_no = 1
loss_per_fold = []

for train, test in kf.split(X):
    # Select this fold's rows.
    X_train_fold, y_train_fold = X.iloc[train], y.iloc[train]
    X_test_fold, y_test_fold = X.iloc[test], y.iloc[test]

    # Standardise with statistics from the training part of the fold only.
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_test_fold = scaler.transform(X_test_fold)

    # Fresh network per fold. The input width is taken from the fold's own
    # data rather than from the global X_train, which may be stale or belong
    # to a different data set if cells were run out of order.
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(X_train_fold.shape[1],)))
    model.add(Dense(32, activation='relu'))
    # A Dropout layer could be added here, e.g. model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))  # linear output for regression

    # Optimise mean squared error with Adam.
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train silently.
    print(f'Training for fold {fold_no} ...')
    model.fit(
        X_train_fold, y_train_fold,
        epochs=50,  # may need tuning for this model
        batch_size=32,
        verbose=0
    )

    # Evaluate on the fold's held-out rows and record the loss.
    scores = model.evaluate(X_test_fold, y_test_fold, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores}')
    loss_per_fold.append(scores)

    # Advance the fold counter.
    fold_no += 1

# Mean loss across all folds.
print(f'Average scores for all folds: {np.mean(loss_per_fold)}')


In [None]:
# Wider variant of the regression network: 128 and 64 hidden units.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # widened to 128 units
model.add(Dense(64, activation='relu'))  # widened to 64 units
model.add(Dense(1, activation='linear'))  # linear output for regression
# Optimise mean squared error with Adam.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the new network before it is evaluated; without this step the test
# loss reported afterwards would be that of randomly initialised weights.
# Settings mirror the earlier regression network's training call.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, verbose=0)

In [None]:
# Score the current `model` on the held-out test split without progress output,
# then print the resulting loss value.
# NOTE(review): the number is only meaningful if `model` has been fitted - confirm it was trained before this cell runs.
test_loss = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Test loss:", test_loss)
