In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump

from collections import Counter

import warnings

# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Seaborn's dark-grid theme for all subsequent plots.
sns.set_style('darkgrid')

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score

# Use the 'Heiti TC' font family for matplotlib text.
# NOTE(review): presumably chosen so CJK labels render; confirm the font exists on the target machine.
plt.rcParams['font.family'] = ['Heiti TC']

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


In [68]:
# Load the admissions dataset. Header whitespace is stripped because this notebook refers to
# the same column both with and without a trailing space ('LOR' vs 'LOR '); after stripping,
# the names used by the cells below ('LOR', 'Chance of Admit') resolve either way.
df = pd.read_csv('Graduate Admission/Admission_Predict_Ver1.1.csv').rename(columns=str.strip)

In [70]:
# Preview the first ten records.
df.head(10)

In [72]:
# Column dtypes and non-null counts.
df.info()

In [74]:
# Column labels of the loaded frame (kept in `l` and echoed for reference).
l = df.columns
print('段名为: ', l)

In [76]:
# Count of missing values per column.
print(df.isnull().sum())

In [78]:
import numpy as np
from collections import Counter


def detect_outliers(df, n, features):
    """
    Return the index labels of rows that are Tukey outliers in more than ``n`` features.

    For every column in ``features`` the Tukey fences are
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``; a row is flagged for that column when
    its value lies strictly outside the fences. Missing values are ignored when
    the quartiles are computed and are never flagged themselves.

    Parameters
    ----------
    df : DataFrame
        Frame holding the data.
    n : int
        Threshold; a row is returned only when it is flagged in strictly more
        than ``n`` of the checked features.
    features : list
        Column names to check.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_indices = []

    for col in features:
        # nanpercentile skips missing values. Plain np.percentile yields NaN as soon as the
        # column contains one, which makes both comparisons below False and silently
        # disables detection for that column.
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)

        # Tukey's step: 1.5 times the inter-quartile range
        outlier_step = 1.5 * (Q3 - Q1)

        # Rows whose value in this column falls outside the fences
        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
        outlier_indices.extend(df.index[is_outlier.to_numpy()])

    # Count how many features flagged each row and keep the rows flagged more than n times
    hits_per_row = Counter(outlier_indices)
    multiple_outliers = [k for k, v in hits_per_row.items() if v > n]

    return multiple_outliers


# Rows flagged as outliers in more than 2 of the listed features (detect_outliers applies a strict `> n` test).
outliers_to_drop = detect_outliers(df, 2,
                                   ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research'])

# The function needs the DataFrame df plus the other arguments, for example:
# outliers_to_drop = detect_outliers(df, 2, ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research'])
# NOTE(review): the example above spells the column 'LOR ' (trailing space) while the live call uses 'LOR' - confirm which matches the CSV header.


因为异常值会对预测（特别是在回归问题中）产生巨大影响，所以我选择了管理这些异常值。

我使用了Tukey方法（Tukey JW., 1977）来检测异常值。该方法以第一四分位数（Q1）和第三四分位数（Q3）之差作为四分位距（IQR = Q3 − Q1），并取异常值步长为 1.5 × IQR。如果某行数据在某个特征上的取值落在区间 [Q1 − 1.5 × IQR, Q3 + 1.5 × IQR] 之外，该取值就被视为异常值。

我决定从数值特征（GRE Score, TOEFL Score, University Rating, SOP, LOR, CGPA, Research）中检测异常值。然后，我将那些在两个以上特征中出现异常值的行（代码中的判断为 `v > n`，n = 2，即至少三个特征）视为异常值行。

In [81]:
# Show the rows flagged as outliers
df.loc[outliers_to_drop]

结果为空：没有任何一行满足上述条件。各特征的取值都落在固定的范围内，几乎不会低于或超过 Tukey 界限，因此不存在需要剔除的异常值行。

In [84]:
# Working copy without the row-id column; `df` itself is left untouched by this cell.
cols = df.drop(columns='Serial No.')

# Transposed preview: one row per column, the first five records across.
cols.head().T

In [86]:
# Pairwise correlation matrix of all columns
corr = cols.corr()

# Boolean mask hiding the upper triangle (diagonal included) so every pair is shown once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Plain white axes style for this figure only
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(9, 7))
    # Diverging colormap on the fixed [-1, 1] correlation scale. The cyclic "hsv" map used
    # before paints both ends of the scale in near-identical hues, which makes strong and
    # weak correlations hard to tell apart.
    sns.heatmap(corr, mask=mask, square=True, annot=True, fmt='0.2f', linewidths=.8,
                cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
    ax.set_title('Feature correlation matrix')
plt.show()  # render the figure

可以看到，录取的机会与CGPA高度相关，GRE和托福成绩也是相关的。

从上面的相关性热图推断:

GRE成绩、托福成绩、CGPA成绩均呈线性相关关系
此外，有研究经历的学生各项成绩往往更高

In [90]:
# Drop the row-id column. Rebinding (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns='Serial No.', errors='ignore')

In [92]:
images_per_row = 4

# Only non-object columns get a histogram. The grid is sized from that list so that a skipped
# column leaves no hole and extra columns cannot overflow a hard-coded 2-row layout.
numeric_cols = [col for col in df.columns if df[col].dtype != 'object']
n_rows = max(1, -(-len(numeric_cols) // images_per_row))  # ceiling division

plt.figure(figsize=(15, 4 * n_rows))

# One histogram with a KDE overlay per plotted column
for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, images_per_row, position)
    sns.histplot(df[col], kde=True, color='green')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [94]:
# Target is the admission probability; every remaining column is a feature.
y = df['Chance of Admit']
X = df.drop(columns='Chance of Admit')

In [96]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
# Inspect the training features before scaling.
X_train

In [100]:
# Scale the feature values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit on the training split only, then reuse the same fitted scaler for the test split.
# NOTE: both names are rebound to the transformed outputs, so re-running this cell without
# re-running the split first would refit the scaler on already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [102]:
# Inspect the training features after scaling.
X_train

In [106]:
import pickle
from pathlib import Path

# Persist the fitted scaler (not a model) so the same scaling can be re-applied at inference time.
Path('models').mkdir(parents=True, exist_ok=True)  # the output directory may not exist yet
with open('models/scaler.pkl', 'wb') as scaler_file:  # context manager closes the handle
    pickle.dump(scaler, scaler_file)

## 模型构建：神经网络架构

In [109]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [111]:
# Seed the Python, NumPy and TensorFlow generators so weight initialisation and training are
# repeatable, in line with the fixed random_state used for the data split.
keras.utils.set_random_seed(42)

# Create a sequential model object
model = Sequential()

# Add layers to the model
# Hidden layer: 7 ReLU units fed by the 7 input features
model.add(Dense(7, activation='relu', input_dim=7))
# Output layer: a single unit with linear activation, suitable for regression
model.add(Dense(1, activation='linear'))

In [113]:
# Print the layer-by-layer structure and parameter counts of the baseline network.
model.summary()

In [115]:
# Regression problem: mean squared error as the loss, Adam as the optimizer
model.compile(loss='mean_squared_error', optimizer='Adam')

In [117]:
# Train for 10 epochs with validation_split=0.2; `history` keeps the per-epoch losses.
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)

In [118]:
# Fetch the weights and biases of layer 0
model.layers[0].get_weights()

In [121]:
# Predict on the scaled test features with the baseline network.
y_pred = model.predict(X_test)

In [123]:
from sklearn.metrics import r2_score

# R² of the baseline predictions against the test targets.
r2_score(y_test, y_pred)

R2 得分呈负数表示回归模型的性能比拟合数据的水平线差。这表明该模型无法捕获特征和目标变量之间的任何有意义的关系，从而导致预测性能较差。

In [126]:
# Training vs. validation loss per epoch; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='training loss')
ax.plot(history.history['val_loss'], label='validation loss')
ax.set(xlabel='Epoch', ylabel='Mean squared error', title='Baseline network: loss per epoch')
ax.legend()
plt.show()

## 模型改进

In [129]:
# Seed the Python, NumPy and TensorFlow generators so weight initialisation and training are
# repeatable, in line with the fixed random_state used for the data split.
keras.utils.set_random_seed(42)

# Create the model object
model = Sequential()

# First hidden layer: 15 ReLU units fed by the 7 input features
model.add(Dense(15, activation='relu', input_dim=7))

# Further hidden layers
model.add(Dense(15, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(15, activation='relu'))

# Output layer: a single linear unit for regression
model.add(Dense(1, activation='linear'))

In [131]:
# Print the layer-by-layer structure and parameter counts of the deeper network.
model.summary()

In [133]:
# Regression problem: mean squared error as the loss, Adam as the optimizer
model.compile(loss='mean_squared_error', optimizer='Adam')

In [135]:
# Train for 100 epochs with validation_split=0.2; `history` keeps the per-epoch losses.
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2)

In [140]:
# Predict on the scaled test features with the deeper network.
y_pred = model.predict(X_test)

In [142]:
from sklearn.metrics import r2_score

# R² of the deeper network's predictions against the test targets.
r2_score(y_test, y_pred)

In [144]:
# Persist the trained network to models/model.keras.
model.save('models/model.keras')

In [146]:
# Training vs. validation loss per epoch; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='training loss')
ax.plot(history.history['val_loss'], label='validation loss')
ax.set(xlabel='Epoch', ylabel='Mean squared error', title='Improved network: loss per epoch')
ax.legend()
plt.show()

In [148]:
# Preview the first five records of the working frame.
df.head()

通过增加 epoch 数量并使用额外的隐藏层细化模型架构，R2 分数显著提高。

In [153]:
from tensorflow.keras.models import load_model

# Reload the network that was saved to disk
loaded_model = load_model('models/model.keras')

# Re-read the raw data; header whitespace is stripped so the column names selected below
# resolve whether or not the CSV headers carry trailing spaces.
data = pd.read_csv('Graduate Admission/Admission_Predict_Ver1.1.csv').rename(columns=str.strip)

data = data[['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']]

# Apply the already-fitted MinMaxScaler (min-max scaling, not standardisation)
data_scaled = scaler.transform(data)
# Predict with the reloaded model
predictions = loaded_model.predict(data_scaled)

print(predictions[:10])

In [155]:
from tensorflow.keras.models import load_model

# Reload the saved network from disk; note that this rebinds the global `model`.
model_path = 'models/model.keras'
model = load_model(model_path)

# Print the layer-by-layer structure of the reloaded network.
model.summary()


In [157]:
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor,AdaBoostClassifier
from sklearn.ensemble import ExtraTreesRegressor,ExtraTreesClassifier
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.svm import SVR,SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score,mean_squared_error

In [159]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
from joblib import dump

# Instantiate the candidate regressors
dtree = DecisionTreeRegressor(random_state=42)  # decision tree
rf = RandomForestRegressor(n_estimators=100, random_state=42)  # random forest
knn = KNeighborsRegressor(n_neighbors=5)  # k-nearest neighbours
lr = LinearRegression()  # ordinary linear regression
ada = AdaBoostRegressor(random_state=42)  # AdaBoost

# Display name -> estimator
models = {
    "Decision Tree": dtree,
    "Random Forest": rf,
    "KNN": knn,
    "Linear Regression": lr,
    "AdaBoost": ada
}
results = {}

# Train, evaluate and persist every estimator.
# The loop variable is deliberately not called `model`: that name holds the Keras network
# from the cells above and would otherwise be silently rebound to the last sklearn estimator.
for name, estimator in models.items():
    estimator.fit(X_train, y_train)  # train
    predictions = estimator.predict(X_test)  # predict on the test split
    r2 = r2_score(y_test, predictions)  # R² score
    mse = mean_squared_error(y_test, predictions)  # mean squared error
    results[name] = (r2, mse)  # keep both metrics per estimator
    file_path = f'models/{name.replace(" ", "_")}_model.joblib'  # spaces replaced to keep file names simple
    dump(estimator, file_path)  # persist with joblib
    print(f"{name} model saved at: {file_path}")


# Report the evaluation results
for name, scores in results.items():
    print(f"{name} - R²: {scores[0]:.3f}, MSE: {scores[1]:.3f}")

In [165]:
from tensorflow.keras.models import load_model
import joblib

# Reload the linear-regression estimator persisted by the training loop
job_model = joblib.load("models/Linear_Regression_model.joblib")

# Re-read the raw data; header whitespace is stripped so the column names selected below
# resolve whether or not the CSV headers carry trailing spaces.
data = pd.read_csv('Graduate Admission/Admission_Predict_Ver1.1.csv').rename(columns=str.strip)

data = data[['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']]
print(job_model)
# Apply the already-fitted MinMaxScaler (min-max scaling, not standardisation)
data_scaled = scaler.transform(data)
# Predict with the reloaded estimator
predictions = job_model.predict(data_scaled)

predictions[:10]