## 数据处理

In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler


In [None]:
class DataProcessing:
    """Load, impute and standardize the train/test feature tables."""

    def __init__(self):
        pass

    def readData(self, file_train: str, file_test: str, show='yes'):
        """Read the train and test CSV files and split off the target.

        Args:
            file_train: Path of the training CSV; it must contain a ``y``
                column holding the target variable.
            file_test: Path of the test CSV.
            show: When equal to ``'yes'``, print the first five training rows.

        Returns:
            A tuple ``(X_train, y_train, X_test)``: the training frame
            without ``y``, the ``y`` column, and the test frame as read.
        """
        # Honour the caller-supplied paths instead of hard-coded file names.
        df_train = pd.read_csv(file_train)
        df_test = pd.read_csv(file_test)

        if show == 'yes':
            # A bare ``head()`` expression is discarded inside a function,
            # so the preview has to be printed explicitly.
            print(df_train.head(5))

        # Separate the independent variables from the target variable.
        X_train = df_train.drop(columns='y')
        y_train = df_train['y']

        X_test = df_test

        return X_train, y_train, X_test

    def impute_KNN(self, X_train, X_test, n_neighbors=5):
        """Fill missing values with KNN imputation and re-binarize X44/X45.

        Per the original author's note, X44 and X45 contain blanks and may
        only hold 0 or 1; the code treats them as the last two columns.

        Args:
            X_train: Training features; used to fit the imputer.
            X_test: Test features with the same columns as ``X_train``.
            n_neighbors: Number of neighbours used by ``KNNImputer``.

        Returns:
            A tuple ``(X_train_imputed, X_test_imputed)`` of arrays.
        """
        imputer = KNNImputer(n_neighbors=n_neighbors)

        # Fit on the training set only and reuse the fitted imputer for the
        # test set, in the same way ``standardization`` handles its scaler.
        X_train_imputed = imputer.fit_transform(X_train)
        X_test_imputed = imputer.transform(X_test)

        # KNN imputation averages neighbours, so filled values are not
        # necessarily 0 or 1. Threshold the last two columns at 0.5 and write
        # the result back into the arrays.
        for imputed in (X_train_imputed, X_test_imputed):
            imputed[:, -2:] = np.where(imputed[:, -2:] <= 0.5, 0, 1)

        return X_train_imputed, X_test_imputed

    def standardization(self, X_train_imputed, X_test_imputed):
        """Standardize every column with statistics learned from the train set.

        NOTE(review): the original comment also mentions one-hot encoding of
        categorical variables, but no encoding is performed here; all columns
        are passed through ``StandardScaler``.

        Args:
            X_train_imputed: Imputed training features; used to fit the scaler.
            X_test_imputed: Imputed test features; only transformed.

        Returns:
            A tuple ``(X_train_imputed_scaled, X_test_imputed_scaled)``.
        """
        scaler = StandardScaler()
        X_train_imputed_scaled = scaler.fit_transform(X_train_imputed)
        X_test_imputed_scaled = scaler.transform(X_test_imputed)
        # Return train first, then test (the test set used to be returned twice).
        return X_train_imputed_scaled, X_test_imputed_scaled

## 画图，描述性统计

In [None]:
class DrawFigures:
    """Pair-plot helpers for inspecting feature columns against ``y``."""

    def __init__(self):
        pass

    def drawAll(self, df_train, X_train):
        """Draw one pair plot per group of five consecutive feature columns.

        Feature columns are the columns of ``X_train`` whose name starts with
        ``'X'``. Per the original author's note, the groups of five are meant
        to follow the grouping given in the assignment PDF.

        Args:
            df_train: Full training frame; must contain the feature columns
                and the ``y`` column used for colouring.
            X_train: Feature frame whose column names select what is plotted.
        """
        feature_columns = [col for col in X_train.columns if col.startswith('X')]

        for start in range(0, len(feature_columns), 5):
            group = feature_columns[start:start + 5]
            # Build the title from the actual first/last column of the group;
            # the previous "X{i+1} to X{i+6}" label overshot by one and was
            # also wrong for a shorter final group.
            title = f"{group[0]} to {group[-1]}"
            g = sns.pairplot(df_train[group + ['y']], hue='y')
            g.fig.suptitle(title)
            plt.show()

    def drawPair(self, df_train, X_train, max_features=3):
        """Draw a pair plot for every pair among the first feature columns.

        Args:
            df_train: Full training frame; must contain the feature columns
                and the ``y`` column used for colouring.
            X_train: Feature frame whose column names select what is plotted.
            max_features: How many leading feature columns to pair up.
                Defaults to 3, the value that used to be hard-coded; pass
                ``len(X_train.columns)`` to cover every feature.
        """
        feature_columns = [col for col in X_train.columns if col.startswith('X')]

        # Slicing clamps to the available columns, so fewer than
        # ``max_features`` features no longer raises an IndexError.
        selected = feature_columns[:max_features]

        for position, first in enumerate(selected):
            for second in selected[position + 1:]:
                title = f"{first} and {second}"
                g = sns.pairplot(df_train[[first, second, 'y']], hue='y')
                g.fig.suptitle(title)
                # ``plt.show`` without parentheses never displayed the figure.
                plt.show()

## 模型训练与评估

In [None]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [None]:
class Model:
    """Hold the prepared data sets and fit classifiers on them."""

    def __init__(self, X_train, y_train, X_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        # Set by ``LR`` so the fitted estimator can be inspected afterwards.
        self.LR_model = None

    def LR(self, random_state=42, generate_report='yes'):
        """Fit a logistic regression on the training data and predict the test set.

        This is the baseline attempt: the model is fitted directly on the
        stored features without further feature engineering.

        Args:
            random_state: Seed forwarded to ``LogisticRegression``.
            generate_report: When equal to ``'yes'``, print a classification
                report computed on the training data.

        Returns:
            The predicted labels for ``self.X_test``.
        """
        LR_model = LogisticRegression(random_state=random_state)
        LR_model.fit(self.X_train, self.y_train)
        self.LR_model = LR_model

        # Evaluate the fit on the training data and predict the test set.
        y_train_pred = LR_model.predict(self.X_train)
        y_test_pred = LR_model.predict(self.X_test)

        if generate_report == 'yes':
            # The report used to be built and then discarded; print it.
            report = classification_report(self.y_train, y_train_pred)
            print(report)

        # Hand the test predictions back instead of dropping them.
        return y_test_pred

## 生成csv结果文件

In [None]:
def createSolution(X_test, y_pred, name='solution'):
    """Write the predictions to ``<name>.csv`` and return them as a frame.

    NOTE(review): the function originally had no body, so the expected
    submission layout is not visible in this file. This implementation
    assumes two columns, ``id`` and ``y`` -- confirm against the required
    format.

    Args:
        X_test: Test features. When it is a DataFrame with an ``id`` column,
            those ids are written; otherwise rows are numbered from 0.
        y_pred: Predicted labels, one per row of ``X_test``.
        name: Output path without the ``.csv`` extension.

    Returns:
        The DataFrame that was written to disk.

    Raises:
        ValueError: If ``y_pred`` and ``X_test`` differ in length.
    """
    predictions = np.asarray(y_pred).ravel()
    if len(predictions) != len(X_test):
        raise ValueError(
            f"y_pred has {len(predictions)} rows but X_test has {len(X_test)}"
        )

    if isinstance(X_test, pd.DataFrame) and 'id' in X_test.columns:
        ids = X_test['id'].to_numpy()
    else:
        # Arrays (e.g. after imputation/scaling) carry no ids; use row numbers.
        ids = np.arange(len(predictions))

    solution = pd.DataFrame({'id': ids, 'y': predictions})
    solution.to_csv(f"{name}.csv", index=False)
    return solution

## 主文件