In [1]:
import torch 
import torch.nn  as nn 
import torch.optim  as optim 
import numpy as np 
import pandas as pd 
from sklearn.impute  import SimpleImputer 
from sklearn.preprocessing  import LabelEncoder, OneHotEncoder 
from sklearn.preprocessing  import MinMaxScaler, StandardScaler 
from imblearn.over_sampling  import SMOTE 
from sklearn.model_selection  import train_test_split 
from torch.utils.data  import TensorDataset, DataLoader 
from tqdm import tqdm

In [2]:
# 设置随机种子 
torch.manual_seed(1)  
torch.cuda.manual_seed(1)  

In [5]:
# Iris
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
iris = fetch_ucirepo(id=53) 
# data (as pandas dataframes) 
X = iris.data.features 
y = iris.data.targets 

# 处理缺失值
missing_threshold = 0.3
missing_ratio = X.isnull().mean()
X = X.drop(columns=missing_ratio[missing_ratio > missing_threshold].index)  # 删除缺失值比例超过30%的特征

# 处理数据类型转换
y[y.columns[0]] = LabelEncoder().fit_transform(y[y.columns[0]])
for col in X.select_dtypes(include=['object']).columns:
    if X[col].nunique() < 20:  # 类别变量进行编码
        X[col] = LabelEncoder().fit_transform(X[col])
    else:  # 删除冗余字符串特征 (通常为注释)
        X.drop(columns=[col], inplace=True)

# 对剩余缺失值采用中位数填充
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# 归一化特征
scaler_minmax = MinMaxScaler()
scaler_zscore = StandardScaler()

# 对具有高斯分布的特征进行z-score标准化，否则进行min-max归一化
X_scaled = X_imputed.copy()
for col in X_scaled.columns:
    if np.abs(X_scaled[col].skew()) < 1:  # 判断是否接近正态分布
        X_scaled[col] = scaler_zscore.fit_transform(X_scaled[[col]])
    else:
        X_scaled[col] = scaler_minmax.fit_transform(X_scaled[[col]])

# 处理类别不均衡
try:
    smote = SMOTE(sampling_strategy='auto', k_neighbors=1, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
except:
    (X_resampled, y_resampled) = (X_scaled, y)

# 分层拆分数据集（保持类别比例）
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

# 查看处理后数据集大小
print(f"训练集大小: {X_train.shape}, 测试集大小: {X_test.shape}")

训练集大小: (120, 4), 测试集大小: (30, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y.columns[0]] = LabelEncoder().fit_transform(y[y.columns[0]])


In [6]:
y_train

Unnamed: 0,class
8,0
106,2
76,1
9,0
89,1
...,...
37,0
2,0
33,0
52,1


In [7]:
X_train

Unnamed: 0,sepal length,sepal width,petal length,petal width
8,-1.748856,-0.356361,-1.341272,-1.312977
106,-1.143017,-1.281972,0.421564,0.659118
76,1.159173,-0.587764,0.592162,0.264699
9,-1.143017,0.106445,-1.284407,-1.444450
89,-0.416010,-1.281972,0.137236,0.133226
...,...,...,...,...
37,-1.143017,0.106445,-1.284407,-1.444450
2,-1.385353,0.337848,-1.398138,-1.312977
33,-0.416010,2.651878,-1.341272,-1.312977
52,1.280340,0.106445,0.649027,0.396172


In [None]:
# 创建TensorDataset和DataLoader 
# 数据集用于存储训练数据，DataLoader用于批量加载数据 
train_dataset = TensorDataset(X_train_tensor, y_train_tensor) 
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) 
test_dataset = TensorDataset(X_test_tensor, y_test_tensor) 
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True) 