In [1]:
import pandas as pd
from sklearn.impute import KNNImputer

# Load the dataset
file_path = 'US Stock Market Dataset.csv'
data = pd.read_csv(file_path)

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013 entries, 0 to 1012
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1013 non-null   int64  
 1   Date               1013 non-null   object 
 2   Natural_Gas_Price  1013 non-null   float64
 3   Natural_Gas_Vol.   1009 non-null   float64
 4   Crude_oil_Price    1013 non-null   float64
 5   Crude_oil_Vol.     990 non-null    float64
 6   Copper_Price       1013 non-null   float64
 7   Copper_Vol.        976 non-null    float64
 8   Bitcoin_Price      1013 non-null   object 
 9   Bitcoin_Vol.       1013 non-null   int64  
 10  Platinum_Price     1013 non-null   object 
 11  Platinum_Vol.      636 non-null    float64
 12  Ethereum_Price     1013 non-null   object 
 13  Ethereum_Vol.      1013 non-null   int64  
 14  S&P_500_Price      1013 non-null   object 
 15  Nasdaq_100_Price   1013 non-null   object 
 16  Nasdaq_100_Vol.    1012 

In [4]:
#字符串转换为数值
for col in data.columns:
    if data[col].dtype == 'object':
        if col!='Date':
            data[col] = pd.Categorical(data[col]).codes

data.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013 entries, 0 to 1012
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1013 non-null   int64  
 1   Date               1013 non-null   object 
 2   Natural_Gas_Price  1013 non-null   float64
 3   Natural_Gas_Vol.   1009 non-null   float64
 4   Crude_oil_Price    1013 non-null   float64
 5   Crude_oil_Vol.     990 non-null    float64
 6   Copper_Price       1013 non-null   float64
 7   Copper_Vol.        976 non-null    float64
 8   Bitcoin_Price      1013 non-null   int16  
 9   Bitcoin_Vol.       1013 non-null   int64  
 10  Platinum_Price     1013 non-null   int16  
 11  Platinum_Vol.      636 non-null    float64
 12  Ethereum_Price     1013 non-null   int16  
 13  Ethereum_Vol.      1013 non-null   int64  
 14  S&P_500_Price      1013 non-null   int16  
 15  Nasdaq_100_Price   1013 non-null   int16  
 16  Nasdaq_100_Vol.    1012 

In [6]:
#使用knn对缺失值进行填充
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(data.select_dtypes(include=['number']))#只对数值型数据进行填充
filled_Data = pd.DataFrame(imputed_data, columns=data.select_dtypes(include=['number']).columns)

#将填充后的数据与非数值型数据合并
non_numerical_columns = data.select_dtypes(exclude=['number']).columns
if len(non_numerical_columns) > 0:
    final_imputed_data = pd.concat([filled_Data, data[non_numerical_columns]], axis=1)
else:
    final_imputed_data = filled_Data

final_imputed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013 entries, 0 to 1012
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1013 non-null   float64
 1   Natural_Gas_Price  1013 non-null   float64
 2   Natural_Gas_Vol.   1013 non-null   float64
 3   Crude_oil_Price    1013 non-null   float64
 4   Crude_oil_Vol.     1013 non-null   float64
 5   Copper_Price       1013 non-null   float64
 6   Copper_Vol.        1013 non-null   float64
 7   Bitcoin_Price      1013 non-null   float64
 8   Bitcoin_Vol.       1013 non-null   float64
 9   Platinum_Price     1013 non-null   float64
 10  Platinum_Vol.      1013 non-null   float64
 11  Ethereum_Price     1013 non-null   float64
 12  Ethereum_Vol.      1013 non-null   float64
 13  S&P_500_Price      1013 non-null   float64
 14  Nasdaq_100_Price   1013 non-null   float64
 15  Nasdaq_100_Vol.    1013 non-null   float64
 16  Apple_Price        1013 

In [7]:
#缺失值数据个数
print(final_imputed_data.isnull().sum())

Unnamed: 0           0
Natural_Gas_Price    0
Natural_Gas_Vol.     0
Crude_oil_Price      0
Crude_oil_Vol.       0
Copper_Price         0
Copper_Vol.          0
Bitcoin_Price        0
Bitcoin_Vol.         0
Platinum_Price       0
Platinum_Vol.        0
Ethereum_Price       0
Ethereum_Vol.        0
S&P_500_Price        0
Nasdaq_100_Price     0
Nasdaq_100_Vol.      0
Apple_Price          0
Apple_Vol.           0
Tesla_Price          0
Tesla_Vol.           0
Microsoft_Price      0
Microsoft_Vol.       0
Silver_Price         0
Silver_Vol.          0
Google_Price         0
Google_Vol.          0
Nvidia_Price         0
Nvidia_Vol.          0
Berkshire_Price      0
Berkshire_Vol.       0
Netflix_Price        0
Netflix_Vol.         0
Amazon_Price         0
Amazon_Vol.          0
Meta_Price           0
Meta_Vol.            0
Gold_Price           0
Gold_Vol.            0
Date                 0
dtype: int64


In [None]:
#导出
final_imputed_data.to_csv('newdata.csv', index=False)