In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Load dataset
file_path = "return_predictability_data_2009to2021.csv"  # Adjust the path as needed
data = pd.read_csv(file_path)

# Convert DATE to datetime
data['DATE'] = pd.to_datetime(data['DATE'])

# Sort data by DATE
data = data.sort_values(by='DATE').reset_index(drop=True)

# Extract temporal components: year, month, quarter
data['year'] = data['DATE'].dt.year
data['month'] = data['DATE'].dt.month
data['quarter'] = data['DATE'].dt.quarter


# Imputation Function
def apply_imputation(data, feature_groups):
    """
    Apply imputation methods dynamically based on feature groups.
    """
    for method, features in feature_groups.items():
        valid_features = [feature for feature in features if feature in data.columns]
        if method == 'mean':
            data[valid_features] = data[valid_features].fillna(data[valid_features].mean())
        elif method == 'linear':
            data[valid_features] = data[valid_features].interpolate(method='linear', axis=0)
        elif method == 'knn':
            knn_imputer = KNNImputer(n_neighbors=5)
            data[valid_features] = knn_imputer.fit_transform(data[valid_features])
        elif method == 'iterative':
            iterative_imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0)
            data[valid_features] = iterative_imputer.fit_transform(data[valid_features])
    return data



# Apply Imputation
imputation_feature_groups = {
    'mean': ['beta', 'betasq', 'dolvol', 'pricedelay', 'ill', 'std_dolvol',
            'zerotrade', 'mom1m', 'bm', 'bm_ia', 'cashdebt', 'cashpr', 'cfp',
             'cfp_ia', 'chinv', 'depr', 'divo', 'dy', 'ep', 'gma', 'herf', 'ps', 'quick',
             'roic', 'salerec'],
    'linear': ['chmom', 'mom6m', 'mom12m', 'mom36m', 'chcsho', 'convind'],
    'knn': ['absacc', 'agr', 'chempia', 'currat', 'hire', 'invest', 'lgr',
            'mve_ia', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 'pchgm_pchsale', 'pchquick', 
            'saleinv', 'sgr', 'sp', 'tang', 'idiovol', 'turn', 
            'std_turn', 'baspread'],
    'iterative': ['rd_mve', 'pctacc', 'orgcap', 'acc', 'chatoia', 'grcapx',
                  'cinvest', 'rsup', 'tb']
}

# Apply imputations
data = apply_imputation(data, imputation_feature_groups)

# Handle missing values for 'chcsho' and 'convind'
data['chcsho'].fillna(data['chcsho'].median(), inplace=True)  # Use median for numerical feature
data['convind'].fillna(data['convind'].mode()[0], inplace=True)

# Replace missing labels (e.g., 'missing' with 'Unknown') for categorical features
data['sic2'] = data['sic2'].replace('missing', 'Unknown')

# Print the number of features and rows in the final dataset
print(f"\nFinal Dataset: {data.shape[0]} rows, {data.shape[1]} features.")

#Checking missing data
print("Missing values after imputation:")
print(data.isnull().sum())


# Check for missing values after imputation
missing_summary = data.isnull().sum()
missing_count = missing_summary[missing_summary > 0]

if not missing_count.empty:
    print("\nFeatures with Missing Values After Imputation:")
    print(missing_count)
else:
    print("\nNo missing values found after imputation!")


# Save processed dataset
output_path = "processed_data_2009to2021_data.csv"
data.to_csv(output_path, index=False)
print(f"Processed dataset saved to {output_path}")


KeyboardInterrupt: 

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Load dataset
file_path = "return_predictability_data_2009to2021.csv"  # Adjust the path as needed
data = pd.read_csv(file_path)

# Convert DATE to datetime
data['DATE'] = pd.to_datetime(data['DATE'])

# Sort data by DATE
data = data.sort_values(by='DATE').reset_index(drop=True)

# Extract temporal components: year, month, quarter
data['year'] = data['DATE'].dt.year
data['month'] = data['DATE'].dt.month
data['quarter'] = data['DATE'].dt.quarter


# Imputation Function
def apply_imputation(data, feature_groups):
    """
    Apply imputation methods dynamically based on feature groups.
    """
    for method, features in feature_groups.items():
        valid_features = [feature for feature in features if feature in data.columns]
        if method == 'mean':
            data[valid_features] = data[valid_features].fillna(data[valid_features].mean())
        elif method == 'median':
            data[valid_features] = data[valid_features].fillna(data[valid_features].median())
        elif method == 'mode':
            for feature in valid_features:
                data[feature] = data[feature].fillna(data[feature].mode()[0])
        elif method == 'knn':
            knn_imputer = KNNImputer(n_neighbors=5)
            data[valid_features] = knn_imputer.fit_transform(data[valid_features])
    return data


# Redistributed Features for Imputation (without KNN)
imputation_feature_groups = {
    'mean': ['beta', 'betasq', 'dolvol', 'pricedelay', 'ill', 'std_dolvol',
             'zerotrade', 'mom1m', 'bm', 'bm_ia', 'cashdebt', 'cashpr', 'cfp',
             'cfp_ia', 'chinv', 'depr', 'divo', 'dy', 'ep', 'gma', 'herf', 'ps', 
             'quick', 'roic', 'salerec', 'absacc', 'agr', 'currat', 'hire', 
             'invest', 'lgr', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 
             'pchgm_pchsale', 'pchquick', 'saleinv', 'sgr', 'sp', 'tang', 
             'idiovol', 'turn', 'std_turn', 'baspread', 'cash', 'roaq', 'roeq', 'age'],
    'linear': ['chmom', 'mom6m', 'mom12m', 'mom36m', 'chcsho', 'convind'],
    'iterative': ['rd_mve', 'pctacc', 'orgcap', 'acc', 'chatoia', 'grcapx',
                  'cinvest', 'rsup', 'tb']
}                  

# Apply imputations
data = apply_imputation(data, imputation_feature_groups)

# Handle missing values for 'chcsho' and 'convind'
data['chcsho'].fillna(data['chcsho'].median(), inplace=True)  # Use median for numerical feature
data['convind'].fillna(data['convind'].mode()[0], inplace=True)

# Replace missing labels (e.g., 'missing' with 'Unknown') for categorical features
data['sic2'] = data['sic2'].replace('missing', 'Unknown')

# Identify and drop features with remaining missing data
missing_summary = data.isnull().sum()
features_to_drop = missing_summary[missing_summary > 0].index.tolist()
if features_to_drop:
    print(f"\nDropping features with remaining missing data: {features_to_drop}")
    data = data.drop(columns=features_to_drop)

# Check for duplicate features and drop them
# Check for duplicate features and drop them
def drop_duplicate_features(data):
    """
    Identify and drop duplicate features based on their values.
    """
    duplicate_features = []
    cols = data.columns
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            if data[cols[i]].equals(data[cols[j]]):  # Check if two columns are identical
                duplicate_features.append(cols[j])
    duplicate_features = list(set(duplicate_features))  # Remove duplicates from the list
    if duplicate_features:
        print(f"\nDropping duplicate features: {duplicate_features}")
        data = data.drop(columns=duplicate_features)
    return data

# Print the number of features and rows in the final dataset
print(f"\nFinal Dataset: {data.shape[0]} rows, {data.shape[1]} features.")
print("Missing values after dropping features:")
print(data.isnull().sum())

# Save the engineered dataset
output_path = "processed_sample_dataset.csv"
data.to_csv(output_path, index=False)
print(f"Imputation completed and dataset saved to {output_path}.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['chcsho'].fillna(data['chcsho'].median(), inplace=True)  # Use median for numerical feature
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['convind'].fillna(data['convind'].mode()[0], inplace=True)



Dropping features with remaining missing data: ['mvel1', 'chmom', 'mom6m', 'mom12m', 'mom36m', 'acc', 'chatoia', 'chempia', 'chpmia', 'divi', 'egr', 'grcapx', 'grltnoa', 'lev', 'mve_ia', 'operprof', 'orgcap', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'rd', 'rd_mve', 'rd_sale', 'realestate', 'salecash', 'secured', 'securedind', 'sin', 'tb', 'aeavol', 'chtx', 'cinvest', 'ear', 'nincr', 'roavol', 'rsup', 'stdacc', 'stdcf', 'ms', 'maxret', 'retvol']

Final Dataset: 890593 rows, 67 features.
Missing values after dropping features:
permno        0
DATE          0
beta          0
betasq        0
dolvol        0
             ..
macro_dfy     0
macro_svar    0
year          0
month         0
quarter       0
Length: 67, dtype: int64
Imputation completed and dataset saved to processed_sample_dataset.csv.
