In [10]:
import numpy as np
import csv

# Function to read the header
def read_header(file_path):
    """Return the first row of the CSV file at *file_path*.

    Args:
        file_path: Path of the CSV file to inspect.

    Returns:
        The fields of the first CSV record as a list of strings.

    Raises:
        StopIteration: If the file contains no rows at all.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, newlines embedded in quoted
    # fields are silently rewritten by universal-newline translation.
    with open(file_path, 'r', newline='') as f:
        header = next(csv.reader(f))
    return header

# Input CSVs, listed in the order they are read: x_test, y_train, x_train.
csv_paths = ('dataset/x_test.csv', 'dataset/y_train.csv', 'dataset/x_train.csv')

# Column names: the first row of each file.
x_test_header, y_train_header, x_train_header = (
    read_header(path) for path in csv_paths
)

# Numeric contents of x_test, y_train and x_train; the header row is skipped
# because it was already read above.
x_test, y_train, x_train = (
    np.genfromtxt(path, delimiter=',', skip_header=1) for path in csv_paths
)
# Find uninformative columns using the training set only, so that the same
# column indices are removed from both x_train and x_test.
# A column is uninformative when, ignoring NaNs, it holds at most one distinct
# value: a single constant, or no value at all (the column is entirely NaN).
columns_to_drop = []
for i in range(x_train.shape[1]):
    column = x_train[:, i]
    unique_values = np.unique(column[~np.isnan(column)])
    # <= 1 (not == 1) so that all-NaN columns, which leave no values after
    # masking, are dropped as well.
    if len(unique_values) <= 1:
        columns_to_drop.append(i)

# Drop these columns in both x_train and x_test. np.delete returns a new
# array, so x_train / x_test stay untouched without an explicit .copy().
x_train_cleaned = np.delete(x_train, columns_to_drop, axis=1)
x_test_cleaned = np.delete(x_test, columns_to_drop, axis=1)

# Drop the same columns from the headers; a set keeps each membership test O(1).
dropped = set(columns_to_drop)
x_train_header_cleaned = [col for i, col in enumerate(x_train_header) if i not in dropped]
x_test_header_cleaned = [col for i, col in enumerate(x_test_header) if i not in dropped]

# Write each cleaned matrix to its own CSV file, header row first.
# The training set is written before the test set.
for out_path, header_row, matrix in (
    ('dataset/x_train_cleaned.csv', x_train_header_cleaned, x_train_cleaned),
    ('dataset/x_test_cleaned.csv', x_test_header_cleaned, x_test_cleaned),
):
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_row)
        writer.writerows(matrix)

# Report the training matrix shape before and after the column removal.
print(x_train.shape)
print(x_train_cleaned.shape)

(328135, 322)
(328135, 316)
