In [1]:
import numpy as np
import pandas as pd
import zipfile
import pyarrow
import os

Import dataset and get information about the data

In [2]:
df = pd.read_parquet(r'C:\Users\nicho\Downloads\Jane_street_daeseo_ai\jane-street-real-time-market-data-forecasting/train.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47127338 entries, 0 to 47127337
Data columns (total 93 columns):
 #   Column        Dtype   
---  ------        -----   
 0   date_id       int16   
 1   time_id       int16   
 2   symbol_id     int8    
 3   weight        float32 
 4   feature_00    float32 
 5   feature_01    float32 
 6   feature_02    float32 
 7   feature_03    float32 
 8   feature_04    float32 
 9   feature_05    float32 
 10  feature_06    float32 
 11  feature_07    float32 
 12  feature_08    float32 
 13  feature_09    int8    
 14  feature_10    int8    
 15  feature_11    int16   
 16  feature_12    float32 
 17  feature_13    float32 
 18  feature_14    float32 
 19  feature_15    float32 
 20  feature_16    float32 
 21  feature_17    float32 
 22  feature_18    float32 
 23  feature_19    float32 
 24  feature_20    float32 
 25  feature_21    float32 
 26  feature_22    float32 
 27  feature_23    float32 
 28  feature_24    float32 
 29  feature_25  

In [3]:
def reduce_mem_usage(self, float16_as32=True):
    # memory_usage() calculates the memory usage of the dataframe; sum computes the total.
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is: {:.2f} MB".format(start_mem))

    for col in df.columns:  # Iterate through column names
        col_type = df[col].dtype  # Get the data type of the column

        if col_type != object and str(col_type) != "category":  # Skip if the column is of object type
            c_min, c_max = df[col].min(), df[col].max()  # Get the min and max values of the column

            # If the column is an integer type
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)

            # If the column is a floating-point type
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    if float16_as32:  # If high precision is needed, use float32
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df


In [5]:
train_data = reduce_mem_usage(df, False)

Memory usage of dataframe is: 8179.83 MB
Memory usage after optimization is: 8179.83 MB
Decreased by 0.0%


In [7]:
feat_cols = [f'feature_{i:02d}' for i in range(79)]
print(feat_cols)
for i in feat_cols:
    train_data[i][np.isnan(train_data[i])] = 0
print([np.max(train_data[i]) for i in feat_cols])
print([np.min(train_data[i]) for i in feat_cols])

['feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feat

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train_data[i][np.isnan(train_data[i])] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[i][np.isn

[np.float16(6.477), np.float16(6.293), np.float16(6.492), np.float16(6.695), np.float16(6.164), np.float16(35.72), np.float16(93.6), np.float16(54.94), np.float16(21.69), np.int8(82), np.int8(12), np.int16(539), np.float16(139.4), np.float16(381.2), np.float16(169.1), np.float16(111.06), np.float16(220.1), np.float16(134.5), np.float16(5.824), np.float16(5.973), np.float16(5.42), np.float16(161.5), np.float16(4.13), np.float16(3.244), np.float16(10.74), np.float16(4.137), np.float16(4.863), np.float16(2.643), np.float16(2.88), np.float16(7.535), np.float16(9.414), np.float16(128.0), np.float16(6.008), np.float16(5.996), np.float16(5.07), np.float16(5.37), np.float16(5.7), np.float16(31.5), np.float16(26.22), np.float16(5.586), np.float16(5.473), np.float16(5.246), np.float16(5.15), np.float16(5.152), np.float16(4.94), np.float16(5.223), np.float16(4.688), np.float16(1019.0), np.float16(487.5), np.float16(147.4), np.float16(5.94), np.float16(5.81), np.float16(5.67), np.float16(5.707), n

In [8]:
# Log Transform and normalize it to 0,1
for i in feat_cols:
    mean_val = np.mean(train_data[i])
    std_val = np.std(train_data[i])
    train_data[i] = (train_data[i] - mean_val) / std_val

# Normalize log-transformed data to [0, 1] range

print(train_data)

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


          date_id  time_id  symbol_id    weight  feature_00  feature_01  \
0               0        0          1  3.888672         NaN         NaN   
1               0        0          7  1.371094         NaN         NaN   
2               0        0          9  2.285156         NaN         NaN   
3               0        0         10  0.690430         NaN         NaN   
4               0        0         14  0.440674         NaN         NaN   
...           ...      ...        ...       ...         ...         ...   
47127333     1698      967         34  3.242188         NaN         NaN   
47127334     1698      967         35  1.079102         NaN         NaN   
47127335     1698      967         36  1.033203         NaN         NaN   
47127336     1698      967         37  1.243164         NaN         NaN   
47127337     1698      967         38  3.193359         NaN         NaN   

          feature_02  feature_03  feature_04  feature_05  ...  responder_0  \
0                NaN 

  has_large_values = (abs_vals > 1e6).any()
