In [13]:
import numpy as np
import pandas as pd
import zipfile
import pyarrow
import os
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split

In [9]:
!python3 -m pip install "numpy<2"
from pmdarima import auto_arima


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


Load the training dataset and display its column and dtype information.

In [10]:
# Read the training parquet data into a single DataFrame, then print its
# schema summary (index range, column names and dtypes).
train_parquet_path = 'Jane Street Market Data Forecasting/train.parquet'
df = pd.read_parquet(train_parquet_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47127338 entries, 0 to 47127337
Data columns (total 93 columns):
 #   Column        Dtype   
---  ------        -----   
 0   date_id       int16   
 1   time_id       int16   
 2   symbol_id     int8    
 3   weight        float32 
 4   feature_00    float32 
 5   feature_01    float32 
 6   feature_02    float32 
 7   feature_03    float32 
 8   feature_04    float32 
 9   feature_05    float32 
 10  feature_06    float32 
 11  feature_07    float32 
 12  feature_08    float32 
 13  feature_09    int8    
 14  feature_10    int8    
 15  feature_11    int16   
 16  feature_12    float32 
 17  feature_13    float32 
 18  feature_14    float32 
 19  feature_15    float32 
 20  feature_16    float32 
 21  feature_17    float32 
 22  feature_18    float32 
 23  feature_19    float32 
 24  feature_20    float32 
 25  feature_21    float32 
 26  feature_22    float32 
 27  feature_23    float32 
 28  feature_24    float32 
 29  feature_25  

In [11]:
def reduce_mem_usage(self, float16_as32=True):
    """Downcast the numeric columns of a DataFrame in place to save memory.

    Each NumPy signed-integer or floating-point column is converted to the
    narrowest dtype whose range contains the column's minimum and maximum.
    Columns of any other dtype (object, category, bool, unsigned integers,
    datetimes, pandas extension dtypes) are left untouched, as are empty or
    all-NaN columns. A column is never converted to a wider dtype than it
    already has.

    Parameters
    ----------
    self : pandas.DataFrame
        The frame to shrink. It is modified in place. (The parameter keeps
        its historical name so existing callers continue to work.)
    float16_as32 : bool, default True
        When True, float columns whose values fit in the float16 range are
        stored as float32 to keep more precision. When False they are stored
        as float16, which saves more memory but rounds the values.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, after downcasting.

    Notes
    -----
    Prints the memory usage before and after, and the percentage saved.
    """
    # Work on the frame that was passed in, not on a notebook-level global.
    frame = self

    # memory_usage() reports bytes per column (plus the index); convert to MB.
    start_mem = frame.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is: {:.2f} MB".format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    f16_info = np.finfo(np.float16)
    f32_info = np.finfo(np.float32)

    for col in frame.columns:
        col_type = frame[col].dtype

        # Only plain NumPy signed ints ("i") and floats ("f") are downcast.
        # Anything else would either be corrupted by a float cast (bool,
        # unsigned ints) or cannot be range-checked against finfo/iinfo.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min, c_max = frame[col].min(), frame[col].max()

        # min()/max() skip NaN, so a NaN result means the column is empty or
        # entirely NaN: there is no value range to fit, leave it alone.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        if col_type.kind == "i":
            lo, hi = int(c_min), int(c_max)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            target = next(
                (
                    candidate
                    for candidate in int_candidates
                    if np.iinfo(candidate).min <= lo and hi <= np.iinfo(candidate).max
                ),
                np.int64,
            )
        else:
            lo, hi = float(c_min), float(c_max)
            if f16_info.min <= lo and hi <= f16_info.max:
                # In float16 range: optionally keep float32 for precision.
                target = np.float32 if float16_as32 else np.float16
            elif f32_info.min <= lo and hi <= f32_info.max:
                target = np.float32
            else:
                target = np.float64

        # Never widen a column: this function only exists to reduce memory.
        if np.dtype(target).itemsize < col_type.itemsize:
            frame[col] = frame[col].astype(target)

    end_mem = frame.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(saved_pct))

    return frame


In [14]:
# Shrink the loaded frame. float16_as32=False selects float16 (not float32)
# for float columns whose values fit in the float16 range.
train_data = reduce_mem_usage(df, float16_as32=False)

Memory usage of dataframe is: 15910.22 MB
Memory usage after optimization is: 8179.83 MB
Decreased by 48.6%


In [16]:
# Fixed seed so the random 90/10 split below is reproducible.
seed = 123

# Hold out 10% of the rows for evaluation.
# NOTE(review): the "target" passed here is the `time_id` column, which is
# also still present inside the feature frame -- confirm this is intended.
# NOTE(review): train_test_split shuffles rows by default; if the data is to
# be modelled as a time series (ARIMA is imported above), a chronological
# split may be more appropriate -- confirm.
Xt, Xe, Yt, Ye = train_test_split(
    df,
    df["time_id"],
    test_size=0.1,
    random_state=seed,
)

In [17]:
# Show the training split; the notebook renders a truncated preview of the frame.
Xt

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id
8458198,538,471,10,1.008789,0.708496,0.900391,1.141602,0.653809,0.807617,0.054901,...,0.019180,-0.010727,0.031372,0.753418,-0.588867,0.076172,0.810059,-0.508789,0.121643,3
28433828,1191,497,12,1.885742,0.533691,0.201050,0.692383,0.168457,0.104248,-1.098633,...,-0.013199,-0.217651,-0.002594,-0.094116,0.446533,-0.120483,-0.107727,1.117188,-0.233398,7
13524700,740,893,12,1.712891,1.372070,0.844727,0.498779,1.265625,-0.223022,0.140869,...,0.006252,0.571777,0.000249,0.459717,-0.524414,-0.044556,0.651855,-0.344971,-0.087891,4
7803057,510,633,5,2.492188,1.090820,-0.583984,0.945312,1.204102,1.348633,1.125000,...,-0.062805,-0.065063,-0.435547,0.443359,-0.103149,0.903320,0.482666,-0.029510,2.037109,3
25393648,1110,790,12,2.791016,1.103516,0.224121,0.974609,1.544922,0.456055,-0.009941,...,-0.319336,0.217041,-0.076965,1.017578,-0.077698,0.451172,1.732422,-0.234619,0.785645,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34795484,1362,185,27,2.484375,1.986328,0.604980,2.488281,1.659180,0.057465,-1.247070,...,0.601074,-0.373291,0.956055,0.891113,0.963867,-0.471924,0.672852,2.064453,-1.335938,8
28329282,1188,707,35,1.073242,0.986328,0.049774,1.036133,0.094238,0.567383,-0.458984,...,0.025345,0.314941,-0.037598,0.387939,-0.404053,-0.144287,0.459229,-0.984863,-0.183960,6
20999550,985,345,11,3.160156,1.034180,-0.690430,0.418945,0.447266,-0.138672,-0.241211,...,0.019608,0.013046,-0.226807,0.405518,0.019562,-0.033661,0.580566,0.012566,0.215576,5
42220909,1567,9,12,2.933594,0.040009,-1.611328,-0.239868,0.459473,-0.763672,-1.078125,...,0.053467,-0.310303,-0.882812,-0.453369,-0.561035,-0.542480,-0.618652,-0.526855,-0.551758,9
