In [None]:
import numpy as np
import pandas as pd
import zipfile
import pyarrow
import os
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
import gzip
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler



In [None]:
!python3 -m pip install "numpy<2"
!python3 -m pip install pmdarima
from pmdarima import auto_arima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [None]:
# Colab-only: mount the user's Google Drive so the dataset is reachable
# through the local filesystem under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Location of the training subset on the mounted Drive. Despite the ".gzip"
# suffix the file is read below as a Parquet file.
file_path = '/content/drive/MyDrive/Reduced_file_size_train.gzip'

# Load the whole file into memory with the pyarrow engine and print a
# per-column dtype summary of the resulting frame.
df = pd.read_parquet(file_path, engine='pyarrow')
df.info()


Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
Index: 4712734 entries, 2952328 to 24244632
Data columns (total 93 columns):
 #   Column        Dtype  
---  ------        -----  
 0   date_id       int16  
 1   time_id       int16  
 2   symbol_id     int8   
 3   weight        float16
 4   feature_00    float16
 5   feature_01    float16
 6   feature_02    float16
 7   feature_03    float16
 8   feature_04    float16
 9   feature_05    float16
 10  feature_06    float16
 11  feature_07    float16
 12  feature_08    float16
 13  feature_09    int8   
 14  feature_10    int8   
 15  feature_11    int16  
 16  feature_12    float16
 17  feature_13    float16
 18  feature_14    float16
 19  feature_15    float16
 20  feature_16    float16
 21  feature_17    float16
 22  feature_18    float16
 23  feature_19    float16
 24  feature_20    float16
 25  feature_21    float16
 26  feature_22    float16
 27  feature_23    float16
 28  feature_24    float16
 29  feature_25    flo

In [None]:
def reduce_mem_usage(self, float16_as32=True):
    """Downcast numeric columns of a DataFrame to the smallest dtype that fits.

    The frame is modified **in place** (columns are re-assigned on the object
    that was passed in) and the same object is returned, so no second copy of
    a large frame is created.

    Parameters
    ----------
    self : pandas.DataFrame
        The frame to shrink. The parameter keeps its historical name for
        backward compatibility; it is an ordinary DataFrame argument.
    float16_as32 : bool, default True
        When a float column's value range fits in float16, store it as
        float32 instead (True) to keep more precision, or as float16 (False)
        for the smallest footprint.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, with downcast columns.

    Notes
    -----
    * Only NumPy signed-integer and floating-point columns are touched.
      Object, category, bool, unsigned, datetime and pandas extension dtypes
      are left unchanged.
    * Columns with no non-missing values (empty or all-NaN) are left
      unchanged, because there is no value range to fit.
    * Range checks are inclusive, so a value exactly equal to a dtype's
      limit (e.g. 127 for int8) still fits that dtype.
    * Float columns containing +/-inf fall outside every finite range and
      are stored as float64.
    """
    # Work on the frame that was actually passed in rather than on whatever
    # global happens to be called ``df``.
    df = self

    # memory_usage() reports bytes per column (plus the index); convert to MB.
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is: {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) are not
        # NumPy dtypes and cannot be range-checked with iinfo/finfo.
        if not isinstance(col_type, np.dtype):
            continue

        is_int = np.issubdtype(col_type, np.signedinteger)
        is_float = np.issubdtype(col_type, np.floating)
        if not (is_int or is_float):
            continue

        c_min, c_max = df[col].min(), df[col].max()

        # min()/max() skip NaN; a NaN result means there is no data to size.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if is_int:
            # Pick the narrowest signed integer type whose range covers the data.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            if half.min <= c_min and c_max <= half.max:
                # Range fits in half precision; optionally keep float32 to
                # avoid float16's coarse resolution.
                target = np.float32 if float16_as32 else np.float16
            elif single.min <= c_min and c_max <= single.max:
                target = np.float32
            else:
                target = np.float64
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    if start_mem > 0:
        # Guard against dividing by zero for a frame that occupies no memory.
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrink the loaded frame. The second positional argument is
# float16_as32=False, i.e. float columns whose range fits are stored as
# float16 rather than float32. The function casts columns in place and returns
# the frame it worked on, so train_data and df refer to the same object.
train_data = reduce_mem_usage(df, False)

Memory usage of dataframe is: 867.42 MB
Memory usage after optimization is: 853.94 MB
Decreased by 1.6%


In [None]:
# Display-only preview rounded to one decimal place. The result is not
# assigned, so train_data itself keeps its unrounded values.
train_data.round(1)

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id
2952328,233,17,19,5.199219,,,,,,0.300049,...,-0.099976,-0.099976,-1.200195,-0.199951,1.299805,-0.899902,-0.300049,1.200195,-0.899902,1
5274252,371,505,12,1.200195,1.099609,-1.500000,1.599609,1.500000,-0.600098,-0.700195,...,-0.799805,0.099976,0.099976,-0.500000,0.799805,-0.300049,0.300049,1.000000,-1.000000,2
25873646,1123,552,37,0.799805,0.399902,-1.799805,0.399902,0.899902,-0.700195,-2.000000,...,0.199951,-0.000000,-0.000000,-0.399902,-1.500000,-0.199951,-0.700195,-2.099609,-0.399902,6
1055013,105,565,11,0.700195,,,,,,2.800781,...,-0.099976,-2.199219,-5.000000,-0.399902,-2.300781,-5.000000,-0.300049,-0.199951,-0.700195,0
38340137,1461,186,11,4.500000,0.399902,2.000000,-0.199951,0.199951,-0.600098,0.300049,...,0.000000,-0.099976,0.199951,-0.099976,-0.300049,-0.199951,-0.199951,-0.300049,-0.399902,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15379799,802,482,26,0.500000,-1.799805,-2.000000,-1.000000,-1.900391,-1.500000,-0.600098,...,-0.700195,-0.300049,-0.500000,-0.399902,-0.199951,0.000000,-0.199951,-0.099976,0.399902,4
43467893,1600,553,18,2.000000,0.099976,-2.000000,-0.399902,-0.099976,1.700195,-0.300049,...,-0.000000,-0.000000,0.300049,1.599609,0.300049,-0.099976,2.199219,0.199951,-0.500000,9
286346,32,793,0,2.400391,,,,,,1.099609,...,0.500000,2.800781,-0.000000,-0.199951,2.300781,-0.300049,-0.600098,-0.600098,-0.399902,0
11578470,671,266,1,4.398438,0.300049,0.199951,0.300049,0.500000,0.099976,-0.199951,...,0.099976,0.099976,-0.099976,-0.199951,-0.399902,-0.099976,-0.500000,-0.600098,-0.099976,3


In [None]:
# Split the frame by column-name prefix: every column whose name starts with
# "feature_" is a model input, every column starting with "responder_" is a
# candidate target.
features = train_data.filter(regex='^feature_')
responders = train_data.filter(regex='^responder_')


In [None]:
# Model inputs as a plain NumPy array; the target is the single column
# responder_6.
X = features.values
y = responders['responder_6'].values

# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): rows carry date_id/time_id, and a shuffled split mixes time
# periods between train and test — confirm this is intended for time-series
# modelling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 50)

# Fit the min-max scaling on the training rows only, then apply the same
# fitted scaling to the test rows so no test statistics leak into training.
# NOTE(review): the preview output shows NaN in some feature columns; verify
# that whatever consumes X_train/X_test handles missing values.
Scalar = MinMaxScaler()
X_train = Scalar.fit_transform(X_train)
X_test = Scalar.transform(X_test)