# class DataProcessor
# load_and_preprocess_data

Сохранить на Google Диске. Вы можете сохранить файл Python на Google Диске, а затем загружать его оттуда при необходимости.

Чтобы сохранить на Google Диск:

In [None]:
%%writefile /content/drive/MyDrive/BINANCE/data_processor.py

Writing /content/drive/MyDrive/BINANCE/data_processor.py


In [50]:
# preprocess_data_1

from sklearn import preprocessing
import pandas as pd
import logging
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import MinMaxScaler
import os

from google.colab import drive

# Configure logging: set the root logger up at INFO level and create a
# module-level logger (named after this module) used by the loader functions.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_and_preprocess_data(file_path):
    """Read a CSV file into a DataFrame and remove its 'SYMBOL' column.

    Args:
        file_path: Path of the file to load; it must end with '.csv'.

    Returns:
        The loaded DataFrame without the 'SYMBOL' column (the column is
        simply skipped when absent), or None when the extension is not
        '.csv' or the file does not exist. Both failures are logged.
    """
    # Guard clause: anything that is not a CSV is rejected up front.
    if not file_path.endswith('.csv'):
        logger.error(f"Unsupported file format: {file_path}")
        return None

    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        logger.error(f"File {file_path} not found.")
        return None

    # errors='ignore' keeps this a no-op for files without a SYMBOL column.
    frame.drop(columns=['SYMBOL'], errors='ignore', inplace=True)
    return frame

def test_load_and_preprocess_data_v1():
    """Smoke test: the 1H trade table on Google Drive must load successfully."""
    file_path = '/content/drive/MyDrive/BINANCE/1H/TABLE_10_10_23_1H.csv'
    # A None result is the loader's failure signal, so only that is checked.
    assert load_and_preprocess_data(file_path) is not None, f"Failed to load data from {file_path}"

def test_load_and_preprocess_data_v2():
    """Smoke test: loading the 1H trade table must not yield None.

    Performs the same check as ``test_load_and_preprocess_data_v1``.
    """
    file_path = '/content/drive/MyDrive/BINANCE/1H/TABLE_10_10_23_1H.csv'
    loaded_ok = load_and_preprocess_data(file_path) is not None
    assert loaded_ok, f"Failed to load data from {file_path}"

# Quick look at the raw table. The loader returns None when the file is
# missing or not a CSV, so guard before touching the DataFrame.
df = load_and_preprocess_data('/content/drive/MyDrive/BINANCE/1H/TABLE_10_10_23_1H.csv')
if df is not None:
    print(df.head())  # Print the first few rows of the DataFrame

import matplotlib.pyplot as plt
import pdb
# Reload the table for plotting (originally a separate notebook cell).
df = load_and_preprocess_data('/content/drive/MyDrive/BINANCE/1H/TABLE_10_10_23_1H.csv')
# Histogram plotting is currently disabled, so plt.show() has nothing to draw.
#df.hist()
plt.show()

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    """Read a CSV file into a DataFrame without altering its columns.

    This redefines the earlier function of the same name in this file;
    unlike that version, no column is dropped here.

    Args:
        file_path: Path of the file to load; it must end with '.csv'.

    Returns:
        The loaded DataFrame, or None when the extension is not '.csv' or
        the file does not exist. Both failures are logged.
    """
    is_csv = file_path.endswith('.csv')
    if not is_csv:
        logger.error(f"Unsupported file format: {file_path}")
        return None

    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        logger.error(f"File {file_path} not found.")
        return None

class DataProcessor:
    """Load a trade CSV, derive time features, optionally scale, and save results.

    The pipeline (see ``process``) reads ``file_path``, drops the
    'EVENT_TYPE' and 'SYMBOL' columns, expands 'TRADE_TIME' into separate
    date/time component columns, forward-fills missing values, encodes
    'IS_BUYER_MARKET_MAKER' as 1/0, min-max scales 'PRICE' and 'QUANTITY',
    pickles the fitted scaler to ``scaler_path`` and writes the processed
    frame to CSV.

    Failures are reported through the logger rather than raised, except where
    noted on individual methods.
    """

    # Default destination of the processed CSV written by ``save_dataframe``.
    DEFAULT_OUTPUT_PATH = "/content/drive/MyDrive/BINANCE/1H/1h_processed_data.csv"

    def __init__(self, file_path: str, scaler_path: str):
        """Store the input/output paths; no I/O happens until ``read_data``.

        Args:
            file_path: Path of the CSV file to read.
            scaler_path: Path where the fitted scaler is pickled.
        """
        self.file_path = file_path
        self.scaler_path = scaler_path
        self.df = None  # populated by read_data()
        self.logger = self.configure_logging()

    @staticmethod
    def configure_logging():
        """Request DEBUG-level root logging and return this module's logger.

        Note: ``logging.basicConfig`` has no effect when the root logger has
        already been configured earlier in the session.
        """
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        return logging.getLogger(__name__)

    def read_data(self):
        """Read ``self.file_path`` into ``self.df``.

        The 'EVENT_TYPE' and 'SYMBOL' columns are dropped when present. If the
        file is missing or reading fails, the error is logged and ``self.df``
        keeps its previous value.
        """
        if not os.path.exists(self.file_path):
            self.logger.error(f"File {self.file_path} not found.")
            return

        try:
            self.df = pd.read_csv(self.file_path)
            if 'EVENT_TYPE' in self.df.columns:
                self.df.drop(columns=['EVENT_TYPE'], inplace=True)
            if 'SYMBOL' in self.df.columns:
                self.df.drop(columns=['SYMBOL'], inplace=True)
            self.logger.info(f'Successfully read data from {self.file_path}')
        except Exception as e:
            self.logger.error(f"Unexpected error while reading data from {self.file_path}: {e}", exc_info=True)

    @staticmethod
    def handle_missing_values(df):
        """Forward-fill missing values in place and return the same dataframe."""
        df.ffill(inplace=True)
        return df

    @staticmethod
    def convert_to_datetime(df, columns):
        """Convert each of the given columns to datetime in place; return ``df``."""
        for col in columns:
            df[col] = pd.to_datetime(df[col])
        return df

    @staticmethod
    def scale_columns(df, columns):
        """Min-max scale the given columns in place.

        Returns:
            A ``(df, scaler)`` tuple, where ``scaler`` is the fitted
            ``MinMaxScaler`` (needed later to invert the scaling).
        """
        scaler = MinMaxScaler()
        df[columns] = scaler.fit_transform(df[columns])
        return df, scaler

    def preprocess(self, scale_data=False):
        """Preprocess ``self.df`` in place.

        Args:
            scale_data: When True, min-max scale 'PRICE' and 'QUANTITY'.

        Returns:
            The fitted scaler when ``scale_data`` is True and preprocessing
            succeeded; otherwise None (also when the dataframe is missing or
            has no 'TRADE_TIME' column, both of which are logged).
        """
        # Failure paths return a plain None so that callers can test the
        # result directly; a (None, None) tuple would be truthy and would be
        # mistaken for a scaler.
        if self.df is None:
            self.logger.error("Dataframe is None in preprocess method.")
            return None

        if 'TRADE_TIME' not in self.df.columns:
            self.logger.error("'TRADE_TIME' column not found in the dataframe.")
            return None

        # Convert TRADE_TIME column to datetime format (e.g. "10.10.23 11:37:37")
        self.df['TRADE_TIME'] = pd.to_datetime(self.df['TRADE_TIME'], format='%d.%m.%y %H:%M:%S')

        # Extract datetime components into their own columns
        trade_time = self.df['TRADE_TIME'].dt
        self.df['TRADE_YEAR'] = trade_time.year
        self.df['TRADE_MONTH'] = trade_time.month
        self.df['TRADE_DAY'] = trade_time.day
        self.df['TRADE_HOUR'] = trade_time.hour
        self.df['TRADE_MINUTE'] = trade_time.minute
        self.df['TRADE_SECOND'] = trade_time.second

        # Continue with other preprocessing steps
        self.df = self.handle_missing_values(self.df)

        # Encode the Y/N flag as 1/0; skipped (with a warning) when the input
        # does not carry the column instead of failing with a KeyError.
        if 'IS_BUYER_MARKET_MAKER' in self.df.columns:
            self.df['IS_BUYER_MARKET_MAKER'] = self.df['IS_BUYER_MARKET_MAKER'].map({'Y': 1, 'N': 0}, na_action='ignore')
        else:
            self.logger.warning("'IS_BUYER_MARKET_MAKER' column not found in the dataframe.")

        # Conditionally scale the PRICE and QUANTITY columns
        if scale_data:
            self.df, scaler = self.scale_columns(self.df, ["PRICE", "QUANTITY"])
        else:
            scaler = None

        # Drop 'EVENT_TIME' column (no-op when it is absent)
        self.df.drop(columns=["EVENT_TIME"], errors='ignore', inplace=True)

        return scaler

    def save_scaler(self, scaler):
        """Pickle ``scaler`` to ``self.scaler_path``, creating its directory.

        Errors while writing the file are logged, not raised.
        """
        directory = os.path.dirname(self.scaler_path)
        # dirname is '' for a bare file name; os.makedirs('') would raise, and
        # the current directory needs no creation anyway.
        if directory:
            os.makedirs(directory, exist_ok=True)

        try:
            with open(self.scaler_path, 'wb') as f:
                pickle.dump(scaler, f)
            self.logger.info(f"Scaler saved to {self.scaler_path}")
        except Exception as e:
            self.logger.error(f"Error saving scaler: {e}", exc_info=True)

    def process(self):
        """Run the full pipeline: read, preprocess with scaling, persist.

        Returns:
            The processed dataframe, or None when the input could not be read.
        """
        self.read_data()
        scaler = self.preprocess(scale_data=True)

        # Only persist a real scaler; preprocess() returns None on failure.
        if scaler is not None:
            self.save_scaler(scaler)
        self.show_dataframe()
        self.save_dataframe()
        return self.df

    def show_dataframe(self):
        """Display the dataframe, or log an error when none is loaded."""
        if self.df is None:
            self.logger.error("Dataframe is not available or hasn't been processed yet.")
            return

        # `display` is only defined inside notebook sessions (IPython provides
        # it); fall back to print so the class also works as a plain module.
        try:
            show = display
        except NameError:
            show = print
        show(self.df)

    def save_dataframe(self, output_path=None):
        """Write the dataframe to CSV without the index column.

        Args:
            output_path: Destination file; defaults to ``DEFAULT_OUTPUT_PATH``.

        Errors (including a missing dataframe) are logged, not raised.
        """
        if output_path is None:
            output_path = self.DEFAULT_OUTPUT_PATH

        if self.df is None:
            self.logger.error("Error saving dataframe: no dataframe has been loaded.")
            return

        try:
            self.df.to_csv(output_path, index=False)
            self.logger.info(f"Dataframe saved to {output_path}")
        except Exception as e:
            self.logger.error(f"Error saving dataframe: {e}", exc_info=True)


# Main execution
if __name__ == '__main__':
    # Mount Google Drive so the input CSV and the output locations are reachable.
    drive.mount('/content/drive')

    file_path = '/content/drive/MyDrive/BINANCE/1H/TABLE_10_10_23_1H.csv'
    scaler_path = '/content/drive/MyDrive/BINANCE/1H/scaler.pkl'

    processor = DataProcessor(file_path, scaler_path)
    processed_df = processor.process()

    # process() returns None when the input could not be read, and the scaler
    # file only exists if scaling succeeded; skip the round-trip check then
    # instead of crashing on open() or on indexing None.
    if processed_df is None or not os.path.exists(scaler_path):
        logger.error("Processing failed or no scaler was saved; skipping the inverse-transform check.")
    else:
        # Load the saved scaler.
        # NOTE: unpickling executes code from the file; only load scaler files
        # written by this script.
        with open(scaler_path, 'rb') as f:
            loaded_scaler = pickle.load(f)

        # processed_df holds the scaled 'PRICE' and 'QUANTITY' values
        scaled_values = processed_df[["PRICE", "QUANTITY"]].values

        # Undo the scaling to recover the original values
        original_values = loaded_scaler.inverse_transform(scaled_values)

        # Create a DataFrame from the original values
        df_original = pd.DataFrame(original_values, columns=["PRICE", "QUANTITY"])

        # Now df_original contains the original 'PRICE' and 'QUANTITY' values
        print(df_original.head())




  EVENT_TYPE         EVENT_TIME    TRADE_ID     PRICE  QUANTITY  \
0      trade  10.10.23 11:37:37  3234595905  27565.79   0.00036   
1      trade  10.10.23 11:37:37  3234595906  27565.80   0.00025   
2      trade  10.10.23 11:37:37  3234595907  27565.85   0.00049   
3      trade  10.10.23 11:37:37  3234595908  27565.86   0.00036   
4      trade  10.10.23 11:37:37  3234595909  27565.88   0.00033   

   BUYER_ORDER_ID  SELLER_ORDER_ID         TRADE_TIME IS_BUYER_MARKET_MAKER  
0     22680259990      22680258817  10.10.23 11:37:37                     N  
1     22680259990      22680258889  10.10.23 11:37:37                     N  
2     22680259990      22680259285  10.10.23 11:37:37                     N  
3     22680259990      22680258833  10.10.23 11:37:37                     N  
4     22680259990      22680258857  10.10.23 11:37:37                     N  
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,TRADE_ID,PRICE,QUANTITY,BUYER_ORDER_ID,SELLER_ORDER_ID,TRADE_TIME,IS_BUYER_MARKET_MAKER,TRADE_YEAR,TRADE_MONTH,TRADE_DAY,TRADE_HOUR,TRADE_MINUTE,TRADE_SECOND
0,3234595905,0.857830,0.000058,22680259990,22680258817,2023-10-10 11:37:37,0,2023,10,10,11,37,37
1,3234595906,0.857931,0.000040,22680259990,22680258889,2023-10-10 11:37:37,0,2023,10,10,11,37,37
2,3234595907,0.858432,0.000080,22680259990,22680259285,2023-10-10 11:37:37,0,2023,10,10,11,37,37
3,3234595908,0.858532,0.000058,22680259990,22680258833,2023-10-10 11:37:37,0,2023,10,10,11,37,37
4,3234595909,0.858733,0.000053,22680259990,22680258857,2023-10-10 11:37:37,0,2023,10,10,11,37,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47986,3234643764,0.074494,0.001283,22680777141,22680777760,2023-10-10 12:38:38,1,2023,10,10,12,38,38
47987,3234643765,0.074594,0.000260,22680777779,22680777210,2023-10-10 12:38:38,0,2023,10,10,12,38,38
47988,3234643766,0.074594,0.000308,22680777781,22680777210,2023-10-10 12:38:38,0,2023,10,10,12,38,38
47989,3234643767,0.074594,0.000283,22680777782,22680777210,2023-10-10 12:38:38,0,2023,10,10,12,38,38


      PRICE  QUANTITY
0  27565.79   0.00036
1  27565.80   0.00025
2  27565.85   0.00049
3  27565.86   0.00036
4  27565.88   0.00033


In [49]:
import os

# Report whether the processed CSV is present on the mounted drive.
file_path = '/content/drive/MyDrive/BINANCE/1H/1h_processed_data.csv'
if not os.path.exists(file_path):
    print(f"File {file_path} does not exist.")
else:
    print(f"File {file_path} exists.")



File /content/drive/MyDrive/BINANCE/1H/1h_processed_data.csv exists.
