In [2]:
def process_file(file_path):
    try:
        dataset = pd.read_csv(file_path)
        dataset = preprocess_data(dataset)
        dataset = calculate_features(dataset)
        dataset = dataset.sort_index(ascending=False)
        dataset = dataset.reset_index(drop=True)
        save_path = os.path.join(data_folder, os.path.basename(file_path))
        dataset.to_csv(save_path, index=False)
        print(f"Processed file: {file_path}")
    except Exception as e:
        print(f"Error processing file: {file_path}")
        print(str(e))
def preprocess_data(dataset):
    # Remove suffixes "K" from the "Объём" column
    dataset['Объём'] = dataset['Объём'].str.replace('M', '000000').str.replace('K', '000')

    # Remove "%" from the "Изм. %" column
    dataset['Изм. %'] = dataset['Изм. %'].str.replace('%', '')

    # Convert data types with thousands separators and possible decimal commas
    dataset['Дата'] = pd.to_datetime(dataset['Дата'], dayfirst=True)  # Pass dayfirst=True to silence the warning
    dataset['Цена'] = pd.to_numeric(dataset['Цена'].str.replace('.', '').str.replace(',', '.'))
    dataset['Откр.'] = pd.to_numeric(dataset['Откр.'].str.replace('.', '').str.replace(',', '.'))
    dataset['Макс.'] = pd.to_numeric(dataset['Макс.'].str.replace('.', '').str.replace(',', '.'))
    dataset['Мин.'] = pd.to_numeric(dataset['Мин.'].str.replace('.', '').str.replace(',', '.'))
    dataset['Объём'] = pd.to_numeric(dataset['Объём'].str.replace('.', '').str.replace(',', ''))
    dataset['Изм. %'] = pd.to_numeric(dataset['Изм. %'].str.replace(',', '.'))

    return dataset
def calculate_features(dataset):
    # Add additional date-related features
    dataset['dayofweek'] = dataset['Дата'].dt.dayofweek
    dataset['quarter'] = dataset['Дата'].dt.quarter
    dataset['month'] = dataset['Дата'].dt.month
    dataset['year'] = dataset['Дата'].dt.year
    dataset['dayofyear'] = dataset['Дата'].dt.dayofyear
    dataset['dayofmonth'] = dataset['Дата'].dt.day
    dataset['weekofyear'] = dataset['Дата'].dt.isocalendar().week

    # Calculate moving averages, RSI, and MACD
    dataset = calculate_moving_averages(dataset, window=20)
    dataset = calculate_rsi(dataset)
    dataset = calculate_macd(dataset)

    # Interpolate missing values for 'RSI' and 'SMA_20' columns
    dataset['RSI'].interpolate(inplace=True)
    dataset['SMA_20'].interpolate(inplace=True)

    return dataset

In [3]:
# Функция для расчета скользящих средних
def calculate_moving_averages(data, window):
    data['SMA_' + str(window)] = data['Цена'].rolling(window=window).mean()  # Простое скользящее среднее
    data['EMA_' + str(window)] = data['Цена'].ewm(span=window, adjust=False).mean()  # Экспоненциальное скользящее среднее
    return data
# Функция для расчета RSI
def calculate_rsi(data, window=14):
    delta = data['Цена'].diff(1)
    gain = (delta.where(delta > 0, 0)).fillna(0)
    loss = (-delta.where(delta < 0, 0)).fillna(0)
    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    data['RSI'] = rsi
    return data
# Функция для расчета MACD
def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data['Цена'].ewm(span=short_window, adjust=False).mean()
    long_ema = data['Цена'].ewm(span=long_window, adjust=False).mean()
    data['MACD'] = short_ema - long_ema
    data['Signal_Line'] = data['MACD'].ewm(span=signal_window, adjust=False).mean()
    return data

In [4]:
import pandas as pd
import os

from free_utils import (
    file_names,
    data_folder
)



  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [14]:
data_folder = 'data_updated'

for file_name in file_names:
    file_path = os.path.join('../data', file_name)
    process_file(file_path)

Error processing file: ../data\Прошлые данные - LKOH.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые данные - INGR.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые данные - LENT.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые данные - LSRG.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые данные - MVID.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые данные - NVTK.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые данные - OZONDR.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые данные - PIKK.csv
Cannot save file into a non-existent directory: 'data_updated'
Error processing file: ../data\Прошлые

In [12]:
file_path

'data\\Прошлые данные - GAZP (3).csv'

In [5]:
data_lkoh = pd.read_csv('../data_updated/Прошлые данные - LKOH.csv')
data_lkoh.head()

Unnamed: 0,Дата,Цена,Откр.,Макс.,Мин.,Объём,Изм. %,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,SMA_20,EMA_20,RSI,MACD,Signal_Line
0,2013-01-14,2020.0,2010.9,2021.4,2001.9,117000000,0.69,0,1,1,2013,14,14,3,2006.07,2004.281941,51.827243,5.544697,5.470472
1,2013-01-15,2006.0,2018.3,2018.6,2000.2,55997000,-0.69,1,1,1,2013,15,15,3,2004.785,2002.627408,49.582754,4.699511,5.451916
2,2013-01-16,2007.6,2006.5,2011.0,1990.5,70856000,0.08,2,1,1,2013,16,16,3,2003.99,2002.272398,47.187293,4.977845,5.640017
3,2013-01-17,2012.7,2013.3,2020.0,1996.0,94260000,0.25,3,1,1,2013,17,17,3,2004.12,2001.711598,52.541296,5.097786,5.80556
4,2013-01-18,2017.0,2016.0,2022.0,2012.4,65519000,0.21,4,1,1,2013,18,18,3,2003.735,2000.554924,55.273189,4.657452,5.982504


In [9]:
data_lkoh.iloc[-1]

Дата           2023-10-13
Цена               7244.0
Откр.              7143.0
Макс.              7250.0
Мин.               7136.0
Объём           168000000
Изм. %               1.74
dayofweek               4
quarter                 4
month                  10
year                 2023
dayofyear             286
dayofmonth             13
weekofyear             41
SMA_20                NaN
EMA_20             7244.0
RSI                   NaN
MACD                  0.0
Signal_Line           0.0
Name: 2693, dtype: object