In [17]:
import pandas as pd
import yfinance as yf
import pendulum
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
# Define the start and end dates for the loop
import datetime

In [5]:
# data = yf.download(
#   tickers=['AAPL'],
#   # start=start_time,
#   # end=end_time,
#   period='max',
#   interval='1d'
# )

# # Drop second level of column names
# data.columns = data.columns.droplevel(1)

# csv_file = "stock_data_alternative.csv"
# with open(csv_file, mode="w", newline="") as file:
# 		writer = csv.writer(file)
# 		# Write the header
# 		writer.writerow(["Date"] + list(data.columns))
# 		# Write the data
# 		for index, row in data.iterrows():
# 				writer.writerow([index] + list(row))


In [6]:
def clean_data(df):
    print("Initial data information:")
    print(df.info())

    # Remove unnecessary columns (if there are columns named 'Unnamed')
    df = df.loc[:, ~df.columns.get_level_values(1).str.contains('^Unnamed')]

    # Handle the 'timestamp' column (if present)
    if ('timestamp', '') in df.columns:
        # Convert to datetime format
        df[('timestamp', '')] = pd.to_datetime(df[('timestamp', '')], errors='coerce')  
        
        # Remove rows with missing timestamp values
        df = df.dropna(subset=[('timestamp', '')]) 

    # Fill missing values using forward fill and backward fill methods
    df = df.fillna(method='ffill').fillna(method='bfill')

    # Normalize 'close' values between 0 and 1 if present
    if ('close', '') in df.columns:
        df[('close', '')] = (df[('close', '')] - df[('close', '')].min()) / (df[('close', '')].max() - df[('close', '')].min())

    # Print out the data after cleaning
    print("Data after cleaning:")
    print(df.head())

    return df

In [12]:
def fillData(df):
  # Create a new DataFrame with a complete range of dates and times
  complete_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='1min')
  complete_data = pd.DataFrame(index=complete_index)

  # Reindex the 'all_data' DataFrame with the complete index
  filledData = df.reindex(complete_data.index)

  # Reset the index to make 'Datetime' a column again
  filledData.reset_index(inplace=True)

  # Rename the 'Datetime' column to 'index'
  filledData.set_index('index', inplace=True)

  # Rename 'index' to 'Date' in the index
  filledData.index.name = 'Date'

  # replace nan values with mean of the before and after values for all columns
  filledData = filledData.fillna(df.mean())

  # Print the updated DataFrame
  return filledData


In [13]:
def UpdateData(filePath):
	df = pd.read_csv(filePath)
	# df = pd.read_csv('./stock_data/BTC_2010-2011.csv')

	# Convert the 'Date' column to the DataFrame index
	df['Date'] = pd.to_datetime(df['Date'])
	df.set_index('Date', inplace=True)

	# Get the end date of the data
	days_before = df.index[-1]
	days_before = days_before.date()

	# Get date time now and replace the time with 00:00:00
	current_date = datetime.date.today()

	# Create an empty DataFrame to store the data
	# all_data = pd.read_csv('stock_data/BTC_2010-2011.csv', index_col=0, header=[0, 1]).sort_index(axis=1)
	all_data = pd.DataFrame()

	# Loop through the range of dates
	for date in pd.date_range(days_before, current_date, freq='D'):
		# Define the start and end times for the data retrieval
		start_time = date.replace(hour=0, minute=0, second=0)
		end_time = date.replace(hour=23, minute=59, second=59)

		# Retrieve the data for the specified date range
		data = yf.download(
			tickers=['BTC-USD'],
			start=start_time,
			end=end_time,
			# period='1d',
			interval='1m'
		)

		# Flatten the column headers to remove the ticker
		if (isinstance(data.columns, pd.MultiIndex)):
				data.columns = [col[0] for col in data.columns]  # Retain only the first level

		# Append the data to the DataFrame
		all_data = pd.concat([all_data, data])
		
	# Update the `all_data` to include the new data
	new_data = pd.concat([df, all_data])

	# Remove any duplicate rows
	new_data = new_data[~new_data.index.duplicated(keep='first')]

	return fillData(new_data)

In [24]:
def splitData():
    data = pd.read_csv('./stock_data/BTC_2010-2011.csv')

    # Chuyển đổi cột 'Date' sang datetime và đặt làm index
    data['Date'] = pd.to_datetime(data['Date'])
    data.set_index('Date', inplace=True)

    print("Các cột trong dữ liệu:", data.columns)

    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data)

    print("Các cột trong dữ liệu:", data.columns)

    print(data.describe())
    print("Số lượng giá trị NaN trong mỗi cột:")
    print(data.isnull().sum())
    print("Kiểm tra giá trị inf/-inf:")
    print(data[data.isin([np.inf, -np.inf]).any(axis=1)])

    # Loại bỏ các hàng có giá trị NaN hoặc inf
    data = data.dropna()  # Xóa hàng chứa NaN
    data = data.replace([np.inf, -np.inf], np.nan).dropna()  # Loại bỏ inf/-inf

    # scaler = MinMaxScaler()
    # scaled_data = scaler.fit_transform(data)

    scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
    # scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

    # Số bước thời gian
    timesteps = 100

    X = []
    y = []

    close_index = data.columns.get_loc('Close')

    for i in range(timesteps, len(scaled_data)):
        X.append(scaled_data.iloc[i - timesteps:i].values)
        y.append(scaled_data.iloc[i, close_index])

    X = np.array(X)
    y = np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test

In [26]:
def main():

  new_data = UpdateData('./stock_data/BTC_2010-2011.csv')

  # Save the updated data to a CSV file
  new_data.to_csv('./stock_data/BTC_2010-2011.csv')

  return splitData()

In [27]:
main()

[*********************100%***********************]  1 of 1 completed


Các cột trong dữ liệu: Index(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')
Các cột trong dữ liệu: Index(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')
           Adj Close          Close           High            Low  \
count   71455.000000   71455.000000   71455.000000   71455.000000   
mean    86562.813003   86562.813003   86562.813630   86562.813003   
std     12159.049996   12159.049996   12159.050501   12159.049996   
min     65198.042969   65198.042969   65198.042969   65198.042969   
25%     72298.496094   72298.496094   72298.496094   72298.496094   
50%     96477.439955   96477.439955   96477.441262   96477.439955   
75%     96477.439955   96477.439955   96477.441262   96477.439955   
max    101352.539062  101352.539062  101352.539062  101352.539062   

                Open        Volume  
count   71455.000000  7.145500e+04  
mean    86562.813630  1.870927e+07  
std     12159.050501  1.085916e+08  
min     65198.042969  0.00

(array([[[7.02993155e-01, 7.02993155e-01, 7.02993155e-01, 7.02993155e-01,
          7.02993155e-01, 1.06577868e-03],
         [7.02732555e-01, 7.02732555e-01, 7.02732555e-01, 7.02732555e-01,
          7.02732555e-01, 8.13801593e-06],
         [8.65159257e-01, 8.65159257e-01, 8.65159293e-01, 8.65159257e-01,
          8.65159293e-01, 7.46777222e-04],
         ...,
         [7.19216933e-01, 7.19216933e-01, 7.19216933e-01, 7.19216933e-01,
          7.19216933e-01, 1.83105358e-04],
         [7.18279549e-01, 7.18279549e-01, 7.18279549e-01, 7.18279549e-01,
          7.18279549e-01, 4.82252796e-05],
         [8.65159257e-01, 8.65159257e-01, 8.65159293e-01, 8.65159257e-01,
          8.65159293e-01, 7.46777222e-04]],
 
        [[8.65159257e-01, 8.65159257e-01, 8.65159293e-01, 8.65159257e-01,
          8.65159293e-01, 7.46777222e-04],
         [8.65159257e-01, 8.65159257e-01, 8.65159293e-01, 8.65159257e-01,
          8.65159293e-01, 7.46777222e-04],
         [8.65159257e-01, 8.65159257e-01, 8.651