In [36]:
import yfinance as yf
import pendulum
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
# Define the start and end dates for the loop
import datetime


In [37]:
# data = yf.download(
#   tickers=['AAPL'],
#   # start=start_time,
#   # end=end_time,
#   period='max',
#   interval='1d'
# )

# # Drop second level of column names
# data.columns = data.columns.droplevel(1)

# csv_file = "stock_data_alternative.csv"
# with open(csv_file, mode="w", newline="") as file:
# 		writer = csv.writer(file)
# 		# Write the header
# 		writer.writerow(["Date"] + list(data.columns))
# 		# Write the data
# 		for index, row in data.iterrows():
# 				writer.writerow([index] + list(row))


In [38]:
def clean_data(df):
    print("Initial data information:")
    print(df.info())

    # Remove unnecessary columns (if there are columns named 'Unnamed')
    df = df.loc[:, ~df.columns.get_level_values(1).str.contains('^Unnamed')]

    # Handle the 'timestamp' column (if present)
    if ('timestamp', '') in df.columns:
        # Convert to datetime format
        df[('timestamp', '')] = pd.to_datetime(df[('timestamp', '')], errors='coerce')  
        
        # Remove rows with missing timestamp values
        df = df.dropna(subset=[('timestamp', '')]) 

    # Fill missing values using forward fill and backward fill methods
    df = df.fillna(method='ffill').fillna(method='bfill')

    # Normalize 'close' values between 0 and 1 if present
    if ('close', '') in df.columns:
        df[('close', '')] = (df[('close', '')] - df[('close', '')].min()) / (df[('close', '')].max() - df[('close', '')].min())

    # Print out the data after cleaning
    print("Data after cleaning:")
    print(df.head())

    return df

In [39]:
def fillData(df):
  # Create a new DataFrame with a complete range of dates and times
  complete_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='1min')
  complete_data = pd.DataFrame(index=complete_index)

  # Reindex the 'all_data' DataFrame with the complete index
  filledData = df.reindex(complete_data.index)

  # Reset the index to make 'Datetime' a column again
  filledData.reset_index(inplace=True)

  # Rename the 'Datetime' column to 'index'
  filledData.set_index('index', inplace=True)

  # Rename 'index' to 'Date' in the index
  filledData.index.name = 'Date'

  # Linear interpolation to fill small gaps
  filledData = filledData.interpolate(method='linear')

  # replace nan values with mean of the before and after values for all columns
  filledData = df.fillna(df.rolling(12, min_periods=1, center=True).mean())

  # Forward and backward fill for large gaps
  df = df.ffill().bfill()

  filledData.to_csv('filledData.csv')

  # for column in filledData.columns:
  #   filledData[column] = filledData[column].fillna((filledData[column].shift() + filledData[column].shift(-1))/2)

  # Print the updated DataFrame
  # return filledData


In [40]:
def UpdateData(filePath):
	df = pd.DataFrame()

	days_before = pd.Timestamp.now()

	if os.path.exists(filePath):
		df = pd.read_csv(filePath)
		# df = pd.read_csv('./stock_data/BTC_2010-2011.csv')

		# Convert the 'Date' column to the DataFrame index
		df['Date'] = pd.to_datetime(df['Date'])
		df.set_index('Date', inplace=True)
		days_before = df.index.max().date()


	# Get the end date of the data
	if days_before == pd.Timestamp.now():
		last_month = pd.Timestamp.now() - pd.DateOffset(months=1)
		days_before = last_month.date()

	# Get date time now and replace the time with 00:00:00
	current_date = datetime.date.today()

	# Create an empty DataFrame to store the data
	# all_data = pd.read_csv('stock_data/BTC_2010-2011.csv', index_col=0, header=[0, 1]).sort_index(axis=1)
	all_data = pd.DataFrame()

	print(days_before)
	print(current_date)

	# Loop through the range of dates
	for date in pd.date_range(days_before, current_date, freq='D'):
		# Define the start and end times for the data retrieval
		start_time = date.replace(hour=0, minute=0, second=0)
		end_time = date.replace(hour=23, minute=59, second=59)

		# Retrieve the data for the specified date range
		data = yf.download(
			tickers=['AAPL'],
			start=start_time,
			end=end_time,
			# period='1d',
			interval='1m'
		)

		# Flatten the column headers to remove the ticker
		if (isinstance(data.columns, pd.MultiIndex)):
				data.columns = [col[0] for col in data.columns]  # Retain only the first level

		# Append the data to the DataFrame
		all_data = pd.concat([all_data, data])
		
	# Update the `all_data` to include the new data
	new_data = pd.concat([df, all_data])

	# Remove any duplicate rows
	new_data = new_data[~new_data.index.duplicated(keep='first')]

	new_data.to_csv('filledData2.csv')
	print(new_data)

	return fillData(new_data)

In [41]:
def splitData():
    data = pd.read_csv('./stock_data/BTC_2010-2011.csv')

    # Chuyển đổi cột 'Date' sang datetime và đặt làm index
    data['Date'] = pd.to_datetime(data['Date'])
    data.set_index('Date', inplace=True)

    print("Các cột trong dữ liệu:", data.columns)

    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data)

    print("Các cột trong dữ liệu:", data.columns)

    print(data.describe())
    print("Số lượng giá trị NaN trong mỗi cột:")
    print(data.isnull().sum())
    print("Kiểm tra giá trị inf/-inf:")
    print(data[data.isin([np.inf, -np.inf]).any(axis=1)])

    # Loại bỏ các hàng có giá trị NaN hoặc inf
    data = data.dropna()  # Xóa hàng chứa NaN
    data = data.replace([np.inf, -np.inf], np.nan).dropna()  # Loại bỏ inf/-inf

    # scaler = MinMaxScaler()
    # scaled_data = scaler.fit_transform(data)

    scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
    # scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

    # Số bước thời gian
    timesteps = 100

    X = []
    y = []

    close_index = data.columns.get_loc('Close')

    for i in range(timesteps, len(scaled_data)):
        X.append(scaled_data.iloc[i - timesteps:i].values)
        y.append(scaled_data.iloc[i, close_index])

    X = np.array(X)
    y = np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test

In [42]:
# Function to download and format data for a single stock
def download_and_format_stock_data(ticker, start_date, end_date, retries=3):
    for i in range(retries):
        try:
            stock_data = yf.download(ticker, start=start_date, end=end_date)
            if not stock_data.empty:
                stock_data.reset_index(inplace=True)
                stock_data['Name'] = ticker
                stock_data = stock_data[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Name']]
                stock_data.columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'name']
                return stock_data
        except Exception as e:
            print(f"Attempt {i+1} failed for {ticker}: {e}")
    return pd.DataFrame()

In [43]:
def main():

  # new_data = UpdateData('./stock_data/AAPL.csv')
  UpdateData('./stock_data/AAPL.csv')

  # Save the updated data to a CSV file
  # new_data.to_csv('./stock_data/AAPL.csv')

  # return splitData()

In [44]:
main()

[*********************100%***********************]  1 of 1 completed

2024-12-17
2024-12-19



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1m 2024-12-19 00:00:00 -> 2024-12-19 23:59:59) (Yahoo error = "Data doesn\'t exist for startDate = 1734584400, endDate = 1734670799")')


                            Adj Close       Close        High         Low  \
2024-11-18 14:30:00+00:00  226.199997  226.199997  226.410004  225.179993   
2024-11-18 14:31:00+00:00  226.565994  226.565994  226.639999  226.020004   
2024-11-18 14:32:00+00:00  226.679993  226.679993  226.880005  226.430099   
2024-11-18 14:33:00+00:00  226.410004  226.410004  226.820007  226.110001   
2024-11-18 14:34:00+00:00  226.490005  226.490005  226.550003  226.240005   
...                               ...         ...         ...         ...   
2024-12-18 16:56:00+00:00  253.539902  253.539902  253.539993  253.460007   
2024-12-18 16:57:00+00:00  253.570007  253.570007  253.570007  253.509995   
2024-12-18 16:58:00+00:00  253.530106  253.530106  253.585007  253.520004   
2024-12-18 16:59:00+00:00  253.430893  253.430893  253.544998  253.430099   
2024-12-18 17:00:00+00:00  253.369995  253.369995  253.369995  253.369995   

                                 Open     Volume  
2024-11-18 14:30:00+00:0

In [45]:

df = pd.read_csv('filledData.csv')

# Remove nan values
df = df.dropna()

# Set first column as index
df.set_index(df.columns[0], inplace=True)


# Reset the index to make 'Datetime' a column again
df.reset_index(inplace=True)

# Rename the 'Datetime' column to 'index'
df.set_index('index', inplace=True)

# Rename 'index' to 'Date' in the index
df.index.name = 'Date'

df.to_csv('filledData1.csv')

KeyError: "None of ['index'] are in the columns"