In [27]:
# Dependencies and setup
import pandas as pd
import os

# csv paths
kaggle_path = "Resources/kaggle_clean.csv"
dow_path = "Resources/dow_clean.csv"
indu_path = "Resources/indu_clean.csv"
spy_path = "Resources/spy_clean.csv"

# Read the csv files into dataframes - NEW update to to avoid overwriting the file path variables
kaggle_df = pd.read_csv(kaggle_path)
dow_df = pd.read_csv(dow_path)
indu_df = pd.read_csv(indu_path)
spy_df = pd.read_csv(spy_path)

In [29]:
# Create list of DataFrames
all_csvs = [kaggle_df, dow_df, indu_df, spy_df]

# Process each DataFrame
for df in all_csvs:   
    columns = df.columns.tolist()
    columns = ['symbol'] + [col for col in columns if col != 'symbol']
    df = df.sort_values(by="date", ascending=True)
    df = df[columns]
    df = df.round(2)
    print(f"{df.head()}")



      symbol        date   open   high    low  close    volume
0       NVDA  2014-06-02   4.76   4.80   4.70   4.74  18150000
10056   PANW  2014-06-02  25.04  25.06  24.00  24.56   6110100
12571    NOW  2014-06-02  52.33  52.34  49.46  50.26   3069100
2515    META  2014-06-02  63.23  63.59  62.05  63.08  35996000
15086    AMD  2014-06-02   4.03   4.03   3.94   3.97  10859500
     symbol        date   open   high    low  close   volume
1309    DOW  2019-03-20  52.75  53.50  49.50  49.80  2350838
1308    DOW  2019-03-21  49.99  50.00  48.20  48.98  1764671
1307    DOW  2019-03-22  48.80  49.95  48.16  48.60   844690
1306    DOW  2019-03-25  48.60  49.40  48.00  49.15   440892
1305    DOW  2019-03-26  49.00  49.75  48.18  48.85   504734
     symbol        date  open  high  low  close  volume
1704   INDU  2017-08-21  9.81  9.81  9.7   9.70   52682
1703   INDU  2017-08-22  9.70  9.70  9.7   9.70   27295
1702   INDU  2017-08-23  9.70  9.70  9.7   9.70       5
1701   INDU  2017-08-24  9.70  9

In [30]:
# Combine kaggle data and index csvs into one csv
master_df = pd.concat(all_csvs, ignore_index=True)
master_df


Unnamed: 0,symbol,date,open,high,low,close,volume
0,NVDA,2014-06-02,4.7650,4.795,4.6950,4.7350,18150000
1,NVDA,2014-06-03,4.7450,4.750,4.6525,4.7150,24321200
2,NVDA,2014-06-04,4.7025,4.755,4.7000,4.7200,16123600
3,NVDA,2014-06-05,4.7275,4.755,4.7000,4.7400,18869600
4,NVDA,2014-06-06,4.7625,4.800,4.7450,4.7575,16105200
...,...,...,...,...,...,...,...
26483,SPY,2017-08-28,245.1700,245.200,244.0900,244.5700,40565606
26484,SPY,2017-08-25,244.9000,245.610,244.3900,244.5600,64445912
26485,SPY,2017-08-24,245.0000,245.180,243.7500,243.9900,50741671
26486,SPY,2017-08-23,244.3300,245.050,244.1600,244.5600,50203837


In [31]:
#define path to Resources folder
resources_path = "Resources"

file_path = os.path.join(resources_path, "master.csv")
master_df.to_csv(file_path, index=False, header=True)

In [32]:
#reading df svc file
master_df = pd.read_csv(file_path)

In [34]:
# to ensure that we compare the stok for the same period, we set up data frame based on data periods avaiailble for free actross all stocks
# Convert 'date' column to datetime type to ensure correct filtering
master_df['date'] = pd.to_datetime(master_df['date'])

# Define the date range
start_date = '2019-03-20'
end_date = '2024-05-28'

# Filter the DataFrame based on the date range
filtered_df = master_df[(master_df['date'] >= start_date) & (master_df['date'] <= end_date)]

# Set 'date' column as the index
filtered_df.set_index('date', inplace=True)

# Display the first few rows to check the result
print(filtered_df.head())



           symbol       open       high        low      close    volume
date                                                                   
2019-03-20   NVDA  44.220001  44.757500  43.250000  43.599998  71914800
2019-03-21   NVDA  43.832500  46.250000  43.782501  45.985001  82431200
2019-03-22   NVDA  45.709999  46.200001  44.235001  44.375000  74764400
2019-03-25   NVDA  43.965000  44.612499  42.777500  43.445000  52521200
2019-03-26   NVDA  44.872501  45.437500  43.650002  44.217499  70350800


In [36]:
# Save the filtered DataFrame to a new CSV file if needed
output_path = os.path.join(resources_path, "filtered_master.csv")
filtered_df.to_csv(output_path, index=True, header=True)

In [37]:
#reading df svc file
df = pd.read_csv(output_path)

In [38]:


# Convert 'date' column to datetime type to be on the safe side as we merge several data sets
df['date'] = pd.to_datetime(df['date'])

# Sort DataFrame by 'symbol' and 'date' to ensure our merged (updated) data is sorted.
df = df.sort_values(by=['symbol', 'date'], ascending=[True, True])

# Calculate the daily return for each symbol
# The daily return is calculated as: (current day's close - current day's open) / previous day's close
df['daily_return'] = df.groupby('symbol').apply(lambda x: (x['close'] - x['open']) / x['close'].shift(1)).reset_index(level=0, drop=True)

# df['daily_return'] = df['daily_return'].shift() - this cid was commented as the code worked correctly without it and incorrectly with it

# Format the daily return as percentage with 13 decimals to match price format
df['daily_return'] = df['daily_return'].apply(lambda x: f"{x:.13%}" if pd.notnull(x) else None)

# Display the first few rows to check the result
print(df.head(10))

# Save the result to a new CSV file
df.to_csv('Resources/daily_returns_master.csv', index=False)

            date symbol        open        high         low       close  \
11237 2020-12-09     AI  100.000000  115.000000   90.029999   92.489998   
11238 2020-12-10     AI   99.480003  131.746002   96.000000  130.000000   
11239 2020-12-11     AI  123.820000  133.000000  116.000000  119.580002   
11240 2020-12-14     AI  122.160004  123.830002  100.660004  102.360001   
11241 2020-12-15     AI  104.570000  109.889999   98.099998  102.000000   
11242 2020-12-16     AI  105.000000  116.709999  104.000000  113.690002   
11243 2020-12-17     AI  118.440002  121.000000  115.449997  117.239998   
11244 2020-12-18     AI  116.669998  145.000000  115.089996  137.589996   
11245 2020-12-21     AI  142.014008  168.770004  141.000000  160.889999   
11246 2020-12-22     AI  170.529999  179.000000  163.300003  177.470001   

         volume        daily_return  
11237  24805600                None  
11238  22839500   32.9981590961003%  
11239  11758600   -3.2615368182843%  
11240   8197500  -16.5