In [12]:
import pandas as pd

# Load the data again
data = pd.read_csv('data1/Meta_Platforms_Stock_Price_History.csv')

# Print the first few rows of the dataset to inspect the 'Date' column
print(data.head())

# Print column names to verify if 'Date' is present
print(data.columns)


         Date   Price    Open    High     Low    Vol. Change %
0  01/24/2025  647.49  636.40  652.00  634.20  19.07M    1.73%
1  01/23/2025  636.45  627.36  636.60  621.15   9.91M    2.08%
2  01/22/2025  623.50  623.30  633.70  619.43  12.27M    1.14%
3  01/21/2025  616.46  617.83  621.50  609.01  11.67M    0.60%
4  01/17/2025  612.77  624.05  624.97  603.67  17.28M    0.24%
Index(['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %'], dtype='object')


In [13]:
# Strip extra spaces from column names
data.columns = data.columns.str.strip()

# Print the column names again to confirm
print(data.columns)


Index(['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %'], dtype='object')


In [14]:
# Convert 'Date' column to datetime
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')

# Print the first few rows to check the result
print(data.head())


        Date   Price    Open    High     Low    Vol. Change %
0 2025-01-24  647.49  636.40  652.00  634.20  19.07M    1.73%
1 2025-01-23  636.45  627.36  636.60  621.15   9.91M    2.08%
2 2025-01-22  623.50  623.30  633.70  619.43  12.27M    1.14%
3 2025-01-21  616.46  617.83  621.50  609.01  11.67M    0.60%
4 2025-01-17  612.77  624.05  624.97  603.67  17.28M    0.24%


In [15]:
# Set 'Date' as the index
data.set_index('Date', inplace=True)

# Print the first few rows to confirm
print(data.head())


             Price    Open    High     Low    Vol. Change %
Date                                                       
2025-01-24  647.49  636.40  652.00  634.20  19.07M    1.73%
2025-01-23  636.45  627.36  636.60  621.15   9.91M    2.08%
2025-01-22  623.50  623.30  633.70  619.43  12.27M    1.14%
2025-01-21  616.46  617.83  621.50  609.01  11.67M    0.60%
2025-01-17  612.77  624.05  624.97  603.67  17.28M    0.24%


In [16]:
# Remove 'M' and 'K' from 'Vol.' and convert to numeric
data['Vol.'] = data['Vol.'].replace({'M': '*1e6', 'K': '*1e3'}, regex=True).map(pd.eval).astype(float)

# Remove '%' from 'Change %' and convert to numeric
data['Change %'] = data['Change %'].str.rstrip('%').astype('float') / 100.0

# Print the cleaned data
print(data.head())


             Price    Open    High     Low        Vol.  Change %
Date                                                            
2025-01-24  647.49  636.40  652.00  634.20  19070000.0    0.0173
2025-01-23  636.45  627.36  636.60  621.15   9910000.0    0.0208
2025-01-22  623.50  623.30  633.70  619.43  12270000.0    0.0114
2025-01-21  616.46  617.83  621.50  609.01  11670000.0    0.0060
2025-01-17  612.77  624.05  624.97  603.67  17280000.0    0.0024


In [17]:
# Final check of the preprocessed data
print(data.head())


             Price    Open    High     Low        Vol.  Change %
Date                                                            
2025-01-24  647.49  636.40  652.00  634.20  19070000.0    0.0173
2025-01-23  636.45  627.36  636.60  621.15   9910000.0    0.0208
2025-01-22  623.50  623.30  633.70  619.43  12270000.0    0.0114
2025-01-21  616.46  617.83  621.50  609.01  11670000.0    0.0060
2025-01-17  612.77  624.05  624.97  603.67  17280000.0    0.0024


In [18]:
# Load the data again
data = pd.read_csv('data1/Meta_Platforms_Stock_Price_History.csv')

# Print the first few rows of the dataset to inspect the 'Date' column
print(data.head())

# Print column names to verify if 'Date' is present
print(data.columns)


         Date   Price    Open    High     Low    Vol. Change %
0  01/24/2025  647.49  636.40  652.00  634.20  19.07M    1.73%
1  01/23/2025  636.45  627.36  636.60  621.15   9.91M    2.08%
2  01/22/2025  623.50  623.30  633.70  619.43  12.27M    1.14%
3  01/21/2025  616.46  617.83  621.50  609.01  11.67M    0.60%
4  01/17/2025  612.77  624.05  624.97  603.67  17.28M    0.24%
Index(['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %'], dtype='object')


In [19]:
# Load the data again
data = pd.read_csv('data1/Meta_Platforms_Stock_Price_History.csv')

# Print the first few rows of the dataset to inspect the 'Date' column
print(data.head())

# Print column names to verify if 'Date' is present
print(data.columns)


         Date   Price    Open    High     Low    Vol. Change %
0  01/24/2025  647.49  636.40  652.00  634.20  19.07M    1.73%
1  01/23/2025  636.45  627.36  636.60  621.15   9.91M    2.08%
2  01/22/2025  623.50  623.30  633.70  619.43  12.27M    1.14%
3  01/21/2025  616.46  617.83  621.50  609.01  11.67M    0.60%
4  01/17/2025  612.77  624.05  624.97  603.67  17.28M    0.24%
Index(['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %'], dtype='object')


In [20]:
# Set 'Date' as the index
data.set_index('Date', inplace=True)

# Print the first few rows to confirm
print(data.head())


             Price    Open    High     Low    Vol. Change %
Date                                                       
01/24/2025  647.49  636.40  652.00  634.20  19.07M    1.73%
01/23/2025  636.45  627.36  636.60  621.15   9.91M    2.08%
01/22/2025  623.50  623.30  633.70  619.43  12.27M    1.14%
01/21/2025  616.46  617.83  621.50  609.01  11.67M    0.60%
01/17/2025  612.77  624.05  624.97  603.67  17.28M    0.24%


In [21]:
# Remove 'M' and 'K' from 'Vol.' and convert to numeric
data['Vol.'] = data['Vol.'].replace({'M': '*1e6', 'K': '*1e3'}, regex=True).map(pd.eval).astype(float)

# Remove '%' from 'Change %' and convert to numeric
data['Change %'] = data['Change %'].str.rstrip('%').astype('float') / 100.0

# Print the cleaned data
print(data.head())


             Price    Open    High     Low        Vol.  Change %
Date                                                            
01/24/2025  647.49  636.40  652.00  634.20  19070000.0    0.0173
01/23/2025  636.45  627.36  636.60  621.15   9910000.0    0.0208
01/22/2025  623.50  623.30  633.70  619.43  12270000.0    0.0114
01/21/2025  616.46  617.83  621.50  609.01  11670000.0    0.0060
01/17/2025  612.77  624.05  624.97  603.67  17280000.0    0.0024


In [22]:
# Final check of the preprocessed data
print(data.head())


             Price    Open    High     Low        Vol.  Change %
Date                                                            
01/24/2025  647.49  636.40  652.00  634.20  19070000.0    0.0173
01/23/2025  636.45  627.36  636.60  621.15   9910000.0    0.0208
01/22/2025  623.50  623.30  633.70  619.43  12270000.0    0.0114
01/21/2025  616.46  617.83  621.50  609.01  11670000.0    0.0060
01/17/2025  612.77  624.05  624.97  603.67  17280000.0    0.0024


In [23]:
# Fill missing values with forward fill (use previous valid value)
data.fillna(method='ffill', inplace=True)

# OR Drop rows with any missing values
# data.dropna(inplace=True)

# Check again to confirm there are no missing values
print(data.isnull().sum())


Price       0
Open        0
High        0
Low         0
Vol.        0
Change %    0
dtype: int64


  data.fillna(method='ffill', inplace=True)


In [24]:
# Check for duplicate rows
print(f"Duplicate rows: {data.duplicated().sum()}")

# Drop duplicates
data.drop_duplicates(inplace=True)

# Confirm if duplicates are removed
print(f"Duplicate rows: {data.duplicated().sum()}")


Duplicate rows: 0
Duplicate rows: 0


In [25]:
# Save the preprocessed data as a Parquet file
data.to_parquet('preprocessed_data.parquet', index=False)

# Confirm the file is saved
print("File saved as 'preprocessed_data.parquet'.")


File saved as 'preprocessed_data.parquet'.


In [26]:
# Save the preprocessed data as a CSV file
data.to_csv('preprocessed_data.csv', index=False)

# Confirm the file is saved
print("File saved as 'preprocessed_data.csv'.")


File saved as 'preprocessed_data.csv'.
