In [None]:
import pandas as pd
import pyarrow.parquet as pq

# Input snapshots: one Parquet file with price data, one with the company overview.
price_file = "alpha_vantage_IBM_2025-01-26.parquet"
overview_file = "company_overview_IBM_2025-01-26.parquet"

# Load both files into DataFrames; later cells read and modify these two frames.
df_price = pd.read_parquet(path=price_file)
df_overview = pd.read_parquet(path=overview_file)

In [None]:
# Display data samples
# Each call prints a heading followed by the frame's head() on the next line.
print("Stock Price Data Sample:", df_price.head(), sep="\n")
print("\nCompany Overview Data Sample:", df_overview.head(), sep="\n")

In [None]:
# Inspect Parquet file metadata and schema
# Wrap each input path in a ParquetFile so its `.metadata` and `.schema`
# attributes can be printed below.
price_parquet = pq.ParquetFile(price_file)
overview_parquet = pq.ParquetFile(overview_file)

# Metadata for both files first ...
print("\nPrice File Metadata:", price_parquet.metadata, sep="\n")
print("\nCompany Overview Metadata:", overview_parquet.metadata, sep="\n")

# ... then the schema of each file.
print("\nPrice File Schema:", price_parquet.schema, sep="\n")
print("\nCompany Overview Schema:", overview_parquet.schema, sep="\n")

In [None]:
# Statistical Summary of Data
# describe() is called on each frame as loaded; the heading and the resulting
# table are printed on consecutive lines.
print("\nStock Price Data - Summary Statistics:", df_price.describe(), sep="\n")
print("\nCompany Overview Data - Summary Statistics:", df_overview.describe(), sep="\n")

In [None]:
# Check for missing values
# Per-column count of missing entries in each frame (isna() is the same check
# as isnull(); summing the boolean mask yields the count per column).
print("\nMissing values in Stock Price Data:", df_price.isna().sum(), sep="\n")
print("\nMissing values in Company Overview Data:", df_overview.isna().sum(), sep="\n")

In [None]:
# Convert data types for numeric columns
# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising, so bad entries surface as missing values below.
columns_to_convert = ['1. open', '2. high', '3. low', '4. close', '5. volume', 'SMA']
df_price[columns_to_convert] = df_price[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Handling missing values (forward-fill)
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated since pandas 2.1. Rebinding (rather than inplace=True) keeps the
# cell safe to re-run.
# NOTE(review): forward-fill propagates the previous row's value, so the result
# depends on row order, and NaNs in the first row stay NaN — confirm the frame
# is sorted the way you intend before relying on the filled values.
df_price = df_price.ffill()

# Save cleaned data
cleaned_price_file = "alpha_vantage_cleaned.parquet"
df_price.to_parquet(cleaned_price_file)

print(f"\nCleaned stock price data saved to {cleaned_price_file}")

In [None]:
# Convert company overview numeric fields
# Columns of the overview frame that are coerced to a numeric dtype.
numeric_columns = [
    "MarketCapitalization",
    "EBITDA",
    "PERatio",
    "PEGRatio",
    "BookValue",
    "DividendPerShare",
    "DividendYield",
    "EPS",
    "RevenueTTM",
    "GrossProfitTTM",
    "Beta",
    "52WeekHigh",
    "52WeekLow",
    "50DayMovingAverage",
    "200DayMovingAverage",
]

# Column-wise conversion; entries that cannot be parsed become NaN (errors='coerce').
df_overview[numeric_columns] = df_overview[numeric_columns].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [None]:
# Save cleaned company overview data
# Writes the current state of df_overview to a new Parquet file and reports the path.
cleaned_overview_file = "company_overview_cleaned.parquet"
df_overview.to_parquet(path=cleaned_overview_file)

print(f"\nCleaned company overview data saved to {cleaned_overview_file}")