In [1]:
# ==============================
# 01_data_prep.py
# ==============================

import pandas as pd
import numpy as np
import os

# ------------------------------
# 1️⃣ Load dataset
# ------------------------------
# Absolute location of the Fortune 1000 (2024) CSV on the author's machine.
file_path = r"C:\Users\japes\OneDrive\Desktop\japesh\data\fortune1000_2024.csv"
df = pd.read_csv(file_path)

# Report which columns were read (single print call, same two output lines).
print("Dataset loaded successfully! Columns:", df.columns.tolist(), sep="\n")

# Preview the first rows of the table (DataFrame.head() default).
print(df.head())

# ------------------------------
# 2️⃣ Select top N companies
# ------------------------------
# Size of the demo universe: how many leading rows of the CSV to keep.
N = 20
# Positional slice of the first N rows; .copy() detaches the result from df so
# columns added to selected_df later do not touch the full table.
selected_df = df.iloc[:N].copy()

# ------------------------------
# 2a️⃣ Pick correct MarketCap column
# ------------------------------
# Market-cap columns in order of preference: the updated figure wins and the
# March 28 snapshot is the fallback. The first one present in the CSV is used.
marketcap_col = next(
    (
        name
        for name in ("MarketCap_Updated_M", "MarketCap_March28_M")
        if name in df.columns
    ),
    None,
)
if marketcap_col is None:
    raise KeyError("No MarketCap column found in CSV.")

# Columns shown in the console summary of the selected companies.
summary_columns = ["Ticker", "Company", marketcap_col, "Revenues_M", "Profits_M"]
print(f"\nSelected top {N} companies:")
print(selected_df[summary_columns])

# ------------------------------
# 3️⃣ Prepare returns proxy
# ------------------------------
# Use Profits / MarketCap as a proxy for expected returns.
# A zero market cap is treated as missing so the ratio becomes NaN instead of
# +/-inf, which would otherwise leak into the saved CSV.
market_cap = selected_df[marketcap_col]
market_cap = market_cap.mask(market_cap == 0)
selected_df["ExpectedReturn"] = selected_df["Profits_M"] / market_cap

# Covariance matrix proxy: random small numbers for demo.
# Size it from the rows actually selected rather than from N: when the CSV has
# fewer than N rows the selection is shorter, and the matrix must line up
# one-to-one with selected_df.
n_assets = len(selected_df)

# Fixed seed so the saved matrix is identical from run to run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

A = rng.random((n_assets, n_assets)) * 0.05
cov_matrix = (A + A.T) / 2   # make symmetric
np.fill_diagonal(cov_matrix, 0.1)  # diagonal = variance

# Symmetry plus a fixed diagonal does not make a valid covariance matrix: it
# must also be positive semi-definite. If the random draw produced a negative
# eigenvalue, shift the diagonal just enough to lift the whole spectrum to a
# small positive floor (off-diagonal entries are left unchanged).
if n_assets:
    min_eig = np.linalg.eigvalsh(cov_matrix).min()
    if min_eig < 0:
        cov_matrix[np.diag_indices_from(cov_matrix)] += 1e-8 - min_eig

# ------------------------------
# 4️⃣ Save prepared data in 01_result folder
# ------------------------------
# Destination directory for this step's artifacts; created if it is missing.
output_folder = r"C:\Users\japes\OneDrive\Desktop\japesh\01_result"
os.makedirs(output_folder, exist_ok=True)

# Build each destination path once and reuse it for writing and reporting.
csv_path = os.path.join(output_folder, "prepared_portfolio_data.csv")
cov_path = os.path.join(output_folder, "cov_matrix.npy")

selected_df.to_csv(csv_path, index=False)  # table without the index column
np.save(cov_path, cov_matrix)              # covariance proxy as a NumPy binary

print("\nPrepared data saved to:")
for saved_path in (csv_path, cov_path):
    print(saved_path)


Dataset loaded successfully! Columns:
['Rank', 'Company', 'Ticker', 'Sector', 'Industry', 'Profitable', 'Founder_is_CEO', 'FemaleCEO', 'Growth_in_Jobs', 'Change_in_Rank', 'Gained_in_Rank', 'Dropped_in_Rank', 'Newcomer_to_the_Fortune500', 'Global500', 'Worlds_Most_Admired_Companies', 'Best_Companies_to_Work_For', 'Number_of_employees', 'MarketCap_March28_M', 'Revenues_M', 'RevenuePercentChange', 'Profits_M', 'ProfitsPercentChange', 'Assets_M', 'CEO', 'Country', 'HeadquartersCity', 'HeadquartersState', 'Website', 'CompanyType', 'Footnote', 'MarketCap_Updated_M', 'Updated']
   Rank             Company Ticker       Sector  \
0     1             Walmart    WMT    Retailing   
1     2              Amazon   AMZN    Retailing   
2     3               Apple   AAPL   Technology   
3     4  UnitedHealth Group    UNH  Health Care   
4     5  Berkshire Hathaway   BRKA   Financials   

                                   Industry Profitable Founder_is_CEO  \
0                     General Merchandiser