In [1]:
# ==============================
# 02_feature_engineering.py
# ==============================

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

# ------------------------------
# 1️⃣ Load Prepared Data
# ------------------------------
# Both input files are read from the same (hard-coded, machine-specific) folder.
input_folder = r"C:\Users\japes\OneDrive\Desktop\japesh\01_result"
prepared_csv, cov_file = (
    os.path.join(input_folder, file_name)
    for file_name in ("prepared_portfolio_data.csv", "cov_matrix.npy")
)

# The company table is read with pandas; the covariance matrix with NumPy.
df = pd.read_csv(prepared_csv)
cov_matrix = np.load(cov_file)

print("✅ Data and covariance matrix loaded successfully!")
print("\nFirst 5 rows of raw data:")
print(df.head(5).to_string(index=False))  # preview printed without the index column

# ------------------------------
# 2️⃣ Pick/Check MarketCap column
# ------------------------------
# Candidate source columns in priority order; the first one present is used.
for cap_source in ("MarketCap_Updated_M", "MarketCap_March28_M"):
    if cap_source in df.columns:
        df['MarketCap'] = df[cap_source]
        break
else:
    # The loop ended without a break: neither candidate column is in the CSV.
    raise KeyError("No MarketCap column found in CSV.")

# Only derive ExpectedReturn (profits divided by market cap) when the CSV
# did not already supply that column.
if 'ExpectedReturn' not in df.columns:
    df['ExpectedReturn'] = df['Profits_M'].div(df['MarketCap'])

# ------------------------------
# 3️⃣ Select numeric columns for normalization
# ------------------------------
numeric_cols = ['MarketCap', 'Revenues_M', 'Profits_M', 'ExpectedReturn']

# Scale a copy so the raw frame `df` keeps its original values.
scaler = MinMaxScaler()
df_scaled = df.copy()
scaled_values = scaler.fit_transform(df_scaled[numeric_cols])
df_scaled[numeric_cols] = scaled_values

print("\n✅ Normalization complete.")
print("\nFirst 5 rows of normalized data:")
print(df_scaled.head(5).to_string(index=False))  # preview printed without the index column

# ------------------------------
# 4️⃣ Create Feature Matrix for QUBO / QAOA
# ------------------------------
# Identifier columns first, followed by the normalized numeric columns.
feature_cols = ['Ticker', 'Company', *numeric_cols]
features = df_scaled[feature_cols]

# ------------------------------
# 5️⃣ Save Scaled Data in 02_feature folder
# ------------------------------
output_folder = r"C:\Users\japes\OneDrive\Desktop\japesh\02_feature"
os.makedirs(output_folder, exist_ok=True)  # no error if the folder already exists

scaled_csv, cov_scaled_file = (
    os.path.join(output_folder, file_name)
    for file_name in ("scaled_portfolio_data.csv", "cov_matrix_scaled.npy")
)

features.to_csv(scaled_csv, index=False)
# NOTE: despite the "_scaled" file name, the covariance matrix is written
# exactly as it was loaded; no scaling is applied to it in this script.
np.save(cov_scaled_file, cov_matrix)

print(f"\n✅ Scaled portfolio data saved to: {scaled_csv}")
print(f"✅ Covariance matrix copied to: {cov_scaled_file}")

# ------------------------------
# 6️⃣ Quick Summary
# ------------------------------
print("\n--- Summary Statistics (After Normalization) ---")
summary_stats = features.describe()
print(summary_stats.to_string())  # full table, not pandas' truncated repr

print("\nFeature columns ready for QUBO formulation:")
print(features.columns.to_list())


✅ Data and covariance matrix loaded successfully!

First 5 rows of raw data:
 Rank            Company Ticker      Sector                                 Industry Profitable Founder_is_CEO FemaleCEO Growth_in_Jobs  Change_in_Rank Gained_in_Rank Dropped_in_Rank Newcomer_to_the_Fortune500 Global500 Worlds_Most_Admired_Companies Best_Companies_to_Work_For  Number_of_employees  MarketCap_March28_M  Revenues_M  RevenuePercentChange  Profits_M  ProfitsPercentChange  Assets_M                 CEO Country HeadquartersCity HeadquartersState                           Website CompanyType                                                                            Footnote  MarketCap_Updated_M    Updated  ExpectedReturn
    1            Walmart    WMT   Retailing                    General Merchandisers        yes             no        no             no               0             no              no                         no       yes                           yes                        yes          