In [32]:
import pandas as pd

# ==========================
# 1. Load Dataset
# ==========================
# Read the combined market-data CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="combined_market_data.csv")

# Summarise what was loaded: (rows, columns), the column names,
# and the dtype pandas inferred for each column.
print("Initial dataset shape:", df.shape)
print("\nOriginal Columns:\n", list(df.columns))
print(df.dtypes)

Initial dataset shape: (2399, 9)

Original Columns:
 ['Security_ID', 'Vendor_ID', 'Vendor_Code', 'Source_Feed_ID', 'Price_Type', 'Exchange_Code', 'Price_Date', 'Currency_Code', 'Price']
Security_ID        object
Vendor_ID          object
Vendor_Code        object
Source_Feed_ID     object
Price_Type         object
Exchange_Code      object
Price_Date         object
Currency_Code      object
Price             float64
dtype: object


In [33]:
# ==========================
# 2. Rename Columns
# ==========================
# Only "Price_Date" actually changes name; every other column already has its
# final name, so listing it as an identity mapping would be a no-op.
# Rebinding `df` (instead of inplace=True) keeps the cell safe to re-run:
# rename() silently ignores a key that is no longer present.
df = df.rename(columns={"Price_Date": "Date"})

# Rows missing Date / Security_ID / Price are dropped in the next step
# ("3. Handle Missing Values"), so that logic is not repeated here.

In [34]:
# ==========================
# 3. Handle Missing Values
# ==========================
# Drop rows missing essential info: without a date, a security identifier
# and a price the row cannot be used downstream.
df = df.dropna(subset=["Date", "Security_ID", "Price"])

# Placeholder FX table: multiplier applied to a local-currency price to
# express it in USD. These are example values, not live rates.
USD_RATES = {
    "USD": 1.0,
    "INR": 0.012,   # Example rate
    "EUR": 1.1,
    "GBP": 1.25,
}
# Rate used when a currency code has no entry in USD_RATES.
FALLBACK_RATE = 1.0

# Create a dummy Conversion_Rate column (if not present)
if "Conversion_Rate" not in df.columns:
    # Normalise only the lookup key (trim + upper-case) so values such as
    # "usd" or " EUR " still find their rate; the stored Currency_Code
    # column itself is left exactly as loaded.
    currency_key = df["Currency_Code"].astype(str).str.strip().str.upper()
    df["Conversion_Rate"] = currency_key.map(USD_RATES)

# Rows without a rate are about to be treated as if they were already in USD.
# Report them so an unknown currency is not converted at 1.0 unnoticed.
rate_missing = df["Conversion_Rate"].isna()
if rate_missing.any():
    unmapped_codes = sorted(df.loc[rate_missing, "Currency_Code"].astype(str).unique())
    print(
        f"WARNING: {int(rate_missing.sum())} row(s) have no conversion rate; "
        f"falling back to {FALLBACK_RATE} for currency codes: {unmapped_codes}"
    )

# Fill missing conversion rates by plain assignment (no FutureWarning)
df["Conversion_Rate"] = df["Conversion_Rate"].fillna(FALLBACK_RATE)


In [35]:
# ==========================
# 4. Remove Duplicates
# ==========================
# A row is a duplicate only when it repeats the same observation. The key
# therefore includes Price_Type, Exchange_Code and Currency_Code: without
# them, two different price types for the same security/vendor/date that
# happen to share a value (e.g. Open equal to High) would be collapsed into
# one row and data would be lost.
DEDUP_KEY = [
    "Security_ID",
    "Vendor_ID",
    "Price_Type",
    "Exchange_Code",
    "Currency_Code",
    "Date",
    "Price",
]
# Rebind instead of inplace=True; the first occurrence of each key is kept.
df = df.drop_duplicates(subset=DEDUP_KEY)


In [36]:
# ==========================
# 5. Standardize Formats
# ==========================

# Clean Security_ID format: trimmed, upper-case text.
df["Security_ID"] = df["Security_ID"].astype(str).str.strip().str.upper()

# Clean Vendor_Code the same way, but keep missing codes missing:
# astype(str) would otherwise turn NaN into the literal text "nan"
# (then "NAN" after upper-casing), which looks like a real vendor code.
vendor_code_raw = df["Vendor_Code"]
df["Vendor_Code"] = (
    vendor_code_raw.astype(str).str.strip().str.upper()
    .where(vendor_code_raw.notna())
)

# Convert numeric columns safely: unparseable values become NaN instead of raising.
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")
df["Conversion_Rate"] = pd.to_numeric(df["Conversion_Rate"], errors="coerce")

# Coercion can introduce new NaN prices after the earlier missing-value drop;
# a row without a usable Price cannot be converted, so remove it here.
df = df.dropna(subset=["Price"])


In [37]:
# ==========================
# 6. Convert to USD
# ==========================
# Element-wise product of each row's conversion rate and its quoted price;
# a NaN in either operand yields NaN in Price_USD.
df["Price_USD"] = df["Conversion_Rate"] * df["Price"]



In [38]:

# ==========================
# 7. Final Overview & Export
# ==========================
# Report the dimensions after cleaning and preview the first five rows.
print("\nCleaned dataset shape:", df.shape)
print("\nSample Cleaned Data:\n", df.head(5))

# Persist the cleaned frame next to the notebook; the row index is not written.
df.to_csv(path_or_buf="cleaned_market_data_usd.csv", index=False)
print("\n✅ Cleaned and transformed data saved as 'cleaned_market_data_usd.csv'")


Cleaned dataset shape: (2041, 11)

Sample Cleaned Data:
   Security_ID Vendor_ID Vendor_Code   Source_Feed_ID Price_Type Exchange_Code  \
0        AAPL     TD001  TWELVEDATA  AAPL_TwelveData       Open        NASDAQ   
1        AAPL     TD001  TWELVEDATA  AAPL_TwelveData       Open        NASDAQ   
2        AAPL     TD001  TWELVEDATA  AAPL_TwelveData       Open        NASDAQ   
3        AAPL     TD001  TWELVEDATA  AAPL_TwelveData       Open        NASDAQ   
4        AAPL     TD001  TWELVEDATA  AAPL_TwelveData       Open        NASDAQ   

         Date Currency_Code      Price  Conversion_Rate  Price_USD  
0  2025-09-02           USD  229.25000              1.0  229.25000  
1  2025-09-03           USD  237.21001              1.0  237.21001  
2  2025-09-04           USD  238.45000              1.0  238.45000  
3  2025-09-05           USD  240.00000              1.0  240.00000  
4  2025-09-08           USD  239.30000              1.0  239.30000  

✅ Cleaned and transformed data saved as 