In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Load the cleaned financials dataset (path is relative to the working directory).
df = pd.read_csv('financials_cleaned.csv')

# Preview the first rows of the frame.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

  Symbol                 Name                  Sector   Price  Price/Earnings  \
0    MMM           3M Company             Industrials  222.89           24.31   
1    AOS      A.O. Smith Corp             Industrials   60.24           27.76   
2    ABT  Abbott Laboratories             Health Care   56.27           22.51   
3   ABBV          AbbVie Inc.             Health Care  108.48           19.41   
4    ACN        Accenture plc  Information Technology  150.51           25.47   

   Dividend_Yield  Earnings/Share  52_Week_Low  52_Week_High    Market_Cap  \
0        2.332862            7.92       259.77       175.490  1.387211e+11   
1        1.147959            1.70        68.39        48.925  1.078342e+10   
2        1.908982            0.26        64.60        42.280  1.021210e+11   
3        2.499560            3.29       125.86        60.050  1.813863e+11   
4        1.714470            5.44       162.60       114.820  9.876586e+10   

         EBITDA  Price/Sales  Price/Book  
0

In [5]:
# Column groups consumed by the preprocessing cells further down.
# 'Sector' is the only categorical input; it gets one-hot encoded.
categorical_features = ['Sector']

# Numeric inputs; these are the columns passed through StandardScaler.
# NOTE(review): in the df.head() preview every row shows 52_Week_Low greater
# than 52_Week_High, so the two columns look swapped in the CSV -- confirm upstream.
numerical_features = [
    'Price',
    'Price/Earnings',
    'Dividend_Yield',
    'Earnings/Share',
    '52_Week_Low',
    '52_Week_High',
    'Market_Cap',
    'EBITDA',
    'Price/Sales',
    'Price/Book',
]

In [6]:
# Numeric branch: a single-step pipeline that applies StandardScaler.
numeric_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

In [7]:
# Categorical branch: one-hot encode, keeping a column for every category
# (drop=None) and tolerating categories that were not seen during fit.
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(drop=None, handle_unknown="ignore")),
])

In [8]:
# Route each column group through its own transformer. Output columns follow
# the order listed here: the numeric block first, then the one-hot block.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

In [9]:
# Fit the preprocessor on the full frame and transform it in one pass.
data_prepared = preprocessor.fit_transform(df)

# OneHotEncoder emits sparse output by default, and ColumnTransformer returns a
# scipy sparse matrix whenever the stacked result's density falls below its
# sparse_threshold. pd.DataFrame(..., columns=...) needs a dense 2-D array, so
# densify here; this is a no-op when the result is already a NumPy array.
if hasattr(data_prepared, "toarray"):
    data_prepared = data_prepared.toarray()

# Column labels in output order: numeric columns first, then the one-hot
# columns as named by the fitted encoder (e.g. 'Sector_Energy').
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
feature_names = numerical_features + list(
    onehot_encoder.get_feature_names_out(categorical_features)
)

# Wrap the matrix in a labelled DataFrame, reusing df's index so rows stay
# aligned with the source frame.
data_preprocessed_updated = pd.DataFrame(
    data_prepared, columns=feature_names, index=df.index
)

In [10]:
# Persist the preprocessed frame (all sector columns kept); the row index is
# not written to the file.
preprocessed_csv_path = 'data_preprocessed_with_all_sectors.csv'
data_preprocessed_updated.to_csv(preprocessed_csv_path, index=False)

# Report where the file was written.
print(f"Preprocessed dataset saved as '{preprocessed_csv_path}'")

Preprocessed dataset saved as 'data_preprocessed_with_all_sectors.csv'


In [11]:
# Validate that every sector present in the raw data received a one-hot column.
sector_prefix = 'Sector_'
original_sectors = df['Sector'].unique()
encoded_columns = [col for col in data_preprocessed_updated.columns if col.startswith(sector_prefix)]

# Show both sides of the comparison.
print("Original Sectors:", original_sectors)
print("Encoded Columns:", encoded_columns)

# Recover each category by stripping only the leading prefix. Splitting on
# every '_' and taking element [1] would truncate a category whose own name
# contains an underscore and wrongly report it as missing.
encoded_sectors = {col[len(sector_prefix):] for col in encoded_columns}
missing_categories = set(original_sectors) - encoded_sectors
print("Missing Categories:", missing_categories)

Original Sectors: ['Industrials' 'Health Care' 'Information Technology'
 'Consumer Discretionary' 'Utilities' 'Financials' 'Materials'
 'Real Estate' 'Consumer Staples' 'Energy' 'Telecommunication Services']
Encoded Columns: ['Sector_Consumer Discretionary', 'Sector_Consumer Staples', 'Sector_Energy', 'Sector_Financials', 'Sector_Health Care', 'Sector_Industrials', 'Sector_Information Technology', 'Sector_Materials', 'Sector_Real Estate', 'Sector_Telecommunication Services', 'Sector_Utilities']
Missing Categories: set()
