In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [23]:
# Load the cleaned financials table from the working directory.
df = pd.read_csv(filepath_or_buffer='financials_cleaned.csv')

In [24]:
# df.info() prints its summary and returns None, so it is called as a plain
# statement; ending the cell on df.head() lets the notebook render the preview
# as a table instead of the text repr of a (DataFrame, None) tuple.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Symbol          495 non-null    object 
 1   Name            495 non-null    object 
 2   Sector          495 non-null    object 
 3   Price           495 non-null    float64
 4   Price/Earnings  495 non-null    float64
 5   Dividend_Yield  495 non-null    float64
 6   Earnings/Share  495 non-null    float64
 7   52_Week_Low     495 non-null    float64
 8   52_Week_High    495 non-null    float64
 9   Market_Cap      495 non-null    float64
 10  EBITDA          495 non-null    float64
 11  Price/Sales     495 non-null    float64
 12  Price/Book      495 non-null    float64
dtypes: float64(10), object(3)
memory usage: 50.4+ KB


(  Symbol                 Name                  Sector   Price  Price/Earnings  \
 0    MMM           3M Company             Industrials  222.89           24.31   
 1    AOS      A.O. Smith Corp             Industrials   60.24           27.76   
 2    ABT  Abbott Laboratories             Health Care   56.27           22.51   
 3   ABBV          AbbVie Inc.             Health Care  108.48           19.41   
 4    ACN        Accenture plc  Information Technology  150.51           25.47   
 
    Dividend_Yield  Earnings/Share  52_Week_Low  52_Week_High    Market_Cap  \
 0        2.332862            7.92       259.77       175.490  1.387211e+11   
 1        1.147959            1.70        68.39        48.925  1.078342e+10   
 2        1.908982            0.26        64.60        42.280  1.021210e+11   
 3        2.499560            3.29       125.86        60.050  1.813863e+11   
 4        1.714470            5.44       162.60       114.820  9.876586e+10   
 
          EBITDA  Price/Sales 

Define Columns

In [25]:
# Column that gets one-hot encoded.
categorical_features = ['Sector']

# Columns that get scaled.
numerical_features = [
    'Price',
    'Price/Earnings',
    'Dividend_Yield',
    'Earnings/Share',
    '52_Week_Low',
    '52_Week_High',
    'Market_Cap',
    'EBITDA',
    'Price/Sales',
    'Price/Book',
]

Create Transformers

In [26]:
# Numeric columns: a one-step pipeline that applies StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: a one-step pipeline that one-hot encodes, configured
# with handle_unknown='ignore'.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

Combine transformers into a preprocessor with ColumnTransformer


In [27]:
# Route each column group through its own pipeline and concatenate the results.
# sparse_threshold=0 makes ColumnTransformer always return a dense array; with
# the default threshold the output container depends on how sparse the stacked
# result is, while the later pd.DataFrame(data_prepared, columns=...) cell
# expects a dense 2-D array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

Apply Transformation

In [28]:
# Fit the preprocessor on df and transform it in one step.
# NOTE(review): this fits on every row of df, while the train/test split only
# happens further down, so the rows that end up in the test set also
# contribute to the fitted scaling statistics — confirm this is acceptable.
data_prepared = preprocessor.fit_transform(X=df)

Get the feature names after the one-hot encoding


In [29]:
# Output column order follows the transformer order: the numeric columns
# first, then the names the fitted one-hot encoder reports for 'Sector'.
feature_names = [
    *numerical_features,
    *preprocessor.named_transformers_['cat']
        .named_steps['onehot']
        .get_feature_names_out(categorical_features),
]

Convert the processed data back to a DataFrame for easier inspection and further use


In [30]:
# Wrap the transformed array in a DataFrame labelled with the feature names.
data_preprocessed = pd.DataFrame(data=data_prepared, columns=feature_names)


# Creating sequences, X_train, y_train

In [None]:
def create_sequences(data, sequence_length):
    """Build sliding-window samples and their next-step targets.

    Window ``i`` is ``data[i:i + sequence_length]`` and its target is
    ``data[i + sequence_length]``, so ``len(data) - sequence_length`` pairs
    are produced.

    Parameters
    ----------
    data : array-like
        Ordered observations; the first axis is the one that is windowed.
    sequence_length : int
        Number of consecutive observations in each window. Must be >= 0.

    Returns
    -------
    X : numpy.ndarray
        Windows, shape ``(n_windows, sequence_length, *data.shape[1:])``.
    y : numpy.ndarray
        Targets, shape ``(n_windows, *data.shape[1:])``.

    If ``data`` has ``sequence_length`` or fewer observations, ``n_windows``
    is 0 and both arrays are empty but keep the shapes described above.

    Raises
    ------
    ValueError
        If ``sequence_length`` is negative.
    """
    if sequence_length < 0:
        raise ValueError(
            f"sequence_length must be non-negative, got {sequence_length}"
        )

    # Coerce once so lists and pandas objects are indexed positionally.
    values = np.asarray(data)
    n_windows = max(len(values) - sequence_length, 0)

    if n_windows == 0:
        # np.array([]) would collapse to shape (0,); keep the window axis so
        # downstream code can rely on X.shape[1] == sequence_length.
        trailing_shape = values.shape[1:]
        empty_X = np.empty((0, sequence_length) + trailing_shape, dtype=values.dtype)
        empty_y = np.empty((0,) + trailing_shape, dtype=values.dtype)
        return empty_X, empty_y

    X = np.array([values[start:start + sequence_length] for start in range(n_windows)])
    # The target of window i is values[i + sequence_length], i.e. the tail of
    # the series; copy so the result does not alias the input.
    y = values[sequence_length:].copy()
    return X, y

In [None]:
# Window length used when building the (X, y) pairs.
sequence_length = 10

# The 'Price' column of the preprocessed frame as a NumPy array.
# NOTE(review): the df preview shows one row per company, so consecutive
# values here belong to different tickers rather than consecutive time
# steps — confirm that windowing over this column is intended.
data = data_preprocessed['Price'].to_numpy()


In [35]:
# Build the sliding windows (X) and their next-value targets (y).
X, y = create_sequences(data=data, sequence_length=sequence_length)

In [36]:
# Ordered hold-out split (no shuffling): the first 80% of the windows go to
# the training set and the rest to the test set.
split = int(len(X) * 0.8)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# Result

In [37]:
# Preview the first five rows of the preprocessed frame.
data_preprocessed.head(n=5)

Unnamed: 0,Price,Price/Earnings,Dividend_Yield,Earnings/Share,52_Week_Low,52_Week_High,Market_Cap,EBITDA,Price/Sales,Price/Book,...,Sector_Consumer Staples,Sector_Energy,Sector_Financials,Sector_Health Care,Sector_Industrials,Sector_Information Technology,Sector_Materials,Sector_Real Estate,Sector_Telecommunication Services,Sector_Utilities
0,0.878344,-0.008454,0.280215,0.743147,0.875014,0.861718,0.986142,0.796009,0.125507,-0.035195,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.324371,0.076259,-0.489462,-0.382941,-0.349252,-0.327918,-0.425974,-0.432863,-0.108525,-0.090795,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.353727,-0.052652,0.004876,-0.643643,-0.373496,-0.390377,0.582168,0.315342,-0.061133,-0.126005,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.03234,-0.128771,0.388497,-0.095082,0.018386,-0.223349,1.457061,0.979604,0.671617,0.129711,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.34313,0.020029,-0.121474,0.29416,0.253414,0.291456,0.545135,0.300682,-0.38753,-0.043217,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [38]:
# Persist the preprocessed frame next to the notebook, without the index column.
data_preprocessed.to_csv(path_or_buf='data_preprocessed.csv', index=False)