# BCG Finance AI Bot - Data Processing

### Importing Necessary Modules

In [2]:
import pandas as pd
import numpy as np 
import matplotlib as plt
import seaborn as sns

### Loading the automated fetch data from SEC EDGAR's database

In [3]:
## Loading the data fetch during the data extraction phase
df = pd.read_excel("SEC_Financial_Data.xlsx")
df

Unnamed: 0,Company,year,Total Revenue,Net Income,Total Assets,Total Liabilities,Cash Flow
0,Tesla,2023,96773000000.0,14997000000,106618000000,43009000000,13256000000
1,Tesla,2024,97690000000.0,7091000000,122070000000,48390000000,14923000000
2,Tesla,2025,94827000000.0,3794000000,137806000000,54941000000,14747000000
3,Apple,2023,,72361000000,411976000000,205753000000,87582000000
4,Apple,2024,,88136000000,512163000000,243686000000,118548000000
5,Apple,2025,,101832000000,619003000000,275524000000,136162000000
6,Microsoft,2023,,96995000000,352583000000,290437000000,110543000000
7,Microsoft,2024,,93736000000,364980000000,308030000000,118254000000
8,Microsoft,2025,,112010000000,359241000000,285508000000,111482000000


###  Filling missing values for Total Revenue for Apple and Microsoft for the years 2023, 2024, and 2025 based on external data sources


In [4]:
df.isnull().sum()

Company              0
year                 0
Total Revenue        6
Net Income           0
Total Assets         0
Total Liabilities    0
Cash Flow            0
dtype: int64

In [5]:
manual_df = pd.DataFrame([
    {'Company': 'Apple', 'year': 2023, 'Total Revenue': 383285000000},
    {'Company': 'Apple', 'year': 2024, 'Total Revenue': 391035000000},
    {'Company': 'Apple', 'year': 2025, 'Total Revenue': 416161000000},
    {'Company': 'Microsoft', 'year': 2023, 'Total Revenue': 211915000000},
    {'Company': 'Microsoft', 'year': 2024, 'Total Revenue': 245122000000},
    {'Company': 'Microsoft', 'year': 2025, 'Total Revenue': 281724000000},
])

df = df.set_index(['Company', 'year'])
manual_df = manual_df.set_index(['Company', 'year'])

df.update(manual_df)

df = df.reset_index()
df

Unnamed: 0,Company,year,Total Revenue,Net Income,Total Assets,Total Liabilities,Cash Flow
0,Tesla,2023,96773000000.0,14997000000,106618000000,43009000000,13256000000
1,Tesla,2024,97690000000.0,7091000000,122070000000,48390000000,14923000000
2,Tesla,2025,94827000000.0,3794000000,137806000000,54941000000,14747000000
3,Apple,2023,383285000000.0,72361000000,411976000000,205753000000,87582000000
4,Apple,2024,391035000000.0,88136000000,512163000000,243686000000,118548000000
5,Apple,2025,416161000000.0,101832000000,619003000000,275524000000,136162000000
6,Microsoft,2023,211915000000.0,96995000000,352583000000,290437000000,110543000000
7,Microsoft,2024,245122000000.0,93736000000,364980000000,308030000000,118254000000
8,Microsoft,2025,281724000000.0,112010000000,359241000000,285508000000,111482000000


### Checking the data types of the columns

In [6]:
df.dtypes

Company               object
year                   int64
Total Revenue        float64
Net Income             int64
Total Assets           int64
Total Liabilities      int64
Cash Flow              int64
dtype: object

In [7]:
df['Total Revenue'] = df['Total Revenue'].astype('int64')
df

Unnamed: 0,Company,year,Total Revenue,Net Income,Total Assets,Total Liabilities,Cash Flow
0,Tesla,2023,96773000000,14997000000,106618000000,43009000000,13256000000
1,Tesla,2024,97690000000,7091000000,122070000000,48390000000,14923000000
2,Tesla,2025,94827000000,3794000000,137806000000,54941000000,14747000000
3,Apple,2023,383285000000,72361000000,411976000000,205753000000,87582000000
4,Apple,2024,391035000000,88136000000,512163000000,243686000000,118548000000
5,Apple,2025,416161000000,101832000000,619003000000,275524000000,136162000000
6,Microsoft,2023,211915000000,96995000000,352583000000,290437000000,110543000000
7,Microsoft,2024,245122000000,93736000000,364980000000,308030000000,118254000000
8,Microsoft,2025,281724000000,112010000000,359241000000,285508000000,111482000000


### Calculating Year-by-Year growth rates for Total Revenue and Net Income 


In [8]:
df['Revenue Growth (%)'] = df.groupby(['Company'])['Total Revenue'].pct_change() * 100
df['Net Income Growth (%)'] = df.groupby(['Company'])['Net Income'].pct_change() * 100
df

Unnamed: 0,Company,year,Total Revenue,Net Income,Total Assets,Total Liabilities,Cash Flow,Revenue Growth (%),Net Income Growth (%)
0,Tesla,2023,96773000000,14997000000,106618000000,43009000000,13256000000,,
1,Tesla,2024,97690000000,7091000000,122070000000,48390000000,14923000000,0.947578,-52.71721
2,Tesla,2025,94827000000,3794000000,137806000000,54941000000,14747000000,-2.930699,-46.495558
3,Apple,2023,383285000000,72361000000,411976000000,205753000000,87582000000,,
4,Apple,2024,391035000000,88136000000,512163000000,243686000000,118548000000,2.021994,21.800417
5,Apple,2025,416161000000,101832000000,619003000000,275524000000,136162000000,6.425512,15.539621
6,Microsoft,2023,211915000000,96995000000,352583000000,290437000000,110543000000,,
7,Microsoft,2024,245122000000,93736000000,364980000000,308030000000,118254000000,15.669962,-3.359967
8,Microsoft,2025,281724000000,112010000000,359241000000,285508000000,111482000000,14.932156,19.495178


### Calculating Year-by-Year growth rates for Total Assets, Total Liabilities and Cash flow from Operations Activities

In [9]:
df['Assets Growth (%)'] = df.groupby('Company')['Total Assets'].pct_change() * 100
df['Liabilities Growth (%)'] = df.groupby('Company')['Total Liabilities'].pct_change() * 100
df['Cash Flow from Operations Growth(%)'] = df.groupby('Company')['Cash Flow'].pct_change() * 100

### Imputing Null/NaN values that result from pct_change calculations with 0

In [10]:
df.fillna(0, inplace=True)
df

Unnamed: 0,Company,year,Total Revenue,Net Income,Total Assets,Total Liabilities,Cash Flow,Revenue Growth (%),Net Income Growth (%),Assets Growth (%),Liabilities Growth (%),Cash Flow from Operations Growth(%)
0,Tesla,2023,96773000000,14997000000,106618000000,43009000000,13256000000,0.0,0.0,0.0,0.0,0.0
1,Tesla,2024,97690000000,7091000000,122070000000,48390000000,14923000000,0.947578,-52.71721,14.492862,12.511335,12.575438
2,Tesla,2025,94827000000,3794000000,137806000000,54941000000,14747000000,-2.930699,-46.495558,12.890964,13.537921,-1.179388
3,Apple,2023,383285000000,72361000000,411976000000,205753000000,87582000000,0.0,0.0,0.0,0.0,0.0
4,Apple,2024,391035000000,88136000000,512163000000,243686000000,118548000000,2.021994,21.800417,24.31865,18.436183,35.35658
5,Apple,2025,416161000000,101832000000,619003000000,275524000000,136162000000,6.425512,15.539621,20.860546,13.065174,14.858117
6,Microsoft,2023,211915000000,96995000000,352583000000,290437000000,110543000000,0.0,0.0,0.0,0.0,0.0
7,Microsoft,2024,245122000000,93736000000,364980000000,308030000000,118254000000,15.669962,-3.359967,3.516052,6.057424,6.975566
8,Microsoft,2025,281724000000,112010000000,359241000000,285508000000,111482000000,14.932156,19.495178,-1.572415,-7.311625,-5.726656


In [11]:
df.to_csv('SEC_data_cleaned.csv')

In [19]:
summary = df.groupby('Company').agg({
    'Revenue Growth (%)': 'mean',
    'Net Income Growth (%)': 'mean',
    'Assets Growth (%)' : 'mean',
    'Liabilities Growth (%)' : 'mean',
    'Cash Flow from Operations Growth(%)' :'mean'    
}).reset_index()
summary

Unnamed: 0,Company,Revenue Growth (%),Net Income Growth (%),Assets Growth (%),Liabilities Growth (%),Cash Flow from Operations Growth(%)
0,Apple,2.815835,12.446679,15.059732,10.500452,16.738232
1,Microsoft,10.200706,5.378404,0.647879,-0.418067,0.416303
2,Tesla,-0.66104,-33.070923,9.127942,8.683085,3.798683


In [20]:
summary.to_csv('Summary_final_report.csv')