# Preparing the data

- Downloading the Features of 200 manually coded companies from the S&P1500
- The tickers are already coded into one of the four classes 
    - 1 = Orchestrator Core
    - 2 = Orchestrator Periph.
    - 3 = Complementor Core
    - 4 = Complementor Peripheral
    
- We import the necessary libraries and read the Excel file containing the company list with their corresponding tickers and labels.

In [1]:
base_url = "https://financialmodelingprep.com/api"
API_KEY = "ieZWryBMhiEhowJQXvvJBSo8rcJfvMVi"

In [2]:
import pandas as pd
tickers_df = pd.read_excel("CompanyList_Coded.xlsx")
tickers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Ticker    200 non-null    object
 1   Label     200 non-null    int64 
 2   Partners  200 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ KB


## Importing the Balance Sheet Data, Income Statement and CashFlowStatement Data

Next, we import the Balance Sheet, Income Statement, and Cash Flow Statement data for each company using their tickers.

# Function for Importing Data using the V3 Version of the API

In [46]:
import pandas as pd
import requests
from tqdm import tqdm

def fetch_and_concat_data(tickers, labels, partner, statement_type, base_url, version, API_KEY):
    dfs = []  # List to store DataFrames for each company

    # Initialize tqdm progress bar
    progress_bar = tqdm(zip(tickers, labels, partner), desc=f"Fetching {statement_type} data", unit=" ticker")
    
    for ticker, label, partner in progress_bar:
        progress_bar.set_postfix({"Ticker": ticker})
        url = f"{base_url}/{version}/{statement_type}/{ticker}?apikey={API_KEY}"
        response = requests.get(url)
        data = response.json()
        df = pd.DataFrame(data)
        df['Ticker'] = ticker  # Adding a column for ticker symbol
        df['Label'] = label 
        df['Partner'] = partner# Adding a column for label
        dfs.append(df)
        
            
    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(dfs, ignore_index=True)
    final_df.head()
    return final_df

# Assuming tickers_df is your DataFrame containing tickers and labels
balancesheet_df = fetch_and_concat_data(tickers_df["Ticker"], tickers_df["Label"],tickers_df["Partners"], "balance-sheet-statement", base_url, 'v3',API_KEY)
incomestatement_df = fetch_and_concat_data(tickers_df["Ticker"], tickers_df["Label"],tickers_df["Partners"], "income-statement", base_url,'v3', API_KEY)
cashflowstatement_df = fetch_and_concat_data(tickers_df["Ticker"], tickers_df["Label"],tickers_df["Partners"], "cash-flow-statement", base_url,'v3', API_KEY)
keymetrics_df = fetch_and_concat_data(tickers_df["Ticker"], tickers_df["Label"],tickers_df["Partners"], 'key-metrics', base_url,'v3', API_KEY)
# analyststockrec_df = fetch_and_concat_data(tickers_df["Ticker"], tickers_df["Label"], 'analyst-stock-recommendations', base_url,'v3', API_KEY)

Fetching balance-sheet-statement data: 176 ticker [02:11,  1.33 ticker/s, Ticker=STZ]  


KeyboardInterrupt: 

In [None]:
# Check which symbols are missing in balancesheet_df.symbol
# missing_symbols = tickers_df[~tickers_df['Ticker'].isin(employeecount_df['symbol'])]

# Print the missing symbols
# print("Symbols missing in balancesheet_df:")
# print(missing_symbols)

In [None]:
import pandas as pd

# Assuming dfs is a list of DataFrames: balancesheet_df, incomestatement_df, cashflowstatement_df, keymetrics_df
dfs = [balancesheet_df, incomestatement_df, cashflowstatement_df, keymetrics_df]

# Process each DataFrame in the list
for df in dfs:
    # Check if 'date' column exists to extract 'calendarYear'
    if 'calendarYear' in df.columns:
        df['calendarYear'] = pd.DatetimeIndex(df['calendarYear']).year  # Extract year from date
        # Filter rows where 'calendarYear' is between 2007 and 2023
        df = df[df['calendarYear'].between(2007, 2023)]


In [None]:
balancesheet_df = balancesheet_df[balancesheet_df['calendarYear'].between(2007, 2024)]
incomestatement_df = incomestatement_df[incomestatement_df['calendarYear'].between(2007, 2024)]
cashflowstatement_df = cashflowstatement_df[cashflowstatement_df['calendarYear'].between(2007, 2024)]
keymetrics_df = keymetrics_df[keymetrics_df['calendarYear'].between(2007, 2024)]

In [None]:
balancesheet_df.calendarYear.value_counts()

# Import Data from API using V4

Some features have a different link structure, therefore i used two different functions

In [38]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import requests

def fetch_and_concat_data_v4(tickers, labels, partner,statement_type, base_url, version, API_KEY):
    dfs = []  # List to store DataFrames for each company
    years = range(2023, 2006, -1)  # Define range of years for synthetic data

    # Initialize tqdm progress bar
    progress_bar = tqdm(zip(tickers, labels, partner), desc=f"Fetching {statement_type} data", unit="ticker")
    
    for ticker, label, partner in progress_bar:
        progress_bar.set_postfix({"Ticker": ticker})
        url = f"{base_url}/{version}/{statement_type}?symbol={ticker}&apikey={API_KEY}"
        response = requests.get(url)
        data = response.json()
        df = pd.DataFrame(data)
        df['Ticker'] = ticker  # Adding a column for ticker symbol
        df['Label'] = label
        df['Partners'] = partner                       # Adding a column for label
        dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(dfs, ignore_index=True)
    return final_df

employeecount_df = fetch_and_concat_data_v4(tickers_df["Ticker"], tickers_df["Label"],tickers_df["Partners"] ,'historical/employee_count', base_url, 'v4', API_KEY)


Fetching historical/employee_count data: 200ticker [02:36,  1.28ticker/s, Ticker=DLTR] 


In [39]:
import pandas as pd
import numpy as np

def fill_missing_values(employeecount_df, tickers_df):
    # Loop through each ticker in tickers_df
    for index, row in tickers_df.iterrows():
        symbol = row['Ticker']
        label = row['Label']

        # Check if the symbol is in employeecount_df
        if symbol not in employeecount_df['symbol'].values:
            years = range(2023, 2006, -1)  # Define range of years for synthetic data
            synthetic_data = {
                'symbol': [symbol] * len(years),
                'cik': [np.nan] * len(years),
                'acceptanceTime': [np.nan] * len(years),
                'periodOfReport': [f"{year}-12-31" for year in years],  # Example period of report dates
                'companyName': [label] * len(years),
                'formType': [np.nan] * len(years),
                'filingDate': [np.nan] * len(years),
                'employeeCount': [np.nan] * len(years),
                'source': [np.nan] * len(years),
                'Label': [label] * len(years)
            }
            df = pd.DataFrame(synthetic_data)
            employeecount_df = pd.concat([employeecount_df, df], ignore_index=True)
        
    return employeecount_df


employeecount_df = fill_missing_values(employeecount_df, tickers_df)

In [40]:
employeecount_df

Unnamed: 0,symbol,cik,acceptanceTime,periodOfReport,companyName,formType,filingDate,employeeCount,source,Ticker,Label,Partners
0,AAPL,0000320193,2023-11-02 18:08:27,2023-09-30,Apple Inc.,10-K,2023-11-03,161000.0,https://www.sec.gov/Archives/edgar/data/320193...,AAPL,1,373.0
1,AAPL,0000320193,2022-10-27 18:01:14,2022-09-24,Apple Inc.,10-K,2022-10-28,164000.0,https://www.sec.gov/Archives/edgar/data/320193...,AAPL,1,373.0
2,AAPL,0000320193,2021-10-28 18:04:28,2021-09-25,Apple Inc.,10-K,2021-10-29,154000.0,https://www.sec.gov/Archives/edgar/data/320193...,AAPL,1,373.0
3,AAPL,0000320193,2020-10-29 18:06:25,2020-09-26,Apple Inc.,10-K,2020-10-30,147000.0,https://www.sec.gov/Archives/edgar/data/320193...,AAPL,1,373.0
4,AAPL,0000320193,2019-10-30 18:12:36,2019-09-28,Apple Inc.,10-K,2019-10-31,137000.0,https://www.sec.gov/Archives/edgar/data/320193...,AAPL,1,373.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4345,TKO,,,2011-12-31,2,,,,,,2,
4346,TKO,,,2010-12-31,2,,,,,,2,
4347,TKO,,,2009-12-31,2,,,,,,2,
4348,TKO,,,2008-12-31,2,,,,,,2,


In [41]:
employeecount_df = employeecount_df.rename(columns={"periodOfReport": "calendarYear"}) 
employeecount_df["calendarYear"] = pd.to_datetime(employeecount_df["calendarYear"]).dt.year
# employeecount_df = employeecount_df[employeecount_df['calendarYear'].between(2007, 2023)]

In [45]:
employeecount_df.calendarYear.

TypeError: 'numpy.dtype[int64]' object is not callable

# Import Segement Data
The segement data has the data type dictionaries. Therefore we need to request the data differenetly

In [None]:
import pandas as pd
import json
from urllib.request import urlopen
import certifi

# Load the Excel file
tickers_df = pd.read_excel("CompanyList_Coded.xlsx")

def get_jsonparsed_data(url):
    with urlopen(url, cafile=certifi.where()) as response:
        data = response.read().decode("utf-8")
    return json.loads(data)

# Base URL and API key configuration
base_url = "https://financialmodelingprep.com/api/v4"
api_key = "ieZWryBMhiEhowJQXvvJBSo8rcJfvMVi"  # Replace 'YOUR_API_KEY' with your actual API key


def fetch_ticker_data(tickers_df, base_url, datatype, api_key, endpoint_params=""):
    
    
    results = []
    for index, row in tickers_df.iterrows():
        symbol = row['Ticker']
        label = row['Label']
        url = f"{base_url}/{datatype}?symbol={symbol}&structure=flat{endpoint_params}&apikey={api_key}"
        try:
            data = get_jsonparsed_data(url)
            results.append({'Ticker': symbol, 'Label': label, 'Data': data})
        except Exception as e:
            print(f"Failed to fetch data for {symbol}: {e}")

    return pd.DataFrame(results)


geo_revenue_df = fetch_ticker_data(tickers_df, base_url, "revenue-geographic-segmentation",api_key)
product_revenue_df = fetch_ticker_data(tickers_df, base_url,'revenue-product-segmentation',api_key)

In [None]:
geo_revenue_df

In [None]:
import pandas as pd

def aggregate_category_counts(df):
    # List to hold the aggregated data
    results = []

    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        Ticker = row['Ticker']
        label = row['Label']
        data_entries = row['Data']

        if not data_entries:  # Check if the data_entries list is empty
            # Generate rows for each year from 2023 to 2007
            for year in range(2023, 2018, -1):
                results.append({
                    'Ticker': Ticker,
                    'calendarYear': year,
                    'Label': label,
                    'Number of Product Segments': None
                })
        else:
            # Process each year's data in the list
            for entry in data_entries:
                for year_date, revenues in entry.items():
                    # Extract the year part from the date string
                    year = pd.to_datetime(year_date).year

                    # Count the number of unique product categories for the current year
                    num_categories = len(revenues)

                    # Append the result as a new row in the results list
                    results.append({
                        'Ticker': Ticker,
                        'calendarYear': year,
                        'Label': label,
                        'Number of Product Segments': num_categories
                    })

    # Convert the results list to a DataFrame
    return pd.DataFrame(results)

# Example usage assuming product_revenue_df and geo_revenue_df are defined
product_revenue_df = aggregate_category_counts(product_revenue_df)
geo_revenue_df = aggregate_category_counts(geo_revenue_df)


In [None]:
# Renaming 'Ticker' column to 'Symbol' in the DataFrame
product_revenue_df = product_revenue_df.rename(columns={'Ticker': 'symbol'})
geo_revenue_df = geo_revenue_df.rename(columns={'Ticker': 'symbol'})

In [None]:
product_revenue_df = product_revenue_df[product_revenue_df['calendarYear'].between(2019, 2023)]
geo_revenue_df = geo_revenue_df[geo_revenue_df['calendarYear'].between(2019,2023)]

# Preparing the Data, Filtering Out only the necessary collumns

After fetching the data, we filter out only the necessary columns from each DataFrame

In [None]:
balancesheet_df = balancesheet_df[['calendarYear', 'symbol','totalCurrentAssets','totalNonCurrentAssets', 
                                   'totalAssets', 'totalCurrentLiabilities','totalNonCurrentLiabilities',
                                   'totalLiabilities', 'Label', 'Partner']]


incomestatement_df = incomestatement_df[['calendarYear', 'symbol','revenue', 'costOfRevenue', 'grossProfit',
                                       'operatingExpenses', 'ebitda', 'Label']]

cashflowstatement_df = cashflowstatement_df[['calendarYear', 'symbol', 'acquisitionsNet','investmentsInPropertyPlantAndEquipment',   
                                            'commonStockIssued','debtRepayment','Label']]

keymetrics_df = keymetrics_df[['calendarYear', 'symbol' ,"debtToEquity", 'debtToAssets', 'marketCap',                              
                               'workingCapital', 'daysOfInventoryOnHand', 'Label']]

employeecount_df = employeecount_df[['calendarYear', 'symbol', 'employeeCount', 'Label']]

## Balance Sheet Ratios 

- (Assets/Total Assets) * 100
- (Liabilities/Total Liabilities) * 100 

In [None]:
def calculate_feature_ratios(df, revenue_column='grossProfit'):
    
    # Create a copy of the original DataFrame to avoid modifying the original data
    df_with_ratios = df.copy()
    
    # Calculate ratios for assets
    asset_columns = ['totalCurrentAssets','totalNonCurrentAssets']
    
    total_assets = df_with_ratios[asset_columns].sum(axis=1)
    
    for feature_column in asset_columns:
        df_with_ratios[f'{feature_column}_to_totalAssets_ratio'] = (df_with_ratios[feature_column] / total_assets) * 100
    
    # Calculate ratios for liabilities
    liability_columns = ['totalCurrentLiabilities','totalNonCurrentLiabilities']
    
    total_liabilities = df_with_ratios[liability_columns].sum(axis=1)
    
    for feature_column in liability_columns:
        df_with_ratios[f'{feature_column}_to_totalLiabilities_ratio'] = (df_with_ratios[feature_column] / total_liabilities) * 100
    
    # Drop the original features from the DataFrame
    df_with_ratios.drop(columns=asset_columns + liability_columns, inplace=True)
    
    return df_with_ratios

# Usage example:
balancesheet_ratios_df = calculate_feature_ratios(balancesheet_df)                     

# Cashflow / Income Statement Ratios

- (Columns / Revenue) * 100 

Used Columns
- *incomestatement_columns* = ['costOfRevenue', 'grossProfit', 'researchAndDevelopmentExpenses', 
                               'sellingGeneralAndAdministrativeExpenses', 'operatingExpenses', 
                               'costAndExpenses', 'ebitda', 'operatingIncome']
                               
- *cashflow_columns* = ['netCashProvidedByOperatingActivities', 'netCashUsedForInvestingActivites',
                        'investmentsInPropertyPlantAndEquipment', 'freeCashFlow']

In [None]:

def calculate_feature_ratios(df, revenue_column='grossProfit', revenue_df=None):
    # Verify input DataFrame and revenue DataFrame are provided
    if df is None or revenue_df is None:
        raise ValueError("Both 'df' and 'revenue_df' must be provided.")
    
    # Verify the revenue column exists in revenue_df
    if revenue_column not in revenue_df.columns:
        raise ValueError(f"The revenue column '{revenue_column}' was not found in the revenue DataFrame.")
    
    # Create a copy of the original DataFrame to avoid modifying the original data
    df_with_ratios = df.copy()
    
    # Define the columns for which to calculate ratios related to income statements
    incomestatement_columns = ['revenue', 'costOfRevenue', 'grossProfit', 'operatingExpenses', 'ebitda',]
    
    # Calculate ratios for income statement features
    for feature_column in incomestatement_columns:
        if feature_column in df_with_ratios.columns:
            df_with_ratios[f'{feature_column}_to_Revenue_ratio'] = (df_with_ratios[feature_column] / revenue_df[revenue_column]) * 100
    
    # Define the columns for cash flow calculation
    cashflow_columns = ['acquisitionsNet','investmentsInPropertyPlantAndEquipment','commonStockIssued','debtRepayment',]
    
    # Calculate ratios for cash flow features
    for feature_column in cashflow_columns:
        if feature_column in df_with_ratios.columns:
            df_with_ratios[f'{feature_column}_to_Revenue_ratio'] = (df_with_ratios[feature_column] / revenue_df[revenue_column]) * 100
    
    return df_with_ratios

# Corrected example usage:
cashflowstatement_ratios_df = calculate_feature_ratios(cashflowstatement_df, revenue_column='revenue', revenue_df=incomestatement_df)
incomestatement_ratios_df = calculate_feature_ratios(incomestatement_df, revenue_column='revenue', revenue_df=incomestatement_df)

In [None]:
cashflowstatement_ratios_df = cashflowstatement_ratios_df.reset_index()
incomestatement_ratios_df = incomestatement_ratios_df.reset_index()

# Key Metrics Ratio 

- Here we just need the Ratio working capital / Revenue 

In [None]:
keymetrics_df
keymetrics_df["workingCapital_to_revenue_Ratio"] = (keymetrics_df["workingCapital"]/incomestatement_df["revenue"])*100
keymetrics_ratios_df = keymetrics_df.copy()
keymetrics_ratios_df.drop("workingCapital", axis = 1, inplace=True)

# Employee Count / Revenue Measure

In [None]:
incomestatement_df_red = incomestatement_df[["calendarYear", "symbol", "revenue"]]
employeecount_df[["calendarYear", "symbol"]] = employeecount_df[["calendarYear", "symbol"]].astype("object")

In [None]:
import pandas as pd

# Assuming employeecount_df and incomestatement_df are your DataFrames

# Convert calendarYear and symbol columns to string and trim any whitespaces
for df in [employeecount_df, incomestatement_df]:
    df['calendarYear'] = df['calendarYear'].astype(str).str.strip()
    df['symbol'] = df['symbol'].astype(str).str.strip()

# Filter the incomestatement_df to include only the necessary columns
incomestatement_reduced = incomestatement_df[['calendarYear', 'symbol', 'revenue']]

# Perform the merge using an inner join
employeecount_ratios_df = pd.merge(employeecount_df, incomestatement_reduced, on=['calendarYear', 'symbol'], how='inner')
employeecount_ratios_df["employee_revenue_ratio"] = (employeecount_ratios_df["employeeCount"]/employeecount_ratios_df["revenue"])*100
employeecount_ratios_df.head()

## Analyst Recommendations

\begin{align*}
\text{WARS} = \frac{(3 \times \text{analystRatingsStrongBuy}) + (2 \times \text{analystRatingsBuy}) + (\text{analystRatingsHold}) - (2 \times \text{analystRatingsSell}) - (3 \times \text{analystRatingsStrongSell})}{\text{Total Ratings}}
\end{align*}


Assign Weights to Each Category:
Strong Buy: +3
Buy: +2
Hold: +1
Sell: -2
Strong Sell: -3


- With this Ratio, we can evaluate the sentiment of analysts, 


## Partners

In [None]:
balancesheet_df.head()

# Dropping Columns

Cashflow-Statements Collumn Drops

In [None]:
cs_2_drop = cashflowstatement_ratios_df[['acquisitionsNet','investmentsInPropertyPlantAndEquipment','commonStockIssued',
                                        'debtRepayment']]
cashflowstatement_ratios_df.drop(columns=cs_2_drop, axis=1, inplace=True )

In [None]:
is_2_drop = incomestatement_ratios_df[["revenue", 'costOfRevenue', 'grossProfit', 'operatingExpenses', 'ebitda' ]]
incomestatement_ratios_df.drop(columns=is_2_drop, axis = 1, inplace=True)

In [None]:
bs_2_drop = balancesheet_df[["totalAssets", "totalLiabilities"]]
balancesheet_ratios_df.drop(columns=bs_2_drop, axis=1, inplace=True)

In [None]:
ec_2_drop = employeecount_ratios_df[["revenue", "employeeCount"]]
employeecount_ratios_df.drop(columns=ec_2_drop, axis =1, inplace = True)

# Final Financial Ratios

- cashflowstatement_ratios_df
- incomestatement_ratios_df
- balancesheet_ratios_df
- keymetrics_ratios_df
- employeecount_ratios_df
- product_revenue_df
- geo_revenue_df

## Imputing missing values using Regression
- When we inspect the distinct dataframes, we can observe that they have different lengths, indicating that there are missing values.

- However, to merge the dfs we need the same lenghts an no missing values, thus we use regression as an imputation method

- The following function checks for every ticker if there is a missing value (year) creates and fills the row with the value obtained from the regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

def impute_missing_data(df):
    if 'calendarYear' in df.columns:
        df['calendarYear'] = pd.to_numeric(df['calendarYear'], errors='coerce').fillna(0).astype(int)

    # Ensure DataFrame has unique symbol-year combinations
    df = df.drop_duplicates(subset=['symbol', 'calendarYear'], keep='first')

    # Get unique symbols and full year range
    symbols = df['symbol'].unique()
    all_years = range(2019, 2024)

    # Create a full DataFrame with all symbols and years
    full_df = pd.DataFrame([(s, y) for s in symbols for y in all_years], columns=['symbol', 'calendarYear'])

    # Merge the original DataFrame with the full DataFrame to fill in missing years
    merged_df = pd.merge(full_df, df, on=['symbol', 'calendarYear'], how='left')

    # Forward fill and backward fill to populate missing 'Label' and other categorical or static data
    merged_df['Label'] = merged_df.groupby('symbol')['Label'].apply(lambda x: x.ffill().bfill())

    # Replace infinite values with NaN
    merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Group by 'Label' and impute missing data for numeric columns using regression
    for label in merged_df['Label'].dropna().unique():
        label_group = merged_df[merged_df['Label'] == label]
        numeric_cols = label_group.select_dtypes(include=['number']).columns.difference(['Label'])

        for col in numeric_cols:
            not_null_data = label_group[label_group[col].notnull() & (label_group[col] != 0)]
            null_data = label_group[label_group[col].isnull() | (label_group[col] == 0)]
            
            if not not_null_data.empty and not null_data.empty:
                predictors = numeric_cols.drop(col)  # Use other numeric columns as predictors
                valid_data = not_null_data.dropna(subset=predictors)
                
                if len(valid_data) > 0:
                    reg = LinearRegression()
                    reg.fit(valid_data[predictors], valid_data[col])

                    # Predict missing or zero values if predictors are available
                    predictors_null = null_data[predictors].dropna()
                    if not predictors_null.empty:
                        predicted_values = reg.predict(predictors_null)
                        merged_df.loc[predictors_null.index, col] = predicted_values

    # Finalize DataFrame by sorting and potentially filling any remaining missing numeric data with 0 or other methods
    merged_df = merged_df.sort_values(by=['symbol', 'calendarYear'])
    
    # Optionally fill any remaining NaNs with 0 or other values
    merged_df.fillna(0, inplace=True)

    return merged_df



balancesheet_ratios_df = impute_missing_data(balancesheet_ratios_df)
cashflowstatement_ratios_df = impute_missing_data(cashflowstatement_ratios_df)
incomestatement_ratios_df = impute_missing_data(incomestatement_ratios_df)
keymetrics_ratios_df = impute_missing_data(keymetrics_ratios_df)
employeecount_ratios_df = impute_missing_data(employeecount_ratios_df)
product_revenue_df = impute_missing_data(product_revenue_df)
geo_revenue_df = impute_missing_data(geo_revenue_df)


In [None]:
balancesheet_ratios_df.info()
incomestatement_ratios_df.info()
cashflowstatement_ratios_df.info()
keymetrics_ratios_df.info()
employeecount_ratios_df.info() 
geo_revenue_df.info()
product_revenue_df.info()

In [None]:
merged_df = balancesheet_ratios_df
merged_df = pd.merge(merged_df, incomestatement_ratios_df, on=['symbol', 'calendarYear', 'Label'], how='outer')
merged_df = pd.merge(merged_df, cashflowstatement_ratios_df, on=['symbol', 'calendarYear', 'Label' ], how='outer')
merged_df = pd.merge(merged_df, keymetrics_ratios_df, on=['symbol', 'calendarYear', 'Label'], how='outer')
merged_df = pd.merge(merged_df, employeecount_ratios_df, on=['symbol', 'calendarYear', 'Label'], how='outer')
merged_df = pd.merge(merged_df, geo_revenue_df, on=['symbol', 'calendarYear', 'Label'], how='outer')
merged_df = pd.merge(merged_df, product_revenue_df, on=['symbol', 'calendarYear', 'Label'], how='outer')

# Check the merged DataFrame
merged_df

In [None]:
impute_missing_data(merged_df)

In [None]:
merged_df.info()

In [None]:
merged_df.info()

In [None]:
merged_df = merged_df.rename(columns={"Number of Product Segments_y": "Number of Geo Segments",
                                      "Number of Product Segments_x":"Number of Product Segments" })


In [None]:
merged_df["Number of Product Segments"]= merged_df["Number of Product Segments"].astype("float64")
merged_df["Number of Geo Segments"]= merged_df["Number of Geo Segments"].astype("float64")


In [None]:
merged_df.to_excel("finaldf_partner_imp.xlsx")