In [None]:
import requests
import warnings
from tqdm import tqdm, trange
import pandas as pd 
import streamlit as st 
import yfinance as yf
from datetime import datetime
from dateutil.relativedelta import relativedelta
import plotly_express  as px
import plotly.graph_objects as go
import random
import unittest
import tables
import pickle
import pyarrow.parquet as pq
import pyarrow as pa
import plotly.io as pio
import math
import investpy
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [119]:
pio.templates.default = "plotly_dark"

In [242]:
oi = yf.Ticker("TSLA")

In [243]:
oi.info

{'address1': '1 Tesla Road',
 'city': 'Austin',
 'state': 'TX',
 'zip': '78725',
 'country': 'United States',
 'phone': '512 516 8177',
 'website': 'https://www.tesla.com',
 'industry': 'Auto Manufacturers',
 'industryKey': 'auto-manufacturers',
 'industryDisp': 'Auto Manufacturers',
 'sector': 'Consumer Cyclical',
 'sectorKey': 'consumer-cyclical',
 'sectorDisp': 'Consumer Cyclical',
 'longBusinessSummary': 'Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. The company operates in two segments, Automotive, and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, body shop and parts, supercharging, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles through direct and used vehic

In [211]:
sp500_list = investpy.get_stocks_list(country="United States")
len(sp500_list)

4582

In [212]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp500_table = pd.read_html(url)[0]  # First table contains S&P 500 tickers

# Keep only the columns of interest
sp500_table = sp500_table[["Symbol"]]

In [214]:
sp500_list = sp500_table['Symbol'].to_list()
sp500_list

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADBE',
 'AMD',
 'AES',
 'AFL',
 'A',
 'APD',
 'ABNB',
 'AKAM',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'AON',
 'APA',
 'APO',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BAX',
 'BDX',
 'BRK.B',
 'BBY',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'BK',
 'BA',
 'BKNG',
 'BWA',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF.B',
 'BLDR',
 'BG',
 'BXP',
 'CHRW',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'COR',
 'CNC',
 'CNP',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CAG',
 'COP',
 'ED',
 'STZ',
 

In [255]:
data = []
progress = 0
for ticker in tqdm(sp500_list):
    stock = yf.Ticker(ticker)
    
    # Extract relevant financial metrics
    try:
        market_cap = stock.info["marketCap"]
        revenue_growth = stock.info.get("revenueGrowth", None)
        earningsGrowth = stock.info.get("earningsGrowth", None)
        enterpriseToEbitda = stock.info.get("earningsGrowth", None)
        enterpriseToRevenue  = stock.info.get("earningsGrowth", None)
        ebitda_margin = stock.info.get("ebitdaMargins", None)
        operatingMargins = stock.info.get("operatingMargins", None)
        de = stock.info.get("debtToEquity", None)
        pe = stock.info.get("trailingPE", None)
        roe = stock.info.get("returnOnEquity", None)
        roa = stock.info.get("returnOnAssets", None)
        
        data.append([ticker,market_cap,revenue_growth,earningsGrowth,enterpriseToEbitda,enterpriseToRevenue,ebitda_margin,operatingMargins,de, pe, roe,roa])
    
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")

# Convert to DataFrame
df = pd.DataFrame(data, columns=['Company', 'Market Cap', 'Rev Growth', 'NI Growth', 'EV/EBITDA', 'EV/Rev', 'EBITDA Margin', 'Operating Margin', 'Debt/Equity', 'PE', 'ROE','ROA'])
df_no_nan = df.dropna()


# Drop non-numeric columns
df_numeric = df.drop(columns=["Company"])

# Normalize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

# Convert back to DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=df_numeric.columns)
df_scaled.insert(0, "Company", df["Company"])  # Reinsert company names
df_scaled = df_scaled.dropna()
# Drop non-numeric columns
df_numeric = df.drop(columns=["Company"])

# Normalize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

# Convert back to DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=df_numeric.columns)
df_scaled.insert(0, "Company", df["Company"])  # Reinsert company names
df_scaled = df_scaled.dropna()
# Set number of clusters (can be adjusted)
k = 10  # Typically, 3-5 clusters work well for comps
kmeans = KMeans(n_clusters=k, random_state=42)

# Fit K-Means
df_scaled["Cluster"] = kmeans.fit_predict(df_scaled.drop(columns=["Company"]))
target_stock = "TSLA"  # Change to your stock
target_cluster = df_scaled[df_scaled["Company"] == target_stock]["Cluster"].values[0]

# Get companies in the same cluster
comparable_companies = df_scaled[df_scaled["Cluster"] == target_cluster]["Company"].tolist()
comparable_companies.remove(target_stock)  # Remove target itself

print(f"Comparable companies for {target_stock}: {comparable_companies}")

 12%|█▏        | 61/503 [00:29<03:30,  2.10it/s]

Error fetching data for BRK.B: 'marketCap'


 15%|█▌        | 76/503 [00:36<03:12,  2.22it/s]

Error fetching data for BF.B: 'marketCap'


100%|██████████| 503/503 [03:58<00:00,  2.11it/s]

Comparable companies for TSLA: ['ABT', 'ACN', 'ABNB', 'AKAM', 'ALGN', 'AMCR', 'AIG', 'AON', 'APTV', 'ADM', 'AJG', 'AIZ', 'AVY', 'AXON', 'BKR', 'BALL', 'BAX', 'BBY', 'TECH', 'BSX', 'BR', 'BLDR', 'BG', 'CHRW', 'KMX', 'CCL', 'CARR', 'CBRE', 'CDW', 'CE', 'COR', 'CNC', 'CRL', 'CVX', 'CB', 'CI', 'CSCO', 'CTSH', 'CMCSA', 'CAG', 'CEG', 'COO', 'CSGP', 'COST', 'CMI', 'CVS', 'DHR', 'DRI', 'DVA', 'DE', 'DAL', 'DXCM', 'DLR', 'DG', 'DOV', 'DOW', 'DHI', 'DTE', 'DD', 'EMN', 'ETN', 'EBAY', 'EA', 'ELV', 'EMR', 'ENPH', 'EPAM', 'EFX', 'EXPE', 'EXPD', 'XOM', 'FE', 'F', 'FTV', 'GEHC', 'GNRC', 'GD', 'GIS', 'GM', 'GPC', 'GL', 'HAL', 'HIG', 'HSIC', 'HPE', 'HON', 'HRL', 'HST', 'HUM', 'HII', 'IEX', 'IR', 'PODD', 'IP', 'IPG', 'IVZ', 'IQV', 'JBHT', 'JBL', 'J', 'JCI', 'JNPR', 'K', 'KVUE', 'KDP', 'KEYS', 'KR', 'LHX', 'LH', 'LW', 'LVS', 'LEN', 'LYV', 'LKQ', 'LMT', 'L', 'LYB', 'MPC', 'MLM', 'MKC', 'MDT', 'MGM', 'MCHP', 'MOH', 'TAP', 'MDLZ', 'NKE', 'NOC', 'NCLH', 'NUE', 'OMC', 'OKE', 'PCAR', 'PKG', 'PANW', 'PH', 'PYPL'




In [247]:
df

Unnamed: 0,Company,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA
0,MMM,82107334656,0.004,,,,0.24870,0.21433,295.441,20.964186,1.13294,0.09032
1,AOS,9758164992,-0.037,-0.089,-0.089,-0.089,0.20779,0.17793,7.471,18.539946,0.29544,0.14337
2,ABT,221889478656,0.049,0.146,0.146,0.146,0.26263,0.18749,37.589,16.744764,0.14840,0.06488
3,ABBV,324977033216,0.038,-0.123,-0.123,-0.123,0.46153,0.28928,1174.815,76.945600,0.56407,0.07720
4,ACN,240778526720,0.026,0.236,0.236,0.236,0.17052,0.14597,14.127,32.267395,0.26675,0.11627
...,...,...,...,...,...,...,...,...,...,...,...,...
495,XYL,30134896640,0.013,0.413,0.413,0.413,0.20050,0.13498,19.840,35.541546,0.08081,0.04429
496,YUM,36418895872,0.069,-0.075,-0.075,-0.075,0.36107,0.34447,,24.392525,,0.24328
497,ZBRA,20216264704,0.313,,,,0.17247,0.15618,69.307,53.180460,0.11874,0.05234
498,ZBH,21794621440,0.040,0.597,0.597,0.597,0.33670,0.17997,53.611,20.893131,0.08713,0.04610


In [248]:
df_no_nan = df.dropna()

In [249]:
df_no_nan

Unnamed: 0,Company,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA
1,AOS,9758164992,-0.037,-0.089,-0.089,-0.089,0.20779,0.17793,7.471,18.539946,0.29544,0.14337
2,ABT,221889478656,0.049,0.146,0.146,0.146,0.26263,0.18749,37.589,16.744764,0.14840,0.06488
3,ABBV,324977033216,0.038,-0.123,-0.123,-0.123,0.46153,0.28928,1174.815,76.945600,0.56407,0.07720
4,ACN,240778526720,0.026,0.236,0.236,0.236,0.17052,0.14597,14.127,32.267395,0.26675,0.11627
5,ADBE,190421991424,0.106,0.233,0.233,0.233,0.38831,0.36834,41.788,35.335220,0.35355,0.15969
...,...,...,...,...,...,...,...,...,...,...,...,...
492,WDAY,69707956224,0.167,0.661,0.661,0.661,0.08089,0.05324,40.312,43.101974,0.21081,0.01426
494,XEL,38589059072,-0.005,0.018,0.018,0.018,0.40035,0.25165,154.330,19.940653,0.10262,0.02564
495,XYL,30134896640,0.013,0.413,0.413,0.413,0.20050,0.13498,19.840,35.541546,0.08081,0.04429
498,ZBH,21794621440,0.040,0.597,0.597,0.597,0.33670,0.17997,53.611,20.893131,0.08713,0.04610


In [250]:
from sklearn.preprocessing import StandardScaler

# Drop non-numeric columns
df_numeric = df.drop(columns=["Company"])

# Normalize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

# Convert back to DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=df_numeric.columns)
df_scaled.insert(0, "Company", df["Company"])  # Reinsert company names
df_scaled = df_scaled.dropna()

In [251]:
from sklearn.cluster import KMeans

# Set number of clusters (can be adjusted)
k = 10  # Typically, 3-5 clusters work well for comps
kmeans = KMeans(n_clusters=k, random_state=42)

# Fit K-Means
df_scaled["Cluster"] = kmeans.fit_predict(df_scaled.drop(columns=["Company"]))

print(df_scaled)

    Company  Market Cap  Rev Growth  NI Growth  EV/EBITDA    EV/Rev  \
1       AOS   -0.303929   -0.522875  -0.286957  -0.286957 -0.286957   
2       ABT    0.328063   -0.130480  -0.123836  -0.123836 -0.123836   
3      ABBV    0.635186   -0.180670  -0.310558  -0.310558 -0.310558   
4       ACN    0.384338   -0.235423  -0.061365  -0.061365 -0.061365   
5      ADBE    0.234313    0.129597  -0.063447  -0.063447 -0.063447   
..      ...         ...         ...        ...        ...       ...   
492    WDAY   -0.125324    0.407924   0.233641   0.233641  0.233641   
494     XEL   -0.218035   -0.376868  -0.212685  -0.212685 -0.212685   
495     XYL   -0.243222   -0.294738   0.061497   0.061497  0.061497   
498     ZBH   -0.268070   -0.171544   0.189217   0.189217  0.189217   
499     ZTS   -0.103289    0.147848  -0.112036  -0.112036 -0.112036   

     EBITDA Margin  Operating Margin  Debt/Equity        PE       ROE  \
1        -0.387438         -0.204349    -0.421613 -0.269346 -0.000535   
2

In [254]:
target_stock = "TSLA"  # Change to your stock
target_cluster = df_scaled[df_scaled["Company"] == target_stock]["Cluster"].values[0]

# Get companies in the same cluster
comparable_companies = df_scaled[df_scaled["Cluster"] == target_cluster]["Company"].tolist()
comparable_companies.remove(target_stock)  # Remove target itself

print(f"Comparable companies for {target_stock}: {comparable_companies}")

Comparable companies for TSLA: ['ABT', 'ACN', 'ABNB', 'AKAM', 'ALGN', 'AMCR', 'AIG', 'AON', 'APTV', 'ADM', 'AJG', 'AIZ', 'AVY', 'AXON', 'BKR', 'BALL', 'BAX', 'BBY', 'TECH', 'BSX', 'BR', 'BLDR', 'BG', 'CHRW', 'KMX', 'CCL', 'CARR', 'CBRE', 'CDW', 'CE', 'COR', 'CNC', 'CRL', 'CVX', 'CB', 'CI', 'CSCO', 'CTSH', 'CMCSA', 'CAG', 'CEG', 'COO', 'CSGP', 'COST', 'CMI', 'CVS', 'DHR', 'DRI', 'DVA', 'DE', 'DAL', 'DXCM', 'DLR', 'DG', 'DOV', 'DOW', 'DHI', 'DTE', 'DD', 'EMN', 'ETN', 'EBAY', 'EA', 'ELV', 'EMR', 'ENPH', 'EPAM', 'EFX', 'EXPE', 'EXPD', 'XOM', 'FE', 'F', 'FTV', 'GEHC', 'GNRC', 'GD', 'GIS', 'GM', 'GPC', 'GL', 'HAL', 'HIG', 'HSIC', 'HPE', 'HON', 'HRL', 'HST', 'HUM', 'HII', 'IEX', 'IR', 'PODD', 'IP', 'IPG', 'IQV', 'JBHT', 'JBL', 'J', 'JCI', 'JNPR', 'K', 'KVUE', 'KDP', 'KEYS', 'KR', 'LHX', 'LH', 'LW', 'LVS', 'LEN', 'LYV', 'LKQ', 'LMT', 'L', 'LYB', 'MPC', 'MLM', 'MKC', 'MDT', 'MGM', 'MCHP', 'MOH', 'TAP', 'MDLZ', 'NKE', 'NOC', 'NCLH', 'NUE', 'OMC', 'OKE', 'PCAR', 'PKG', 'PANW', 'PH', 'PYPL', 'PNR'