In [535]:
import requests
import warnings
from tqdm import tqdm, trange
import streamlit as st
import pandas as pd 
import yfinance as yf
from datetime import datetime
from dateutil.relativedelta import relativedelta
import plotly_express  as px
import plotly.graph_objects as go
import random
import unittest
import tables
import pickle
import pyarrow.parquet as pq
import pyarrow as pa
import plotly.io as pio
import math
import investpy
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import subprocess
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [508]:
pio.templates.default = "plotly_dark"

In [905]:
oi = yf.Ticker("TSLA")

In [906]:
oi.info

{'address1': '1 Tesla Road',
 'city': 'Austin',
 'state': 'TX',
 'zip': '78725',
 'country': 'United States',
 'phone': '512 516 8177',
 'website': 'https://www.tesla.com',
 'industry': 'Auto Manufacturers',
 'industryKey': 'auto-manufacturers',
 'industryDisp': 'Auto Manufacturers',
 'sector': 'Consumer Cyclical',
 'sectorKey': 'consumer-cyclical',
 'sectorDisp': 'Consumer Cyclical',
 'longBusinessSummary': 'Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. The company operates in two segments, Automotive, and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, body shop and parts, supercharging, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles through direct and used vehic

In [907]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp500_table = pd.read_html(url)[0]  # First table contains S&P 500 tickers

# Keep only the columns of interest
sp500_table = sp500_table[["Symbol"]]
sp500_list = sp500_table['Symbol'].to_list()

data = []
progress = 0
st.markdown("Loading Data...")
for ticker in tqdm(sp500_list):
    stock = yf.Ticker(ticker)
    
    # Extract relevant financial metrics
    try:
        market_cap = stock.info["marketCap"]
        revenue_growth = stock.info.get("revenueGrowth", None)
        earningsGrowth = stock.info.get("earningsGrowth", None)
        enterpriseToEbitda = stock.info.get("earningsGrowth", None)
        enterpriseToRevenue  = stock.info.get("earningsGrowth", None)
        ebitda_margin = stock.info.get("ebitdaMargins", None)
        operatingMargins = stock.info.get("operatingMargins", None)
        de = stock.info.get("debtToEquity", None)
        pe = stock.info.get("trailingPE", None)
        roe = stock.info.get("returnOnEquity", None)
        roa = stock.info.get("returnOnAssets", None)
        industry = stock.info.get("industryKey", None)
        sector = stock.info.get("sectorKey", None)
        longname = stock.info.get("longName", None)
        
        data.append([ticker,market_cap,revenue_growth,earningsGrowth,enterpriseToEbitda,enterpriseToRevenue,ebitda_margin,operatingMargins,de, pe, roe,roa,industry,sector, longname ])
        progress = progress+(1/len(sp500_list))
    
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
    



  2%|▏         | 12/503 [00:05<04:04,  2.01it/s]


KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(data, columns=['Company', 'Market Cap', 'Rev Growth', 'NI Growth', 'EV/EBITDA', 'EV/Rev', 'EBITDA Margin', 'Operating Margin', 'Debt/Equity', 'PE', 'ROE','ROA', 'Industry', 'Sector', 'Name'])

In [860]:
df

Unnamed: 0,Company,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA,Industry,Sector,Name
0,MMM,82694529024,0.004,,,,0.24870,0.21433,295.441,20.951859,1.13294,0.09032,conglomerates,industrials,3M Company
1,AOS,9498622976,-0.037,-0.089,-0.089,-0.089,0.20779,0.17793,7.471,18.046831,0.29544,0.14337,specialty-industrial-machinery,industrials,A. O. Smith Corporation
2,ABT,222392467456,0.049,0.146,0.146,0.146,0.26263,0.18749,37.589,16.782722,0.14840,0.06488,medical-devices,healthcare,Abbott Laboratories
3,ABBV,341005008896,0.038,-0.123,-0.123,-0.123,0.46153,0.28928,1174.815,80.740585,0.56407,0.07720,drug-manufacturers-general,healthcare,AbbVie Inc.
4,ACN,242273419264,0.026,0.236,0.236,0.236,0.17052,0.14597,14.127,32.467728,0.26675,0.11627,information-technology-services,technology,Accenture plc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,XYL,31927830528,0.013,0.413,0.413,0.413,0.20050,0.13498,19.840,36.005478,0.08081,0.04429,specialty-industrial-machinery,industrials,Xylem Inc.
497,YUM,40189157376,0.069,-0.075,-0.075,-0.075,0.36107,0.34447,,27.641073,,0.24328,restaurants,consumer-cyclical,"Yum! Brands, Inc."
498,ZBRA,19435343872,0.313,,,,0.17247,0.15618,69.307,51.265305,0.11874,0.05234,communication-equipment,technology,Zebra Technologies Corporation
499,ZBH,20442908672,0.040,0.597,0.597,0.597,0.33670,0.17997,53.611,23.180588,0.08713,0.04610,medical-devices,healthcare,"Zimmer Biomet Holdings, Inc."


In [861]:
df_no_nan = df.dropna()
df_no_nan = df_no_nan.reset_index(drop=True)
df_no_nan

Unnamed: 0,Company,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA,Industry,Sector,Name
0,AOS,9498622976,-0.037,-0.089,-0.089,-0.089,0.20779,0.17793,7.471,18.046831,0.29544,0.14337,specialty-industrial-machinery,industrials,A. O. Smith Corporation
1,ABT,222392467456,0.049,0.146,0.146,0.146,0.26263,0.18749,37.589,16.782722,0.14840,0.06488,medical-devices,healthcare,Abbott Laboratories
2,ABBV,341005008896,0.038,-0.123,-0.123,-0.123,0.46153,0.28928,1174.815,80.740585,0.56407,0.07720,drug-manufacturers-general,healthcare,AbbVie Inc.
3,ACN,242273419264,0.026,0.236,0.236,0.236,0.17052,0.14597,14.127,32.467728,0.26675,0.11627,information-technology-services,technology,Accenture plc
4,ADBE,189529620480,0.106,0.233,0.233,0.233,0.38831,0.36834,41.788,35.198060,0.35355,0.15969,software-infrastructure,technology,Adobe Inc.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,WDAY,73900122112,0.167,0.661,0.661,0.661,0.08089,0.05324,40.312,45.694080,0.21081,0.01426,software-application,technology,"Workday, Inc."
372,XEL,38543122432,-0.005,0.018,0.018,0.018,0.40035,0.25165,154.330,19.511630,0.10262,0.02564,utilities-regulated-electric,utilities,Xcel Energy Inc.
373,XYL,31927830528,0.013,0.413,0.413,0.413,0.20050,0.13498,19.840,36.005478,0.08081,0.04429,specialty-industrial-machinery,industrials,Xylem Inc.
374,ZBH,20442908672,0.040,0.597,0.597,0.597,0.33670,0.17997,53.611,23.180588,0.08713,0.04610,medical-devices,healthcare,"Zimmer Biomet Holdings, Inc."


In [862]:
df_numeric = df_no_nan.drop(columns=["Company", "Industry", "Sector", "Name"])
df_numeric

Unnamed: 0,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA
0,9498622976,-0.037,-0.089,-0.089,-0.089,0.20779,0.17793,7.471,18.046831,0.29544,0.14337
1,222392467456,0.049,0.146,0.146,0.146,0.26263,0.18749,37.589,16.782722,0.14840,0.06488
2,341005008896,0.038,-0.123,-0.123,-0.123,0.46153,0.28928,1174.815,80.740585,0.56407,0.07720
3,242273419264,0.026,0.236,0.236,0.236,0.17052,0.14597,14.127,32.467728,0.26675,0.11627
4,189529620480,0.106,0.233,0.233,0.233,0.38831,0.36834,41.788,35.198060,0.35355,0.15969
...,...,...,...,...,...,...,...,...,...,...,...
371,73900122112,0.167,0.661,0.661,0.661,0.08089,0.05324,40.312,45.694080,0.21081,0.01426
372,38543122432,-0.005,0.018,0.018,0.018,0.40035,0.25165,154.330,19.511630,0.10262,0.02564
373,31927830528,0.013,0.413,0.413,0.413,0.20050,0.13498,19.840,36.005478,0.08081,0.04429
374,20442908672,0.040,0.597,0.597,0.597,0.33670,0.17997,53.611,23.180588,0.08713,0.04610


In [863]:
scaler = StandardScaler()
np_scaled = scaler.fit_transform(df_numeric)
np_scaled

array([[-0.29822088, -0.71864796, -0.28556118, ..., -0.25863331,
        -0.06072772,  1.38152902],
       [ 0.26125562, -0.09930945, -0.13249691, ..., -0.2728396 ,
        -0.24187626, -0.12428141],
       [ 0.57296463, -0.17852716, -0.30770665, ...,  0.44593095,
         0.27021571,  0.11207454],
       ...,
       [-0.23927782, -0.35856743,  0.04141016, ..., -0.05681064,
        -0.32514496, -0.51929526],
       [-0.26945973, -0.16412394,  0.16125622, ..., -0.20093918,
        -0.31735892, -0.48457087],
       [-0.11673862,  0.33998881, -0.12142417, ..., -0.0943188 ,
         0.15805738,  1.44426312]])

In [864]:
df_scaled = pd.DataFrame(np_scaled,columns=df_numeric.columns)
df_scaled.insert(0, "Industry", df_no_nan["Industry"])
df_scaled.insert(0, "Sector", df_no_nan["Sector"])
df_scaled.insert(0, "Name", df_no_nan["Name"])
df_scaled.insert(0, "Company", df_no_nan["Company"])
df_scaled

Unnamed: 0,Company,Name,Sector,Industry,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA
0,AOS,A. O. Smith Corporation,industrials,specialty-industrial-machinery,-0.298221,-0.718648,-0.285561,-0.285561,-0.285561,-0.426155,-0.257631,-0.395268,-0.258633,-0.060728,1.381529
1,ABT,Abbott Laboratories,healthcare,medical-devices,0.261256,-0.099309,-0.132497,-0.132497,-0.132497,-0.089361,-0.181061,-0.310251,-0.272840,-0.241876,-0.124281
2,ABBV,AbbVie Inc.,healthcare,drug-manufacturers-general,0.572965,-0.178527,-0.307707,-0.307707,-0.307707,1.132160,0.634215,2.899931,0.445931,0.270216,0.112075
3,ACN,Accenture plc,technology,information-technology-services,0.313502,-0.264946,-0.073877,-0.073877,-0.073877,-0.655044,-0.513611,-0.376480,-0.096569,-0.096073,0.861622
4,ADBE,Adobe Inc.,technology,software-infrastructure,0.174893,0.311182,-0.075831,-0.075831,-0.075831,0.682488,1.267438,-0.298398,-0.065885,0.010862,1.694624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,WDAY,"Workday, Inc.",technology,software-application,-0.128976,0.750481,0.202942,0.202942,0.202942,-1.205497,-1.256322,-0.302564,0.052072,-0.164989,-1.095413
372,XEL,Xcel Energy Inc.,utilities,utilities-regulated-electric,-0.221893,-0.488196,-0.215868,-0.215868,-0.215868,0.756430,0.332822,0.019288,-0.242172,-0.298276,-0.877091
373,XYL,Xylem Inc.,industrials,specialty-industrial-machinery,-0.239278,-0.358567,0.041410,0.041410,0.041410,-0.470926,-0.601634,-0.360353,-0.056811,-0.325145,-0.519295
374,ZBH,"Zimmer Biomet Holdings, Inc.",healthcare,medical-devices,-0.269460,-0.164124,0.161256,0.161256,0.161256,0.365531,-0.241292,-0.265023,-0.200939,-0.317359,-0.484571


In [865]:
for feature in df_scaled.columns.to_list()[4:]: 
    df_scaled_no_outliers = df_scaled.drop(df_scaled[df_scaled[feature] > 2].index)

In [866]:
df_scaled_no_outliers = df_scaled_no_outliers.reset_index(drop=True)
df_scaled_no_outliers

Unnamed: 0,Company,Name,Sector,Industry,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA
0,AOS,A. O. Smith Corporation,industrials,specialty-industrial-machinery,-0.298221,-0.718648,-0.285561,-0.285561,-0.285561,-0.426155,-0.257631,-0.395268,-0.258633,-0.060728,1.381529
1,ABT,Abbott Laboratories,healthcare,medical-devices,0.261256,-0.099309,-0.132497,-0.132497,-0.132497,-0.089361,-0.181061,-0.310251,-0.272840,-0.241876,-0.124281
2,ABBV,AbbVie Inc.,healthcare,drug-manufacturers-general,0.572965,-0.178527,-0.307707,-0.307707,-0.307707,1.132160,0.634215,2.899931,0.445931,0.270216,0.112075
3,ACN,Accenture plc,technology,information-technology-services,0.313502,-0.264946,-0.073877,-0.073877,-0.073877,-0.655044,-0.513611,-0.376480,-0.096569,-0.096073,0.861622
4,ADBE,Adobe Inc.,technology,software-infrastructure,0.174893,0.311182,-0.075831,-0.075831,-0.075831,0.682488,1.267438,-0.298398,-0.065885,0.010862,1.694624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,WDAY,"Workday, Inc.",technology,software-application,-0.128976,0.750481,0.202942,0.202942,0.202942,-1.205497,-1.256322,-0.302564,0.052072,-0.164989,-1.095413
359,XEL,Xcel Energy Inc.,utilities,utilities-regulated-electric,-0.221893,-0.488196,-0.215868,-0.215868,-0.215868,0.756430,0.332822,0.019288,-0.242172,-0.298276,-0.877091
360,XYL,Xylem Inc.,industrials,specialty-industrial-machinery,-0.239278,-0.358567,0.041410,0.041410,0.041410,-0.470926,-0.601634,-0.360353,-0.056811,-0.325145,-0.519295
361,ZBH,"Zimmer Biomet Holdings, Inc.",healthcare,medical-devices,-0.269460,-0.164124,0.161256,0.161256,0.161256,0.365531,-0.241292,-0.265023,-0.200939,-0.317359,-0.484571


In [867]:
df_scaled_no_outliers_numeric = df_scaled_no_outliers.drop(columns=["Company", "Industry", "Sector", "Name"])
df_scaled_no_outliers_numeric

Unnamed: 0,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA
0,-0.298221,-0.718648,-0.285561,-0.285561,-0.285561,-0.426155,-0.257631,-0.395268,-0.258633,-0.060728,1.381529
1,0.261256,-0.099309,-0.132497,-0.132497,-0.132497,-0.089361,-0.181061,-0.310251,-0.272840,-0.241876,-0.124281
2,0.572965,-0.178527,-0.307707,-0.307707,-0.307707,1.132160,0.634215,2.899931,0.445931,0.270216,0.112075
3,0.313502,-0.264946,-0.073877,-0.073877,-0.073877,-0.655044,-0.513611,-0.376480,-0.096569,-0.096073,0.861622
4,0.174893,0.311182,-0.075831,-0.075831,-0.075831,0.682488,1.267438,-0.298398,-0.065885,0.010862,1.694624
...,...,...,...,...,...,...,...,...,...,...,...
358,-0.128976,0.750481,0.202942,0.202942,0.202942,-1.205497,-1.256322,-0.302564,0.052072,-0.164989,-1.095413
359,-0.221893,-0.488196,-0.215868,-0.215868,-0.215868,0.756430,0.332822,0.019288,-0.242172,-0.298276,-0.877091
360,-0.239278,-0.358567,0.041410,0.041410,0.041410,-0.470926,-0.601634,-0.360353,-0.056811,-0.325145,-0.519295
361,-0.269460,-0.164124,0.161256,0.161256,0.161256,0.365531,-0.241292,-0.265023,-0.200939,-0.317359,-0.484571


In [868]:
inertia_list = []
for k in range(3,100):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(df_scaled_no_outliers_numeric)
    inertia_list.append(kmeans.inertia_)
inertia_df = pd.DataFrame(inertia_list)

In [869]:
inertia_fig = go.Figure()
inertia_fig.add_trace(go.Scatter(x = inertia_df.index, y =inertia_df[0] ))

inertia_fig

In [870]:
labels = KMeans(n_clusters=7).fit_predict(df_scaled_no_outliers_numeric)
labels

array([4, 4, 4, 0, 4, 0, 0, 4, 4, 0, 6, 0, 4, 2, 3, 3, 3, 0, 2, 4, 0, 2,
       4, 4, 2, 4, 4, 4, 0, 4, 0, 4, 4, 0, 2, 4, 4, 2, 0, 0, 0, 0, 0, 6,
       0, 0, 2, 6, 4, 2, 0, 4, 0, 0, 0, 4, 0, 4, 0, 4, 4, 0, 0, 0, 0, 0,
       4, 4, 0, 4, 0, 4, 0, 0, 4, 0, 6, 2, 4, 4, 0, 4, 4, 0, 4, 4, 4, 0,
       4, 2, 0, 0, 2, 6, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 6, 0,
       0, 0, 0, 2, 0, 0, 4, 4, 6, 6, 4, 0, 4, 0, 2, 2, 0, 4, 4, 2, 4, 2,
       2, 0, 4, 2, 0, 4, 4, 2, 2, 4, 2, 0, 4, 0, 4, 4, 4, 4, 4, 6, 0, 2,
       0, 0, 0, 0, 0, 2, 2, 4, 0, 0, 5, 2, 0, 4, 2, 0, 4, 4, 4, 0, 0, 4,
       4, 0, 0, 4, 4, 0, 4, 4, 2, 0, 0, 0, 2, 0, 0, 0, 4, 0, 4, 0, 0, 0,
       0, 4, 0, 4, 2, 2, 4, 0, 0, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, 0, 0, 2,
       4, 4, 4, 0, 0, 4, 3, 6, 0, 0, 3, 2, 0, 0, 0, 4, 4, 2, 4, 4, 4, 4,
       6, 6, 2, 0, 4, 4, 2, 0, 4, 0, 4, 2, 0, 4, 4, 4, 0, 4, 0, 0, 4, 2,
       4, 0, 4, 0, 4, 0, 2, 0, 0, 4, 4, 4, 2, 6, 4, 2, 4, 0, 4, 0, 0, 2,
       2, 4, 4, 4, 1, 0, 4, 2, 0, 4, 2, 4, 4, 0, 0,

In [871]:
df_scaled_no_outliers_numeric["Cluster"] = labels
df_scaled_no_outliers_numeric.insert(0,"Sector", df_scaled_no_outliers["Sector"])
df_scaled_no_outliers_numeric.insert(0,"Industry", df_scaled_no_outliers["Industry"])
df_scaled_no_outliers_numeric.insert(0,"Name", df_scaled_no_outliers["Name"])
df_scaled_no_outliers_numeric.insert(0,"Company", df_scaled_no_outliers["Company"])

In [872]:
df_scaled_no_outliers_numeric

Unnamed: 0,Company,Name,Industry,Sector,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA,Cluster
0,AOS,A. O. Smith Corporation,specialty-industrial-machinery,industrials,-0.298221,-0.718648,-0.285561,-0.285561,-0.285561,-0.426155,-0.257631,-0.395268,-0.258633,-0.060728,1.381529,4
1,ABT,Abbott Laboratories,medical-devices,healthcare,0.261256,-0.099309,-0.132497,-0.132497,-0.132497,-0.089361,-0.181061,-0.310251,-0.272840,-0.241876,-0.124281,4
2,ABBV,AbbVie Inc.,drug-manufacturers-general,healthcare,0.572965,-0.178527,-0.307707,-0.307707,-0.307707,1.132160,0.634215,2.899931,0.445931,0.270216,0.112075,4
3,ACN,Accenture plc,information-technology-services,technology,0.313502,-0.264946,-0.073877,-0.073877,-0.073877,-0.655044,-0.513611,-0.376480,-0.096569,-0.096073,0.861622,0
4,ADBE,Adobe Inc.,software-infrastructure,technology,0.174893,0.311182,-0.075831,-0.075831,-0.075831,0.682488,1.267438,-0.298398,-0.065885,0.010862,1.694624,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,WDAY,"Workday, Inc.",software-application,technology,-0.128976,0.750481,0.202942,0.202942,0.202942,-1.205497,-1.256322,-0.302564,0.052072,-0.164989,-1.095413,0
359,XEL,Xcel Energy Inc.,utilities-regulated-electric,utilities,-0.221893,-0.488196,-0.215868,-0.215868,-0.215868,0.756430,0.332822,0.019288,-0.242172,-0.298276,-0.877091,2
360,XYL,Xylem Inc.,specialty-industrial-machinery,industrials,-0.239278,-0.358567,0.041410,0.041410,0.041410,-0.470926,-0.601634,-0.360353,-0.056811,-0.325145,-0.519295,0
361,ZBH,"Zimmer Biomet Holdings, Inc.",medical-devices,healthcare,-0.269460,-0.164124,0.161256,0.161256,0.161256,0.365531,-0.241292,-0.265023,-0.200939,-0.317359,-0.484571,4


In [873]:
df_final = df_scaled_no_outliers_numeric
df_final

Unnamed: 0,Company,Name,Industry,Sector,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA,Cluster
0,AOS,A. O. Smith Corporation,specialty-industrial-machinery,industrials,-0.298221,-0.718648,-0.285561,-0.285561,-0.285561,-0.426155,-0.257631,-0.395268,-0.258633,-0.060728,1.381529,4
1,ABT,Abbott Laboratories,medical-devices,healthcare,0.261256,-0.099309,-0.132497,-0.132497,-0.132497,-0.089361,-0.181061,-0.310251,-0.272840,-0.241876,-0.124281,4
2,ABBV,AbbVie Inc.,drug-manufacturers-general,healthcare,0.572965,-0.178527,-0.307707,-0.307707,-0.307707,1.132160,0.634215,2.899931,0.445931,0.270216,0.112075,4
3,ACN,Accenture plc,information-technology-services,technology,0.313502,-0.264946,-0.073877,-0.073877,-0.073877,-0.655044,-0.513611,-0.376480,-0.096569,-0.096073,0.861622,0
4,ADBE,Adobe Inc.,software-infrastructure,technology,0.174893,0.311182,-0.075831,-0.075831,-0.075831,0.682488,1.267438,-0.298398,-0.065885,0.010862,1.694624,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,WDAY,"Workday, Inc.",software-application,technology,-0.128976,0.750481,0.202942,0.202942,0.202942,-1.205497,-1.256322,-0.302564,0.052072,-0.164989,-1.095413,0
359,XEL,Xcel Energy Inc.,utilities-regulated-electric,utilities,-0.221893,-0.488196,-0.215868,-0.215868,-0.215868,0.756430,0.332822,0.019288,-0.242172,-0.298276,-0.877091,2
360,XYL,Xylem Inc.,specialty-industrial-machinery,industrials,-0.239278,-0.358567,0.041410,0.041410,0.041410,-0.470926,-0.601634,-0.360353,-0.056811,-0.325145,-0.519295,0
361,ZBH,"Zimmer Biomet Holdings, Inc.",medical-devices,healthcare,-0.269460,-0.164124,0.161256,0.161256,0.161256,0.365531,-0.241292,-0.265023,-0.200939,-0.317359,-0.484571,4


In [874]:
target_stock = "WMT"

In [875]:
target_cluster = df_final[df_final["Company"] == target_stock]["Cluster"].values[0]
target_sector = df_final[df_final["Company"] == target_stock]["Sector"].values[0]
target_industry = df_final[df_final["Company"] == target_stock]["Industry"].values[0]
target_industry

'discount-stores'

In [876]:
df_final[df_final["Cluster"] == 8]

Unnamed: 0,Company,Name,Industry,Sector,Market Cap,Rev Growth,NI Growth,EV/EBITDA,EV/Rev,EBITDA Margin,Operating Margin,Debt/Equity,PE,ROE,ROA,Cluster


## Same Cluster

In [877]:
comparable_Companies_list = df_final[df_final["Cluster"]==target_cluster]["Company"].to_list()
comparable_Sectors_list = df_final[df_final["Cluster"]==target_cluster]["Sector"].to_list()
comparable_Industries_list = df_final[df_final["Cluster"]==target_cluster]["Industry"].to_list()
comparable_Names_list = df_final[df_final["Cluster"]==target_cluster]["Name"].to_list()
comparables_df = pd.DataFrame()
comparables_df["Company"] = comparable_Companies_list
comparables_df["Name"] = comparable_Names_list
comparables_df["Sector"] = comparable_Sectors_list
comparables_df["Industry"] = comparable_Industries_list
comparables_df

Unnamed: 0,Company,Name,Sector,Industry
0,ACN,Accenture plc,technology,information-technology-services
1,AMD,"Advanced Micro Devices, Inc.",technology,semiconductors
2,AES,The AES Corporation,utilities,utilities-diversified
3,AKAM,"Akamai Technologies, Inc.",technology,software-infrastructure
4,ALGN,"Align Technology, Inc.",healthcare,medical-instruments-supplies
...,...,...,...,...
148,WAB,Westinghouse Air Brake Technologies Corporation,industrials,railroads
149,WMT,Walmart Inc.,consumer-defensive,discount-stores
150,WY,Weyerhaeuser Company,real-estate,reit-specialty
151,WDAY,"Workday, Inc.",technology,software-application


## Same Cluster and Sector

In [878]:
comparables_df[comparables_df["Sector"] == target_sector]

Unnamed: 0,Company,Name,Sector,Industry
8,ADM,Archer-Daniels-Midland Company,consumer-defensive,farm-products
19,BG,Bunge Global SA,consumer-defensive,farm-products
34,CAG,"Conagra Brands, Inc.",consumer-defensive,packaged-foods
37,COST,Costco Wholesale Corporation,consumer-defensive,discount-stores
47,DG,Dollar General Corporation,consumer-defensive,discount-stores
64,GIS,"General Mills, Inc.",consumer-defensive,packaged-foods
71,HRL,Hormel Foods Corporation,consumer-defensive,packaged-foods
85,K,Kellanova,consumer-defensive,packaged-foods
86,KVUE,Kenvue Inc.,consumer-defensive,household-personal-products
88,KR,The Kroger Co.,consumer-defensive,grocery-stores


## Same Cluster Sector and Industry

In [879]:
comparables_df[comparables_df["Industry"] == target_industry]

Unnamed: 0,Company,Name,Sector,Industry
37,COST,Costco Wholesale Corporation,consumer-defensive,discount-stores
47,DG,Dollar General Corporation,consumer-defensive,discount-stores
131,TGT,Target Corporation,consumer-defensive,discount-stores
149,WMT,Walmart Inc.,consumer-defensive,discount-stores


## Same Sector

In [880]:
df_no_nan[df_no_nan['Sector'] == target_sector].loc[:,['Company', 'Name', 'Sector','Industry' ]]

Unnamed: 0,Company,Name,Sector,Industry
31,ADM,Archer-Daniels-Midland Company,consumer-defensive,farm-products
54,BG,Bunge Global SA,consumer-defensive,farm-products
77,CLX,The Clorox Company,consumer-defensive,household-personal-products
80,KO,The Coca-Cola Company,consumer-defensive,beverages-non-alcoholic
82,CL,Colgate-Palmolive Company,consumer-defensive,household-personal-products
84,CAG,"Conagra Brands, Inc.",consumer-defensive,packaged-foods
92,COST,Costco Wholesale Corporation,consumer-defensive,discount-stores
109,DG,Dollar General Corporation,consumer-defensive,discount-stores
159,GIS,"General Mills, Inc.",consumer-defensive,packaged-foods
170,HSY,The Hershey Company,consumer-defensive,confectioners


## Same Industry

In [881]:
df_no_nan[df_no_nan['Industry'] == target_industry].loc[:,['Company', 'Name', 'Sector','Industry' ]]

Unnamed: 0,Company,Name,Sector,Industry
92,COST,Costco Wholesale Corporation,consumer-defensive,discount-stores
109,DG,Dollar General Corporation,consumer-defensive,discount-stores
329,TGT,Target Corporation,consumer-defensive,discount-stores
363,WMT,Walmart Inc.,consumer-defensive,discount-stores


## Visualisation Using PCA

In [882]:
from sklearn.manifold import TSNE

In [883]:
tsne = TSNE(n_components=3, perplexity=30, random_state=42)
tsne_result = tsne.fit_transform(df_scaled_no_outliers_numeric.drop(columns=["Company", "Industry", "Sector", "Name", "Cluster"]))

In [887]:
df_tsne = pd.DataFrame(tsne_result, columns=["tSNE1", "tSNE2", 'tSNE3'])
df_tsne['Cluster'] = df_final["Cluster"]
df_tsne['Company'] = df_final["Company"]
df_tsne

Unnamed: 0,tSNE1,tSNE2,tSNE3,Cluster,Company
0,-4.376871,0.605239,0.366055,4,AOS
1,0.437651,-0.406004,0.410598,4,ABT
2,4.187847,1.836024,4.493724,4,ABBV
3,-3.260392,0.992293,-1.555878,0,ACN
4,-2.239415,2.711203,4.970285,4,ADBE
...,...,...,...,...,...
358,2.233757,0.207930,-5.392731,0,WDAY
359,2.796302,-2.125481,4.465632,2,XEL
360,0.552523,-0.606837,-2.057818,0,XYL
361,3.271907,-1.346254,0.666654,4,ZBH


In [902]:
tsne_fig = go.Figure()
for cluster in df_tsne["Cluster"].unique():
    
    cluster_data = df_tsne[df_tsne["Cluster"] == cluster]
    tsne_fig.add_trace(go.Scatter3d(x = cluster_data['tSNE1'], y = cluster_data['tSNE2'],z =cluster_data['tSNE3'] , name=f"Cluster: {cluster}", mode = 'markers',), )
    tsne_fig.update_traces(marker={'size': 3,})
    tsne_fig.update_layout(
    autosize=False,
    width=1100,
    height=600,
)

In [903]:
tsne_fig