<a href="https://colab.research.google.com/github/Jangalang2911/ML-Ongoing/blob/master/dimensionality_reduction_stocks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tentative Schema
1. Scrape Data, inspect it
2. Carry out pca for a selection of dimensions
3. Carry out tsne for a selection of dimensions
4. Carry out umap for a selection of dimensions
5. Graph 2d, 3d reductions for all methods
6. Calculate inertia, davies-bouldin score for all dimensions and methods; graph with method as color label

7. Add support for vector embeddings later (for industry)

In [None]:
!pip install yfinance
!pip install gensim

In [1]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82807 sha256=1e8f8472dff8a433c7fda9033a210bb4e25d9e9d81d00f51265b21291a8b30cf
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d035ef606acc
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for pyn

In [71]:
import os
import sys

import pandas as pd
import numpy as np
import tensorflow as tf
import yfinance as yf
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [3]:
import mpl_toolkits
import math

# **Data Preprocessing**

In [None]:
#Inspecting yfinance API
ticker = yf.Ticker("MSFT")
ticker.info

In [4]:
#Scraping data from Nasdaq API
nasdaq_df = pd.read_csv('/content/sample_data/nasdaq_screener_1692679334427.csv')
stock_tickers = list(nasdaq_df.pop('Symbol'))
stock_tickers = [str(item) for item in stock_tickers]

In [None]:
#Identifying characters not supported by yfinance API for requests
ticker_vocab = []
for ticker in stock_tickers:
  ticker_vocab.extend(ticker)
ticker_vocab = list(np.unique(np.array(ticker_vocab)))
# Identified characters : "^", "/", " "
ticker_vocab

In [None]:
for ticker in stock_tickers:
  if "^" in ticker or "/" in ticker or " " in ticker:
    stock_tickers.remove(ticker)

check_char = [True for x in stock_tickers if '^' not in x or '/' not in x  or ' ' not in x]
check_char = set(check_char)
check_char

{True}

In [None]:
#Scraping data from yfinance API
list_data = []
for ticker in stock_tickers:
  stock = yf.Ticker(ticker)
  list_data.append(stock.info)

In [None]:
len(list_data)

7104

In [None]:
#Identifting extraneous categorical variables for deletion
list_data[0]

In [None]:
stock_df = pd.DataFrame.from_dict(list_data)
stock_df.to_csv('/content/stock_data.csv') # exporting the csv to avoid making repeated api requests

In [40]:
orig_df = pd.read_csv('/content/sample_data/nasdaq.csv', header=0)
stock_df = orig_df.copy()

In [41]:
print(list(stock_df.keys()))

['Symbol', 'Name', 'Last Sale', 'Net Change', '% Change', 'Market Cap', 'Country', 'IPO Year', 'Volume', 'Sector', 'Industry']


In [42]:
stock_df.shape

(7535, 11)

In [43]:
# industry = stock_df.pop('industry') # the only categorical variables of these worth keeping
# sector = stock_df.pop('sector')
# exchange = stock_df.pop('exchange')
# currency = stock_df.pop('financialCurrency')

# stock_df.pop('address1')
# stock_df.pop('quoteType')
# stock_df.pop('shortName')
# stock_df.pop('zip')
# stock_df.pop('phone')
# stock_df.pop('fax')
# stock_df.pop('city')
# stock_df.pop('country')
# stock_df.pop('state')
# stock_df.pop('website')
# stock_df.pop('industryDisp')
# stock_df.pop('sectorDisp')
# stock_df.pop('longBusinessSummary')
# stock_df.pop('gmtOffSetMilliseconds')
# stock_df.pop('messageBoardId')
# stock_df.pop('uuid')
# stock_df.pop('timeZoneShortName')
# stock_df.pop('timeZoneFullName')
# stock_df.pop('longName')
# stock_df.pop('firstTradeDateEpochUtc')
# stock_df.pop('symbol')
# stock_df.pop('underlyingSymbol')
# stock_df.pop('currency')
# stock_df.pop('fullTimeEmployees')
# stock_df.pop('companyOfficers')
# stock_df.pop('address2')
# stock_df.pop('lastSplitDate')
# stock_df.pop('lastDividendDate')
# stock_df.pop('Unnamed: 0')
# stock_df.pop('governanceEpochDate')
# stock_df.pop('exDividendDate')
# stock_df.pop('industrySymbol')
# stock_df.pop('recommendationKey')
# stock_df.pop('maxAge')

stock_df.pop('Symbol')
stock_df.pop('Name')
country = stock_df.pop('Country')
ipo_year = stock_df.pop('IPO Year')
sector = stock_df.pop('Sector')
stock_df.pop('Industry')

0        Biotechnology: Laboratory Analytical Instruments
1                                                Aluminum
2                                      Metal Fabrications
3                                    Educational Services
4                                            Blank Checks
                              ...                        
7530           Biotechnology: Pharmaceutical Preparations
7531                      Industrial Machinery/Components
7532           Biotechnology: Pharmaceutical Preparations
7533           Biotechnology: Pharmaceutical Preparations
7534    Biotechnology: Electromedical & Electrotherape...
Name: Industry, Length: 7535, dtype: object

In [44]:
stock_df

Unnamed: 0,Last Sale,Net Change,% Change,Market Cap,Volume
0,$118.86,-0.390,-0.327%,3.510837e+10,2249263
1,$28.59,-0.380,-1.312%,5.101874e+09,4694604
2,$10.66,0.010,0.094%,7.674889e+08,44383
3,$1.21,-0.040,-3.20%,3.826283e+07,10670
4,$10.79,0.000,0.00%,0.000000e+00,86
...,...,...,...,...,...
7530,$0.1625,0.007,4.502%,3.286605e+06,222658
7531,$28.79,0.040,0.139%,4.979166e+09,719824
7532,$7.65,0.230,3.10%,5.188742e+08,335844
7533,$1.27,-0.050,-3.788%,6.850308e+07,790985


In [45]:
df_keys = stock_df.keys()
num_obs = stock_df.shape[0]


#Converting strings to floats
for i in range(num_obs):
  last_sale = stock_df['Last Sale'][i]
  per_change = stock_df['% Change'][i]

  if isinstance(last_sale, str): # buffer for nan values
     stock_df['Last Sale'][i] = float(stock_df['Last Sale'][i][1:])
  if isinstance(per_change, str):
    stock_df['% Change'][i] = float(stock_df['% Change'][i][:-1])


# Resolving nan values
stock_df.replace([np.inf, -np.inf], np.nan, inplace=True)

for key in df_keys:
  num_nan = stock_df[key].isnull().sum()
  if ((num_nan/num_obs) >= 0.3):
    stock_df.pop(key)
    print(( 'Popped {}, {} null out of {}').format(key, num_nan, num_obs))

  else:
    median = stock_df[key].median()
    stock_df[key].fillna(median, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [46]:
#Market Cap Labels (in billions)
cap_labels = []
for i in range(num_obs):
  stat = stock_df['Market Cap'][i]/10**9

  if 10 <= stat:
    cap_labels.append('blue')
  elif 2 <= stat < 10:
    cap_labels.append('red')
  else:
    cap_labels.append('green')

In [47]:
#IPO Year Labels
ipo_year.fillna(0, inplace=True)
ipo_year.unique()


array([1999., 2016., 2021., 2008., 2023.,    0., 2020., 2013., 1980.,
       2011., 2015., 2012., 1994., 2002., 2004., 2018., 2022., 1985.,
       2017., 2000., 2007., 2001., 2006., 1988., 1986., 2003., 2019.,
       2009., 2014., 1997., 1995., 2010., 1991., 1983., 1972., 1998.,
       2005., 1996., 1992., 1993., 1971., 1925., 1965., 1981., 1987.,
       1951., 1989., 1990., 1982., 1930., 1960., 1975., 1978., 1929.,
       1968., 1946., 1984., 1976., 1969., 1970., 1973.])

In [12]:
ipo_labels = []
for year in ipo_year:
  if 1950 <= year < 1960:
    ipo_labels.append('red')
  elif 1960 <= year < 1980:
    ipo_labels.append('green')
  elif 1980 <= year <= 2000:
    ipo_labels.append('green')
  elif 2000 <= year < 2010:
    ipo_labels.append('black')
  elif 2010 <= year < 2020:
    ipo_labels.append('brown')
  elif 2020 <= year:
    ipo_labels.append('purple')
  else:
    ipo_labels.append('grey')


In [13]:
#Sector Labels
sector.fillna("nan", inplace=True)
sector.unique()

array(['Industrials', 'Consumer Discretionary', 'Finance', 'nan',
       'Health Care', 'Real Estate', 'Technology', 'Consumer Staples',
       'Energy', 'Miscellaneous', 'Utilities', 'Basic Materials',
       'Telecommunications'], dtype=object)

In [14]:
colors = {"Industrials": "red", "Consumer Discretionary" :"green", "Finance" :"blue", "nan" :"yellow",
          "Health Care": "pink", "Real Estate" :"black", "Technology": "orange", "Consumer Staples" :"purple", "Energy" :"beige",
          "Miscellaneous":"brown", "Utilities":"gray", "Basic Materials":"cyan", "Telecommunications":"magenta"} #in order of sectors
sector_labels = []
for key in sector:
  sector_labels.append(colors[key])
sector_labels = np.array(sector_labels)

In [None]:
stock_df.keys()

Index(['Last Sale', 'Net Change', '% Change', 'Market Cap', 'Volume'], dtype='object')

1. 'industry' is a key categorical variable in comparing the other numeric features of a corporation's performance. However, for our purposes, we can afford not to be hyperspecific, and club some kinds of industries together.

2. Instead of identifying similar industries manually, I'll use word embeddings to do so. Apart from being more efficient, it also injects an extra bit of complexity into the project.

In [54]:
is_null =[]
for key in stock_df.keys():
  is_null.append(stock_df[key].isnull().unique())
is_null

[array([False]),
 array([False]),
 array([False]),
 array([False]),
 array([False])]

# **Inspecting Data**
1. Since we have 105 features, attempting to make scatterplot matrices for all would be unfeasible
2. Therefore, we'll be a creating scatterplot matrices of just a few of the features to get a broad idea of the underlying patterns (if any)

In [None]:
#features = ['regularMarketVolume', 'marketCap', 'profitMargins', 'regularMarketOpen']

features = ['Last Sale', 'Net Change', '% Change', 'Volume']
scatter_matrix = px.scatter_matrix(orig_df, dimensions=features, color='Sector', width=1500, height=1000)
scatter_matrix.show()

In [None]:
# metrics = [stock_df['regularMarketVolume'], stock_df['marketCap'],
#            sector]
# plot_df = pd.DataFrame(metrics, columns=['regularMarketVolume', 'marketCap', 'sector'])
# sns.pairplot(plot_df, hue='sector')

# **Models begin here**

In [55]:
np.random.seed(42)

**PCA**

In [56]:
# Standardizing the data to avoid feature domination
pca_data2 = stock_df.copy()
scaler = StandardScaler()

scaler.fit(pca_data2)
scaled_data = scaler.transform(pca_data2)

pca_2d = PCA(n_components=2)
pca_2d.fit(pca_data2)
pca_proj2d = pca_2d.transform(pca_data2)

In [57]:
pca_proj2d.shape

(7535, 2)

In [58]:
#matplotlib
# plt.figure(figsize=(5, 5))
# plt.scatter(pca_output2[: , 0], pca_output2[: , 1],c=cap_labels, cmap='plasma')
# plt.xlabel('pc1')
# plt.ylabel('pc2')

#plotly
fig = px.scatter(pca_proj2d, x=0, y=1,color = cap_labels)
fig.show()


In [59]:
pca_data3 = stock_df.copy()
scaler3 = StandardScaler()
scaler3.fit(pca_data3)
scaled_data3 = scaler.transform(pca_data3)


pca_3d = PCA(n_components=3)
pca_3d.fit(scaled_data3)
pca_proj3d = pca_3d.transform(scaled_data3)


In [60]:
from mpl_toolkits.mplot3d import Axes3D

In [61]:
#matplotlib
# fig = plt.figure(figsize=[7, 7])
# axis = fig.add_subplot(111, projection='3d')

# axis.scatter(pca_proj3d[:, 0], pca_proj3d[:, 1], pca_proj3d[:, 2], c=cap_labels, cmap='plasma')
# axis.set_xlabel('pc1')
# axis.set_ylabel('pc2')
# axis.set_zlabel('pc3')


#plotly
fig = px.scatter_3d(pca_proj3d, x=0, y=1, z=2, color=cap_labels, labels={'color': 'Market Cap'})
fig.show()

# **TSNE**

In [62]:
tsne_data2 = stock_df.copy()
tsne_2d = TSNE(n_components=2)
tsne_proj2d = tsne_2d.fit_transform(tsne_data2)

In [63]:
tsne_proj2d.shape

(7535, 2)

In [64]:
#matplotlib
# fig = plt.figure(figsize=[5,5])
# x, y = tsne_output2.T
# fig = plt.scatter(x, y, c=cap_labels, cmap='plasma')
# plt.xlabel('tsne1')
# plt.ylabel('tsne2')

#plotly
fig = px.scatter(tsne_proj2d, x=0, y=1, color=cap_labels )
fig.show()

In [65]:
tsne_data3 = stock_df.copy()
tsne_3d = TSNE(n_components=3)
tsne_proj3d = tsne_3d.fit_transform(tsne_data3)

In [66]:
#matplotlib
# x, y, z = tsne_output3.T

# fig = plt.figure(figsize=[10, 10])
# axis = fig.add_subplot(111, projection='3d')

# axis.scatter(x, y, z, c=cap_labels, cmap='plasma')
# axis.set_xlabel('tsne1')
# axis.set_ylabel('tsne2')
# axis.set_zlabel('tsne3')

#plotly
fig = px.scatter_3d(tsne_proj3d, x=0, y=1, z=2, color=cap_labels)
fig.show()

# **UMAP**

In [81]:
umap_data2 = stock_df.copy()
scaler = StandardScaler()
scaled_data2 = scaler.fit_transform(umap_data2)
umap_2 = UMAP(n_components=2)
umap_proj2d = umap_2.fit_transform(scaled_data2)

In [82]:
#matplotlib
# x, y = proj2.T
# plt.scatter(x, y, c=cap_labels, cmap='plasma')
# plt.xlabel('umap1')
# plt.ylabel('umap2')

#plotly
fig = px.scatter(umap_proj2d, x=0, y=1, color=cap_labels)
fig.show()

In [83]:
umap_data3 = stock_df.copy()
scaled_data3 = StandardScaler().fit_transform(umap_data3)
umap3 = UMAP(n_components=3)
umap_proj3d = umap3.fit_transform(umap_data3)

In [84]:
#matplotlib
# x, y, z = proj3.T

# fig = plt.figure(figsize=[10, 10])
# axis = fig.add_subplot(111, projection='3d')

# axis.scatter(x, y, z, c=cap_labels, cmap='plasma')
# axis.set_xlabel('umap1')
# axis.set_ylabel('umap2')
# axis.set_zlabel('umap3')

#plotly
fig = px.scatter_3d(umap_proj3d, x=0, y=1, z=2, color=cap_labels)
fig.show()

# **Evaluation and Comparison**
1. Picking evaluation metrics for dimensionality reduction techniques can be tricky, since these metrics take into account the nuances of the technique itself.
2. For e.g., cumulative explained variance can be used for PCA, but not for TSNE or UMAP since the latter two focus on preserving relationships rather than explaining variance.
3. To compare all three techniques, we'll be assessing the
     Quality of Clusters, Reconstruction Error, and Neighbourhood Preservation.

**Quality of Clusters**

In [85]:
orig_data = stock_df.copy()

kmeans_orig = KMeans(n_clusters=3)
kmeans_pca2d = KMeans(n_clusters=3)
kmeans_pca3d = KMeans(n_clusters=3)
kmeans_tsne2d = KMeans(n_clusters=3)
kmeans_tsne3d = KMeans(n_clusters=3)
kmeans_umap2d = KMeans(n_clusters=3)
kmeans_umap3d = KMeans(n_clusters=3)

In [None]:
labels_orig = kmeans_orig.fit_predict(orig_data)

labels_pca2d = kmeans_pca2d.fit_predict(pca_proj2d)
labels_pca3d = kmeans_pca3d.fit_predict(pca_proj3d)

labels_tsne2d = kmeans_tsne2d.fit_predict(tsne_proj2d)
labels_tsne3d = kmeans_tsne3d.fit_predict(tsne_proj3d)

labels_umap2d = kmeans_umap2d.fit_predict(umap_proj2d)
labels_umap3d = kmeans_umap3d.fit_predict(umap_proj3d)


In [88]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [120]:
#Calculating silhouette scores, Davies-Bouldin scores for clusters
sil_orig = silhouette_score(orig_data, labels_orig)

sil_pca2d = silhouette_score(pca_proj2d, labels_pca2d)
sil_pca3d = silhouette_score(pca_proj3d, labels_pca3d)

sil_tsne2d = silhouette_score(tsne_proj2d, labels_tsne2d)
sil_tsne3d = silhouette_score(tsne_proj3d, labels_tsne3d)

sil_umap2d = silhouette_score(umap_proj2d, labels_umap2d)
sil_umap3d = silhouette_score(umap_proj2d, labels_umap3d)

In [121]:
db_orig = davies_bouldin_score(orig_data, labels_orig)

db_pca2d = davies_bouldin_score(pca_proj2d, labels_pca2d)
db_pca3d = davies_bouldin_score(pca_proj3d, labels_pca3d)

db_tsne2d = davies_bouldin_score(tsne_proj2d, labels_tsne2d)
db_tsne3d = davies_bouldin_score(tsne_proj3d, labels_tsne3d)

db_umap2d = davies_bouldin_score(umap_proj2d, labels_umap2d)
db_umap3d = davies_bouldin_score(umap_proj2d, labels_umap3d)

In [126]:
import plotly.graph_objects as go


fig = go.Figure(data=[go.Table(
    header=dict(values=[' ', 'PCA 2d', 'PCA 3d', 'TSNE 2d', 'TSNE 3d', 'UMAP 2d', 'UMAP 3d'],
                line_color='darkslategray',
                fill_color='lightskyblue',
                align='left'),
    cells=dict(values=[['Silhouette Score', 'Davies-Bouldin Score'], # 1st column
                       ["{0: .4f}".format(sil_pca2d), "{0: .4f}".format(db_pca2d)],
                       ["{0: .4f}".format(sil_pca3d), "{0: .4f}".format(db_pca3d)],
                        ["{0: .4f}".format(sil_tsne2d), "{0: .4f}".format(db_tsne2d)],
                         ["{0: .4f}".format(sil_tsne3d), "{0: .4f}".format(db_tsne3d)],
                       ["{0: .4f}".format(sil_umap2d), "{0: .4f}".format(db_umap2d)],
                        ["{0: .4f}".format(sil_umap3d), "{0: .4f}".format(db_umap3d)]], # 2nd column
               line_color='darkslategray',
               fill_color='lightcyan',
               align='left'))
])

fig.update_layout(width=700, height=300)
fig.show()

**Reconstruction Error (PCA)**

In [132]:
recon_pca2d = pca_2d.inverse_transform(pca_proj2d)
recon_pca2d_scaled = scaler.inverse_transform(recon_pca2d)
recon_pca3d = pca_3d.inverse_transform(pca_proj3d)
recon_pca3d_scaled = scaler3.inverse_transform(recon_pca3d)

orig = stock_df.copy()
re_2d = np.mean((orig - recon_pca2d_scaled)**2)
re_3d = np.mean((orig - recon_pca3d_scaled)**2)

print("Reconstruction Errors:\n")
print("1. 2d: {}\n".format(re_2d))
print("2. 3d: {}".format(re_3d))


Reconstruction Errors:

1. 2d: Last Sale     3.279807e+13
Net Change    5.699121e+02
% Change      4.418214e+01
Market Cap    1.603510e+43
Volume        1.611365e+27
dtype: float64

2. 3d: Last Sale     2.871848e+05
Net Change    1.578650e+00
% Change      1.382881e-01
Market Cap    1.524411e+21
Volume        1.383615e+13
dtype: float64



In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'


In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'

