In [2]:
pip install kagglehub

Collecting kagglehub
  Using cached kagglehub-0.3.4-py3-none-any.whl.metadata (22 kB)
Using cached kagglehub-0.3.4-py3-none-any.whl (43 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install yfinance

Collecting yfinance
  Using cached yfinance-0.2.50-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Using cached multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting lxml>=4.9.1 (from yfinance)
  Using cached lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Using cached frozendict-2.4.6-py311-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Using cached peewee-3.17.8-cp311-cp311-linux_x86_64.whl
Collecting html5lib>=1.1 (from yfinance)
  Using cached html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Using cached yfinance-0.2.50-py2.py3-none-any.whl (102 kB)
Using cached frozendict-2.4.6-py311-none-any.whl (16 kB)
Using cached html5lib-1.1-py2.py3-none-any.whl (112 kB)
Using cached lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl (5.0 MB)
Using cached multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Installing collected packages: peewee, multitaskin

In [None]:
pip install ipywidgets

In [6]:
pip install streamlit

Collecting streamlit
  Using cached streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Using cached cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting rich<14,>=10.14.0 (from streamlit)
  Using cached rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Using cached watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Using cached pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting markdown-it-py>=2.2.0 (from rich<14,>=10.14.0->streamlit)
  Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit)
 

In [7]:
import os
import pandas as pd
import kagglehub
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import widgets, interact
from datetime import datetime
import yfinance as yf
import streamlit as st

In [8]:
# Path to dataset folder: the per-symbol price CSVs are read from the "stocks" sub-folder.
# os.path.join is used so the separator is correct on every platform.
main_path = kagglehub.dataset_download("jacksoncrow/stock-market-dataset")
path = os.path.join(main_path, "stocks")
path_dict = kagglehub.dataset_download("gonzalezfrancisco/full-nasdaq-stocks-data")
# Load the validation csv with company names
symbols_csv_path = os.path.join(path_dict, "dividend_stocks_only.csv")
symbols_df = pd.read_csv(symbols_csv_path)

# Initialize combined_df as an empty dataframe; `update` fills it once a company is selected
combined_df = pd.DataFrame()

# Create a dropdown for the end user to select a company
company_selector = widgets.Combobox(
    placeholder="Choose company to analyse",
    # Drop missing names so no NaN is passed in as an option
    # (NOTE(review): Combobox options are expected to be strings — confirm against ipywidgets docs).
    options=symbols_df["security_name"].dropna().tolist(),
    description="Combobox:",
    ensure_option=True,
    disabled=False
)

# Function to update based on the selected company
def update(selected_company):
    """Rebuild the global ``combined_df`` for a company and peers from the same sector.

    Looks up ``selected_company`` in ``symbols_df["security_name"]``, takes up to
    four companies from the same ``sector`` plus the selected one, reads every
    matching ``<symbol>.csv`` found in ``path`` and stores the concatenated rows
    in the module-level ``combined_df`` with the selected company's rows first.
    Each row gets a ``Symbol`` and a ``Company Name`` column.

    Args:
        selected_company: Value of the combobox; an empty value only prints a hint.

    Returns:
        None. Results are published through the global ``combined_df`` and by
        printing / displaying the peer table.
    """
    global combined_df  # makes sure it updates combined_df outside the function

    if not selected_company:
        print("Please select a company from the dropdown.")
        return

    selected_company_row = symbols_df[symbols_df["security_name"] == selected_company]
    if selected_company_row.empty:
        print("Selected company not found. Please choose a valid company from the dropdown.")
        return

    selected_industry = selected_company_row["sector"].values[0]
    selected_symbol = selected_company_row["symbol"].values[0]

    # Ensure the selected company is included and shown first
    top_symbols_df = symbols_df[symbols_df["sector"] == selected_industry].head(4)
    top_symbols_df = pd.concat([selected_company_row, top_symbols_df]).drop_duplicates().head(5)

    symbol_to_company = top_symbols_df.set_index("symbol")["security_name"].to_dict()

    # os.listdir order is arbitrary, so sort for a reproducible peer order, and
    # only accept real CSV files whose stem is one of the wanted symbols.
    csv_files = sorted(
        f for f in os.listdir(path)
        if os.path.splitext(f)[1].lower() == ".csv"
        and os.path.splitext(f)[0] in symbol_to_company
    )
    if not csv_files:
        # pd.concat([]) would raise; reset instead of keeping another company's stale rows.
        combined_df = pd.DataFrame()
        print(f"No price data found for {selected_company} or its sector peers.")
        return

    dfs = []
    for file in csv_files:
        symbol = os.path.splitext(file)[0]  # Extract the stock symbol from the filename
        df = pd.read_csv(os.path.join(path, file))
        df["Symbol"] = symbol
        df["Company Name"] = symbol_to_company.get(symbol, "Unknown")  # Add company name
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    # Ensure the selected company rows are at the top
    is_selected = combined_df["Symbol"] == selected_symbol
    combined_df = pd.concat([combined_df[is_selected], combined_df[~is_selected]])

    print(f"Companies in the same industry as {selected_company} ({selected_industry}):")
    display(top_symbols_df)

# Re-run `update` with the new value every time the combobox's "value" trait changes
def _on_company_change(change):
    """Forward the newly selected combobox value to `update`."""
    update(change.new)

company_selector.observe(_on_company_change, names="value")

# Render the combobox in the cell output
display(company_selector)


Combobox(value='', description='Combobox:', ensure_option=True, options=('Agilent Technologies, Inc. Common St…

In [9]:
# Use yfinance to fetch basic company information for the selected company's ticker symbol

def get_company_info(symbol):
    """Return a small summary dict for ``symbol`` built from ``yf.Ticker(symbol).info``.

    Args:
        symbol: Ticker symbol passed to ``yf.Ticker``.

    Returns:
        dict with the keys "Business Description", "Full Time Employees",
        "Latest Total Revenue (Millions)", "Currency" and "industry". Any field
        missing from the info mapping is reported as "Not available".
    """
    company = yf.Ticker(symbol)
    info = company.info

    # Only scale the revenue when it is actually numeric: dividing the
    # "Not available" placeholder by 1_000_000 would raise TypeError.
    total_revenue = info.get("totalRevenue")
    if isinstance(total_revenue, (int, float)):
        revenue_millions = round(total_revenue / 1_000_000, 2)
    else:
        revenue_millions = "Not available"

    company_info = {
        "Business Description": info.get("longBusinessSummary", "Not available"),
        "Full Time Employees": info.get("fullTimeEmployees", "Not available"),
        "Latest Total Revenue (Millions)": revenue_millions,
        "Currency": info.get("currency", "Not available"),
        "industry": info.get("industry", "Not available")
    }

    return company_info

# `combined_df` is empty until a company has been picked in the combobox, and
# `.iloc[0]` on an empty frame raises IndexError, so guard the lookup.
# The first row belongs to the selected company (its rows are placed first).
if combined_df.empty:
    print("Please select a company from the dropdown.")
else:
    display(get_company_info(combined_df.iloc[0]["Symbol"]))

{'Business Description': 'ACCO Brands Corporation designs, manufactures, and markets consumer, school, technology, and office products. It operates through three segments: ACCO Brands North America, ACCO Brands EMEA, and ACCO Brands International. The company provides computer and gaming accessories, planners, dry erase boards, school notebooks, and janitorial supplies; storage and organization products, such as lever-arch binders, sheet protectors, and indexes; sheet protectors and indexes; laminating, binding, and shredding machines; writing instruments and art products; stapling and punching products; and do-it-yourself tools. It offers its products under the AT-A-GLANCE, Barrilito, Derwent, Esselte, Five Star, Foroni, GBC, Hilroy, Kensington, Leitz, Marbig, Mead, NOBO, PowerA, Quartet, Rapid, Rexel, Swingline, Tilibra, Artline, and Spirax brand names. The company markets and sells its products through various channels, including mass retailers, e-tailers, discount, drug/grocery, an

In [13]:
# Add the "Rebased Adj Close" column: each symbol's price scaled so its first row equals 100.
# NOTE(review): the column is named "Adj Close" but is computed from "Close" — confirm which price is intended.

# Parse the Date column into datetimes
combined_df["Date"] = pd.to_datetime(combined_df["Date"])

# Work with Date as the index while computing the rebased series
combined_df = combined_df.set_index("Date")
close_by_symbol = combined_df.groupby("Symbol")["Close"]
rebased_close = close_by_symbol.transform(lambda closes: (closes / closes.iloc[0]) * 100)
# Assign positionally (.values): the Date index holds repeated labels across symbols
combined_df["Rebased Adj Close"] = rebased_close.values

# Move Date back from the index into a regular column
combined_df = combined_df.reset_index()

# Function to plot the chart based on selected date range and recalculate Rebased Adj Close based on slider

def plot_filtered_chart(start_date, end_date):
    """Plot every company's close price rebased to 100 at the start of the chosen window.

    Rows of the global ``combined_df`` whose ``Date`` lies within
    ``[start_date, end_date]`` are kept and each symbol is rebased to its first
    ``Close`` inside that window. Using the earliest available row, rather than
    only a row dated exactly ``start_date``, keeps symbols without a quote on
    ``start_date`` on the same scale as the others instead of leaving their
    previous rebasing in place.

    Args:
        start_date: Inclusive lower bound, comparable with the ``Date`` column.
        end_date: Inclusive upper bound, comparable with the ``Date`` column.

    Returns:
        None. The chart is shown as a side effect; an empty window only prints a message.
    """
    # Filter the dataframe for the selected date range
    in_window = (combined_df["Date"] >= start_date) & (combined_df["Date"] <= end_date)
    # Fresh positional index so each idxmin label below identifies exactly one row
    filtered_df = combined_df[in_window].reset_index(drop=True)
    if filtered_df.empty:
        print("No data available for the selected date range.")
        return

    # Recalculate the Rebased Adj Close relative to each symbol's earliest row in the window
    first_row_labels = filtered_df.groupby("Symbol")["Date"].idxmin().tolist()
    base_close_by_symbol = filtered_df.loc[first_row_labels].set_index("Symbol")["Close"]
    base_close = filtered_df["Symbol"].map(base_close_by_symbol)
    filtered_df["Rebased Adj Close"] = filtered_df["Close"] / base_close * 100

    # Plot the filtered data on an explicit Axes
    _, ax = plt.subplots(figsize=(14, 7))
    sns.lineplot(data=filtered_df, x="Date", y="Rebased Adj Close", hue="Company Name", ax=ax)

    ax.set_xlabel("Date")
    ax.set_ylabel("Rebased Adj Close")
    ax.legend(title="Company Name")
    ax.grid(True)
    plt.show()

# Date filter slider using ipywidgets.
# The labels are built with the vectorised `.dt.strftime`, so they do not depend on
# which element type iterating over `Series.unique()` yields
# (NOTE(review): that type is believed to differ between pandas versions — confirm).
date_options = (
    combined_df["Date"]
    .dropna()
    .sort_values()
    .dt.strftime("%Y-%m-%d")
    .unique()
    .tolist()
)
date_range_slider = widgets.SelectionRangeSlider(
    options=date_options,
    # Default to the full span; derived from the options list so the two always agree
    index=(0, len(date_options) - 1),
    description="Date Range",
    orientation="horizontal",
    layout={"width": "1200px"}
)

# Interactive plot with the sliders
def update_plot(date_range):
    """Parse the slider's two "%Y-%m-%d" strings and redraw the chart for that span.

    Args:
        date_range: Sequence whose first two items are the start and end date strings.
    """
    date_format = "%Y-%m-%d"
    bounds = [datetime.strptime(date_range[position], date_format) for position in (0, 1)]
    plot_filtered_chart(*bounds)

# Display the date range slider and interactive plot: `interact` is given
# `update_plot` with `date_range_slider` bound to its `date_range` argument.
interact(update_plot, date_range=date_range_slider);


interactive(children=(SelectionRangeSlider(description='Date Range', index=(0, 10097), layout=Layout(width='12…

In [None]:
# Export to CSV to understand the data: writes the current `combined_df` to the
# relative path "combined_data.csv", leaving out the index (index=False).
combined_df.to_csv("combined_data.csv", index=False)