# In [3]:
import pandas as pd
import glob
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

# Load basket data (only its 'basket_name' column is used in this cell)
basket_df = pd.read_csv("data/company_basket.csv", sep=";")

# Load all Sweden stock data files. glob returns paths in arbitrary order,
# so sort them to keep the combined row order reproducible between runs.
stock_files = sorted(glob.glob("data/Sweden_*.txt"))
if not stock_files:
    # pd.concat on an empty list fails with an unhelpful message; say what is missing.
    raise ValueError("No stock data files match 'data/Sweden_*.txt'")

# ignore_index=True gives the combined frame unique row labels instead of
# repeating each file's own 0..n-1 index.
stock_data = pd.concat(
    [pd.read_csv(f, sep=";") for f in stock_files],
    ignore_index=True,
)

# Ensure all companies have an industry and sector
stock_data["industry_name"] = stock_data["industry_name"].fillna("Unknown")
stock_data["economic_sector_name"] = stock_data["economic_sector_name"].fillna("Unknown")

# One-hot encode industry_name and economic_sector_name
industry_encoded = pd.get_dummies(
    stock_data[["company_name", "industry_name"]],
    columns=["industry_name"],
)
sector_encoded = pd.get_dummies(
    stock_data[["company_name", "economic_sector_name"]],
    columns=["economic_sector_name"],
)

# Merge one-hot encoded data to create stock_features; company_name is kept
# once (from industry_encoded) and dropped from the sector frame.
stock_features = pd.concat(
    [industry_encoded, sector_encoded.drop(columns="company_name")],
    axis=1,
)

# Group by company_name to consolidate multiple entries per company
# (indicator columns are summed, so a value counts that company's rows).
stock_features = stock_features.groupby("company_name").sum().reset_index()

# Compute the pairwise cosine similarity between the per-company feature rows
# (column 0 is company_name, so it is skipped).
# NOTE(review): this matrix is (n_companies, n_companies), ordered like
# stock_features["company_name"] -- it is NOT basket x basket. The lookup in
# get_similar_baskets indexes it with positions taken from basket_idx, so rows
# and basket names do not line up. Per-basket features (aggregating the
# companies in each basket) are needed; the basket-to-company columns of
# basket_df are not visible here -- TODO confirm schema and fix.
similarity_matrix = cosine_similarity(stock_features.iloc[:, 1:].values)

# Create a mapping between basket names and indices (order of first appearance)
basket_idx = {name: idx for idx, name in enumerate(basket_df["basket_name"].unique())}

# Function to find the closest matching basket title using fuzzy matching
def basket_finder(user_input, choices, threshold=80):
    """Return the choice that best fuzzy-matches ``user_input``, or None.

    Args:
        user_input: Text to look up.
        choices: Candidate names handed to ``process.extractOne``.
        threshold: Minimum score (inclusive) for the best match to be accepted.

    Returns:
        The best-scoring choice if its score is at least ``threshold``;
        otherwise None. None is also returned when there is nothing to
        match against.
    """
    # Nothing to search: skip the fuzzy matcher entirely. Unsized iterables
    # (no len()) fall through and are left to extractOne.
    try:
        if len(choices) == 0:
            return None
    except TypeError:
        pass

    best = process.extractOne(user_input, choices)
    if best is None:
        # extractOne yields None when no candidate could be scored; unpacking
        # it directly would raise TypeError.
        return None

    # Only the first two fields (match, score) are needed.
    match, score = best[:2]
    return match if score >= threshold else None

# Function to get similar baskets
def get_similar_baskets(basket_name, n=10):
    """Return the names of the ``n`` baskets most similar to ``basket_name``.

    The query is first resolved to a known basket name with ``basket_finder``
    (fuzzy match), then the matching row of ``similarity_matrix`` is ranked
    from highest to lowest score.

    Args:
        basket_name: Free-text basket name to look up.
        n: Maximum number of similar baskets to return.

    Returns:
        A list of basket names ordered by descending similarity (ties keep
        index order), never containing the matched basket itself; or the
        string "Basket not found!" when no basket name matches well enough.

    NOTE(review): assumes row/column ``i`` of ``similarity_matrix`` describes
    the basket whose value in ``basket_idx`` is ``i``. The matrix in this file
    is built from per-company features, so verify that alignment.
    """
    # Build the position -> name list once; dicts preserve insertion order,
    # which matches the indices stored in basket_idx.
    names = list(basket_idx)

    # Find closest matching basket
    best_match = basket_finder(basket_name, names)
    if best_match is None:
        return "Basket not found!"

    idx = basket_idx[best_match]

    # Drop the query basket by index rather than assuming it sorts first:
    # with tied scores it may not be at position 0 after sorting.
    candidates = [
        (other, score)
        for other, score in enumerate(similarity_matrix[idx])
        if other != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # max(n, 0) keeps a non-positive n from turning into a negative slice bound.
    return [names[other] for other, _ in candidates[:max(n, 0)]]

# Example usage: print up to 10 baskets similar to "Australia tech index"
print(
    get_similar_baskets(
        basket_name="Australia tech index",
        n=10,
    )
)


# Output of the cell above:
# ['Blended companies profitable', 'European airlines', 'Global index great valuers', 'Healthcare southern Europé', 'my best choice ever', 'Software Americas, small comp', 'Software Americas, small mcap', 'Strong value growth Europe', 'Swedish mibile tech', 'UK as a sector']
