In [None]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Basket composition table (semicolon-delimited); later code reads its
# basket_name, company_name and company_share columns.
basket_df = pd.read_csv("data/company_basket.csv", sep=";")

# Discover every stock data file under data/ that matches one of the three
# naming patterns, then combine the matches into a single list.
sweden_stock_files = glob.glob("data/Sweden_*.txt")
new_txt_files = glob.glob("data/data-*-*-*.txt")
new_csv_files = glob.glob("data/data-*-*-*.csv")
all_files = [*sweden_stock_files, *new_txt_files, *new_csv_files]

# Function to read stock data files
def read_file(file_path):
    """Load one semicolon-delimited stock data file into a DataFrame.

    Args:
        file_path: Path string of the file to read. Only names ending in
            '.txt' or '.csv' are accepted.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path, sep=";")``.

    Raises:
        ValueError: If ``file_path`` ends with neither '.txt' nor '.csv'.
    """
    supported_suffixes = ('.txt', '.csv')
    # Guard clause: reject anything that is not one of the known extensions.
    if not file_path.endswith(supported_suffixes):
        raise ValueError(f"Unsupported file extension for {file_path}")
    return pd.read_csv(file_path, sep=";")

# Load and merge stock data
# ignore_index=True gives every row a unique label instead of repeating each
# file's own 0..n-1 index in the combined frame.
stock_data = pd.concat([read_file(f) for f in all_files], ignore_index=True)

# Ensure all companies have an industry and sector
stock_data["industry_name"] = stock_data["industry_name"].fillna("Unknown")
stock_data["economic_sector_name"] = stock_data["economic_sector_name"].fillna("Unknown")

# --------- One-Hot Encoding Industry and Sector ---------
industry_encoded = pd.get_dummies(stock_data[['company_name', 'industry_name']], columns=['industry_name'])
sector_encoded = pd.get_dummies(stock_data[['company_name', 'economic_sector_name']], columns=['economic_sector_name'])

# Merge one-hot encoded data (both frames share stock_data's row index)
stock_features = pd.concat([industry_encoded, sector_encoded.drop("company_name", axis=1)], axis=1)

# Aggregate at the company level: a flag is 1 if any row of that company has it
stock_features = stock_features.groupby("company_name").max().astype(int).reset_index()

# --------- Market Cap Classification (Size & Risk Level) ---------
# Define Market Cap bins
mcap_bins = [0, 2e9, 10e9, float("inf")]  # Small, Mid, Large Cap
mcap_labels = ["Small-Cap", "Mid-Cap", "Large-Cap"]

# Filter Mcap data and classify stocks
mcap_data = stock_data[stock_data["finparametername"] == "Mcap"].copy()
mcap_data["size_category"] = pd.cut(mcap_data["finval"], bins=mcap_bins, labels=mcap_labels)

# Assign Risk Level based on Market Cap classification
risk_mapping = {"Small-Cap": "High Risk", "Mid-Cap": "Medium Risk", "Large-Cap": "Low Risk"}
mcap_data["risk_level"] = mcap_data["size_category"].map(risk_mapping)

# One-hot encode size and risk level, then collapse to ONE row per company
# using the same "any observation" rule as the industry/sector flags above.
# Without this aggregation a company with k Mcap rows would come out of the
# two left merges below with k * k rows, inflating every basket-weighted sum.
# A company whose Mcap rows fall into several bins gets several flags set.
size_encoded = (
    pd.get_dummies(mcap_data[['company_name', 'size_category']], columns=['size_category'])
    .groupby("company_name")
    .max()
    .astype(int)
    .reset_index()
)
risk_encoded = (
    pd.get_dummies(mcap_data[['company_name', 'risk_level']], columns=['risk_level'])
    .groupby("company_name")
    .max()
    .astype(int)
    .reset_index()
)

# Merge size & risk category into stock features; companies without any Mcap
# row get 0 for every size/risk flag.
stock_features = pd.merge(stock_features, size_encoded, on="company_name", how="left").fillna(0)
stock_features = pd.merge(stock_features, risk_encoded, on="company_name", how="left").fillna(0)

# --------- Volatility Calculation (Closing Price Std Dev) ---------
closing_price_data = stock_data[stock_data["finparametername"] == "closingPrice"].copy()

# Compute standard deviation of closing price per company
# NOTE(review): this is the std of raw price levels, not of returns, so it
# scales with the price itself - confirm that this is the intended measure.
volatility = (
    closing_price_data.groupby("company_name")["finval"]
    .std()
    .rename("volatility")
    .reset_index()
)

# Normalize volatility using Min-Max Scaling
scaler = MinMaxScaler()
# fit_transform returns an (n, 1) array; flatten it before storing as a column.
volatility["scaled_volatility"] = scaler.fit_transform(volatility[["volatility"]]).ravel()

# Merge volatility into stock features; companies with no computable std get 0
stock_features = pd.merge(stock_features, volatility[["company_name", "scaled_volatility"]], on="company_name", how="left").fillna(0)

# Every merge above joins one-row-per-company frames, so this must hold.
assert stock_features["company_name"].is_unique, "stock_features must have one row per company"

# --------- Create Basket-Level Features ---------
def create_basket_features(basket_df, stock_features):
    """Build one share-weighted feature vector per basket.

    Each company's feature row is multiplied by its share of the basket
    (``company_share`` is divided by 100.0 first) and the weighted rows are
    summed per ``basket_name``.

    The input frames are not modified, so calling this repeatedly (for
    example when re-running a notebook cell) gives the same result each time.

    Args:
        basket_df: Frame with ``basket_name``, ``company_name`` and
            ``company_share`` columns.
        stock_features: Frame with a ``company_name`` column; every other
            column is treated as a numeric feature.

    Returns:
        DataFrame with a ``basket_name`` column followed by one column per
        feature in ``stock_features``.
    """
    # Get feature columns
    feature_cols = stock_features.columns.drop('company_name')

    # Derive the fractional share on a new frame instead of overwriting the
    # caller's column.
    weighted_baskets = basket_df.assign(company_share=basket_df['company_share'] / 100.0)

    # Inner merge: basket rows whose company has no entry in stock_features
    # are dropped and therefore contribute nothing to the basket vector.
    merged = pd.merge(weighted_baskets, stock_features, on='company_name')

    # Apply weighting based on company share (row-wise, all features at once)
    merged[feature_cols] = merged[feature_cols].mul(merged['company_share'], axis=0)

    # Aggregate at basket level
    basket_features = merged.groupby('basket_name')[feature_cols].sum()

    return basket_features.reset_index()

# Build the per-basket weighted feature table.
basket_features = create_basket_features(basket_df, stock_features)

# --------- Compute Cosine Similarity Matrix ---------
# Column 0 is basket_name; everything after it is the numeric feature vector.
similarity_matrix = cosine_similarity(basket_features.iloc[:, 1:].to_numpy())

# Look-up from basket name to its row position in basket_features, which is
# also its row/column position in similarity_matrix.
basket_to_idx = dict(zip(basket_features['basket_name'], range(len(basket_features))))

# --------- Recommendation Function ---------
def get_similar_baskets(basket_name, similarity_matrix, n=10, basket_names=None):
    """Return the names of the ``n`` baskets most similar to ``basket_name``.

    The target basket is excluded by its index rather than by assuming it
    sorts first, so baskets that tie with it (for example an identical
    basket with a similarity of 1.0) are still returned and the target is
    never recommended to itself.

    Args:
        basket_name: Name of the basket to find neighbours for.
        similarity_matrix: Square matrix where entry ``[i][j]`` is the
            similarity between basket ``i`` and basket ``j``.
        n: Maximum number of basket names to return.
        basket_names: Optional sequence of basket names in the same order as
            the rows of ``similarity_matrix``. When omitted, the
            notebook-level ``basket_features['basket_name']`` column is used.

    Returns:
        List of up to ``n`` basket names, most similar first. Ties keep their
        original row order.

    Raises:
        ValueError: If ``basket_name`` is not among the known baskets.
    """
    if basket_names is None:
        # Fall back to the feature table built earlier in the notebook.
        basket_names = basket_features['basket_name']
    names = list(basket_names)

    if basket_name not in names:
        print("\nAvailable baskets:")
        print(sorted(set(names)))
        raise ValueError(f"\nBasket '{basket_name}' not found in the data")

    # Get index for target basket
    target_idx = names.index(basket_name)

    # Rank every OTHER basket by similarity; list.sort is stable, so equal
    # scores stay in row order.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[target_idx])
        if idx != target_idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [names[idx] for idx, _ in candidates[:n]]

# Example run: the five baskets closest to "BIG INDEX Global".
recommended_baskets = get_similar_baskets("BIG INDEX Global", similarity_matrix, n=5)
# Heading and result are emitted on separate lines by a single print call.
print("\nRecommended baskets based on size, risk level, and volatility:", recommended_baskets, sep="\n")


In [None]:
# Function to validate basket recommendations
def validate_recommendations(input_basket, recommended_baskets, basket_features):
    """Print and plot summary scores for a set of recommended baskets.

    For every recommended basket found in ``basket_features`` three numbers
    are derived from its feature row:

    * a market-cap score: Large-Cap * 10 + Mid-Cap * 5 + Small-Cap * 2
    * a risk score: High Risk * 3 + Medium Risk * 2 + Low Risk * 1
    * the basket's ``scaled_volatility`` value

    The scores are printed as a table and drawn as three bar charts.
    Relies on ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn) being
    imported at notebook level.

    Args:
        input_basket: Name of the basket the recommendations were made for;
            its feature row is printed for reference.
        recommended_baskets: Iterable of basket names to summarise. Names
            not present in ``basket_features`` are skipped.
        basket_features: Frame whose first column is ``basket_name`` and
            which contains the ``size_category_*``, ``risk_level_*`` and
            ``scaled_volatility`` columns.

    Returns:
        None. Output is printed and shown as a figure.
    """
    print(f"\nValidating Recommendations for Basket: {input_basket}\n")

    # Get input basket features (everything after the basket_name column)
    input_features = basket_features[basket_features["basket_name"] == input_basket].iloc[:, 1:]

    # Print input basket feature values
    print(f"\nInput Basket: {input_basket}")
    print(input_features)

    # Heuristic weights that turn the one-hot shares into a single score.
    mcap_weights = {
        "size_category_Large-Cap": 10,
        "size_category_Mid-Cap": 5,
        "size_category_Small-Cap": 2,
    }
    risk_weights = {
        "risk_level_High Risk": 3,
        "risk_level_Medium Risk": 2,
        "risk_level_Low Risk": 1,
    }

    # Store recommended baskets' features
    recommended_data = []
    for basket in recommended_baskets:
        basket_rows = basket_features[basket_features["basket_name"] == basket]
        if basket_rows.empty:
            # Unknown basket name: nothing to score.
            continue
        avg_mcap = sum(basket_rows[col].iloc[0] * weight for col, weight in mcap_weights.items())
        avg_risk = sum(basket_rows[col].iloc[0] * weight for col, weight in risk_weights.items())
        avg_volatility = basket_rows["scaled_volatility"].iloc[0]
        recommended_data.append((basket, avg_mcap, avg_risk, avg_volatility))

    # Convert to DataFrame for easier analysis
    validation_df = pd.DataFrame(recommended_data, columns=["Basket", "Avg Mcap Score", "Avg Risk Score", "Avg Volatility"])

    print("\nRecommended Baskets and Their Financial Features:\n")
    print(validation_df)

    if validation_df.empty:
        # Nothing to draw; avoid handing an empty frame to the plotting code.
        print("\nNo recommended baskets found in basket_features; skipping plots.")
        return

    # ----- Visualization -----
    # One panel per score: (column to plot, panel title, seaborn palette).
    panels = [
        ("Avg Mcap Score", "Market Cap Score Comparison", "Blues"),
        ("Avg Risk Score", "Risk Level Score Comparison", "Oranges"),
        ("Avg Volatility", "Volatility Comparison", "Greens"),
    ]
    fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))
    for ax, (score_column, title, palette) in zip(axes, panels):
        sns.barplot(x="Basket", y=score_column, data=validation_df, ax=ax, palette=palette)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=45)

    plt.tight_layout()
    plt.show()

# Summarise and plot the recommendations produced for "BIG INDEX Global".
validate_recommendations(
    input_basket="BIG INDEX Global",
    recommended_baskets=recommended_baskets,
    basket_features=basket_features,
)
