# Cosine Distance 

## Overview
The notebook is designed to process and analyze a collection of papers represented by their embedding vector data over a series of years. It calculates the cosine similarities between the vectors of papers for a given year and those from the previous five years, aiming to measure the similarity in content. The results, including the average and maximum cosine similarities for each paper, are then saved to a CSV file for further analysis or reference.
The code runs on the GPU when one is available and falls back to the CPU otherwise.

## Workflow
- **Loading Data**: The notebook starts by loading vector data of papers for a specific year from a designated directory. Each row in the data files corresponds to a paper, with one column representing the paper's ID and the remaining columns representing the vector.

> **_NOTE:_**  All vectors are assumed to be stored in CSV files, one file per year (named `<year>_vectors.csv`).

- **Data Segmentation**: To optimize memory usage, the data is divided into manageable chunks. This segmentation facilitates efficient processing, especially for large datasets.

- **Rolling Data Collection**: The vector data for the current year is added to a rolling collection that holds the data for the current year and the previous five years. This rolling mechanism ensures that only this six-year window (the current year plus the five years before it) is kept at any given time.

- **Cosine Similarity Calculation**: If there are at least six years of data in the rolling collection, the notebook proceeds to calculate the cosine similarities. It compares the vectors of the current year’s papers with the combined vectors of the papers from the previous five years.

- **Average and Maximum Similarities**: For each paper in the current year, both the average and maximum cosine similarities are calculated in relation to the papers from the previous years.

- **Result Storage**: The calculated average and maximum cosine similarities, along with the paper IDs, are saved to a CSV file.

- **Iteration**: The notebook repeats this process for each year in the specified range, ensuring that each year’s data is compared with the data from its preceding years.

## Note

- Ensure `cupy` is installed to run on the GPU; you can install it via `pip install cupy`.
- Adjust the CHUNK_SIZE based on GPU memory availability if running on GPU.
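- This notebook assumes that vector files exist for consecutive years: a year without a file is skipped, and similarities are computed only when all five preceding years are available.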

In [None]:
import numpy as np
import csv
import os
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import pandas as pd

# Try to import cupy for GPU acceleration, fall back to numpy if not available
try:
    import cupy as xp
    print("Running on GPU")
except ImportError:
    import numpy as xp
    print("Running on CPU")

# Constants
# Directory holding the per-year vector files, read as "<year>_vectors.csv"
path_vectors = '../data/vectors/'
# Number of current-year rows sent to each similarity computation
CHUNK_SIZE = 1000  # Adjust based on memory availability
# CSV file that receives one row per paper: PaperId, cosine_max, cosine_avg
OUTPUT_PATH = '../data/metrics/papers_cosine.csv'  # Adjust this path as needed
# Passed to joblib.Parallel as n_jobs
N_JOBS = -1  # Use all available cores

def load_vectors_for_year(year):
    """Load the paper IDs and embedding vectors stored for one year.

    The file ``<path_vectors>/<year>_vectors.csv`` is read with its first
    line skipped as a header. Each remaining row holds a paper ID in the
    first column and the vector components in the remaining columns.

    Args:
        year: Year whose vector file should be read.

    Returns:
        A ``(papers_ids, vectors)`` tuple where ``papers_ids`` is a 1-D int64
        array and ``vectors`` is a 2-D float32 array with one row per paper.
        ``(None, None)`` is returned when the file does not exist or contains
        no data rows.
    """
    file_path = os.path.join(path_vectors, f"{year}_vectors.csv")

    if not os.path.exists(file_path):
        return None, None

    print(f'Reading {year}...')
    # Parse on the host as float64. float32 represents integers exactly only
    # up to 2**24, so reading the ID column as float32 would silently round
    # larger paper IDs; float64 is exact up to 2**53. The price is a
    # temporary host array twice the size of the float32 vectors.
    data = np.loadtxt(file_path, delimiter=',', dtype=np.float64, skiprows=1)

    # A header-only file yields no rows; treat it like a missing year.
    if data.size == 0:
        return None, None

    # loadtxt squeezes a single data row into a 1-D array. Restoring the row
    # axis handles the one-paper case for any vector width, instead of
    # relying on a hard-coded column count.
    data = np.atleast_2d(data)

    # Split the columns, then move the results to the active backend
    # (numpy or cupy) with the dtypes the rest of the pipeline expects.
    papers_ids = xp.asarray(data[:, 0].astype(np.int64))
    vectors = xp.asarray(data[:, 1:].astype(np.float32))

    return papers_ids, vectors

def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two 1-D vectors.

    The result is the dot product of the vectors divided by the product of
    their Euclidean norms.
    """
    magnitude_product = xp.linalg.norm(vector_a) * xp.linalg.norm(vector_b)
    return xp.dot(vector_a, vector_b) / magnitude_product

def _normalize_rows(matrix):
    """Scale each row of *matrix* to unit length, leaving all-zero rows as zeros."""
    norms = xp.linalg.norm(matrix, axis=1, keepdims=True)
    # Dividing a zero row by its zero norm would give NaN; dividing by 1
    # instead keeps the row at zero, so its similarity to anything is 0.
    norms[norms == 0] = 1
    return matrix / norms


def calculate_similarity_for_chunk(chunk, prior_data):
    """Calculate similarity for a chunk using matrix multiplication.

    Args:
        chunk: 2-D array with one vector per row (the papers being scored).
        prior_data: 2-D array with one vector per row (the comparison set).

    Returns:
        A ``(avg, max)`` tuple of 1-D arrays with one entry per row of
        ``chunk``: the mean and the maximum cosine similarity between that
        row and every row of ``prior_data``. All-zero vectors contribute a
        similarity of 0 rather than NaN, so one degenerate row cannot
        invalidate the statistics of every other paper.
    """
    # With unit-length rows, the matrix product holds all pairwise cosines.
    similarities = xp.dot(_normalize_rows(chunk), _normalize_rows(prior_data).T)

    avg_sims = xp.mean(similarities, axis=1)
    max_sims = xp.max(similarities, axis=1)

    return avg_sims, max_sims

def calculate_avg_max_similarity(current_data, prior_data):
    """Compute per-paper average and maximum similarities, chunk by chunk.

    ``current_data`` is cut into consecutive slices of ``CHUNK_SIZE`` rows.
    Each slice is scored against ``prior_data`` through joblib, and the
    per-slice results are concatenated back in their original order.
    """
    pool = Parallel(n_jobs=N_JOBS)
    chunk_starts = range(0, len(current_data), CHUNK_SIZE)
    per_chunk = pool(
        delayed(calculate_similarity_for_chunk)(
            current_data[start:start + CHUNK_SIZE], prior_data
        )
        for start in tqdm(chunk_starts)
    )

    # Every entry of per_chunk is an (average, maximum) pair for one slice.
    avg_parts = [avg_part for avg_part, _ in per_chunk]
    max_parts = [max_part for _, max_part in per_chunk]
    return xp.concatenate(avg_parts), xp.concatenate(max_parts)

def initialize_output_file():
    """Create (or truncate) the output CSV file and write its header row.

    The parent directory of ``OUTPUT_PATH`` is created when it does not
    exist yet, so a fresh checkout does not fail before any work is done.
    """
    output_dir = os.path.dirname(OUTPUT_PATH)
    # dirname is empty for a bare file name; makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['PaperId', 'cosine_max', 'cosine_avg'])

def save_to_csv(papers_ids, avg_similarities, max_similarities):
    """Append one row per paper to the output CSV file.

    Rows are written as ``PaperId, maximum, average``; note that the maximum
    comes before the average even though the arguments are passed in the
    opposite order.
    """
    with open(OUTPUT_PATH, 'a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows(
            [pid, peak, mean]
            for pid, mean, peak in zip(papers_ids, avg_similarities, max_similarities)
        )

### Define the years 

In [None]:
# First and last year of the range to process; the loop below builds
# range(start_year, end_year + 1), so both ends are included.
start_year = 1896
end_year = 2020

### Run the similarity calculation

In [None]:
rolling_data = []
years = range(start_year, end_year + 1)  # end_year itself is included

# Start from an output file that contains only the header row
initialize_output_file()

for year in tqdm(years):
    papers_ids, current_year_data = load_vectors_for_year(year)

    # Nothing was loaded for this year, so it contributes nothing
    if current_year_data is None:
        continue

    # Drop entries older than five years, then add the current year
    oldest_kept = year - 5
    rolling_data = [entry for entry in rolling_data if entry[0] >= oldest_kept]
    rolling_data.append((year, current_year_data))

    # Similarities need a full window: the current year plus five prior years
    if len(rolling_data) >= 6:
        # Stack the vectors of every year in the window except the current one
        prior_data = xp.vstack(
            [vectors for window_year, vectors in rolling_data if window_year != year]
        )

        print('Calculating similarities for %d...' % year)
        avg_year_similarities, max_year_similarities = calculate_avg_max_similarity(
            current_year_data, prior_data
        )

        # Append this year's results to the output CSV
        save_to_csv(papers_ids, avg_year_similarities, max_year_similarities)

In [None]:
# Scratch cell: display the stacked prior-year vectors left over from the
# last loop iteration that computed similarities.
prior_data

In [None]:
# Scratch cell: reload the vector file for whatever value `year` currently
# holds (after the loop above, the last year iterated) to inspect it by hand.
file_path = os.path.join(path_vectors, f"{year}_vectors.csv")

# Load the entire CSV into a single numpy array
# NOTE(review): every column, including the ID column, is parsed as float32,
# which cannot represent integers above 2**24 exactly — confirm the IDs fit.
data = xp.loadtxt(file_path, delimiter=',', dtype=np.float32, skiprows = 1)
data

# The 2-D indexing below assumes the file has more than one data row
# (a single row is returned by loadtxt as a 1-D array).
papers_ids = data[:, 0]#.astype(xp.int64)  # Assuming the first column is the PaperId
vectors = data[:, 1:]  # Assuming the rest of the columns are the vectors

In [None]:
# Scratch cell: length of `data` along its first axis (the number of rows
# when the array is 2-D).
len(data)