# Installation

In [2]:
!C:\\Windows\\System32\\nvidia-smi.exe

#!nvcc --version

Thu Feb 20 01:04:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.42                 Driver Version: 572.42         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650      WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   55C    P8              1W /   50W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch

def check_gpu(): # Check if GPU is available
    if torch.cuda.is_available():
        print(f"GPU Device: {torch.cuda.get_device_name(0)}")
        print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"CUDA Version: {torch.version.cuda}")
    else:
        print("No GPU available!")

check_gpu()

No GPU available!


In [None]:
pip install transformers#Use if necessary

In [3]:
import torch
device = 'cude' if torch.cuda.is_available() else 'cpu'
device

'cpu'

# Hugging Face Tasks

In [None]:
from transformers import pipeline
#---------------------------------------------------#
#                     NLP TASKS                     #
#---------------------------------------------------#

'''
1. Text Classification: Assigning a category to a piece of text.
Sentiment Analysis
Topic Classification
Spam Detection '''

classifier = pipeline("text-classification")

'''
2. Token Classification: Assigning labels to individual tokens in a sequence.
Named Entity Recognition (NER)
Part-of-Speech Tagging
'''

token_classifier = pipeline("token-classification")

'''
3. Question Answering: Extracting an answer from a given context based on a question.
'''
question_answerer = pipeline("question-answering")

'''
4. Text Generation: Generating text based on a given prompt.
Language Modeling
Story Generation

'''

text_generator = pipeline("text-generation")

'''
5. Summarization: Condensing long documents into shorter summaries.
'''

summarizer = pipeline("summarization")

'''
Translation: Translating text from one language to another.
'''

translator = pipeline("translation",
                      model="Helsinki-NLP/opus-mt-en-fr")

'''
6. Text2Text Generation: General-purpose text transformation, including summarization and translation.
'''

text2text_generator = pipeline("text2text-generation")

'''
7. Fill-Mask: Predicting the masked token in a sequence.
'''

fill_mask = pipeline("fill-mask")

'''
8. Feature Extraction: Extracting hidden states or features from text.
'''

feature_extractor = pipeline("feature-extraction")

'''
9. Sentence Similarity: Measuring the similarity between two sentences.
'''
#sentence_similarity = pipeline("sentence-similarity")

#---------------------------------------------------#
#             Computer Vision TASKS                 #
#---------------------------------------------------#

'''
1. Image Classification: Classifying the main content of an image.

'''

image_classifier = pipeline("image-classification")

'''
2. Object Detection: Identifying objects within an image and their bounding boxes.
'''

object_detector = pipeline("object-detection")

'''
3. Image Segmentation: Segmenting different parts of an image into classes.
'''

image_segmenter = pipeline("image-segmentation")

'''
4. Image Generation: Generating images from textual descriptions (using DALL-E or similar models).
'''

#---------------------------------------------------#
#             Speech Processing TASKS               #
#---------------------------------------------------#

'''
1. utomatic Speech Recognition (ASR): Converting spoken language into text.
'''

speech_recognizer = pipeline("automatic-speech-recognition")

'''
2. Speech Translation: Translating spoken language from one language to another.
3. Audio Classification: Classifying audio signals into predefined categories.
'''

#---------------------------------------------------#
#                   Multimodal TASKS                #
#---------------------------------------------------#

'''
1. Image Captioning: Generating a textual description of an image.
'''
image_captioner = pipeline("image-to-text")
'''
2. Visual Question Answering (VQA): Answering questions about the content of an image.
'''

#---------------------------------------------------#
#                     Other TASKS                   #
#---------------------------------------------------#
'''
1. Table Question Answering: Answering questions based on tabular data.
'''
table_qa = pipeline("table-question-answering")

'''
2. Document Question Answering: Extracting answers from documents like PDFs.

'''
doc_qa = pipeline("document-question-answering")
'''
3. Time Series Forecasting: Predicting future values in time series data (not directly supported in the main Transformers library but available through extensions).
'''

# NLP Tasks

## Sentiment Analysis

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I was so not happy with the last Mission Impossible Movie")
print(result)


In [None]:
pipeline(task = "sentiment-analysis")("I was confused with the Barbie Movie")

In [None]:
pipeline(task = "sentiment-analysis")\
                                      ("Everyday lots of LLMs papers are published about LLMs Evlauation. \
                                      Lots of them Looks very Promising. \
                                      I am not sure if we CAN actually Evaluate LLMs. \
                                      There is still lots to do.\
                                      Don't you think?")

In [None]:
pipeline(task = "sentiment-analysis", model="facebook/bart-large-mnli")\
                                      ("Everyday lots of LLMs papers are published about LLMs Evlauation. \
                                      Lots of them Looks very Promising. \
                                      I am not sure if we CAN actually Evaluate LLMs. \
                                      There is still lots to do.\
                                      Don't you think?")


### Batch Senteniment Analysis

In [None]:
classifier = pipeline(task = "sentiment-analysis")

task_list = ["I really like Autoencoders, best models for Anomaly Detection", \
            "I am not sure if we CAN actually Evaluate LLMs.", \
            "PassiveAgressive is the name of a Linear Regression Model that so many people do not know.",\
            "I hate long Meetings."]
classifier(task_list)

In [None]:
classifier = pipeline(task = "sentiment-analysis", model = "SamLowe/roberta-base-go_emotions")

task_list = ["I really like Autoencoders, best models for Anomaly Detection", \
            "I am not sure if we CAN actually Evaluate LLMs.", \
            "PassiveAgressive is the name of a Linear Regression Model that so many people do not know. It is pretty funny name for a Regression Model.",\
            "I hate long Meetings."]
classifier(task_list)

## Text Generation

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

text_generator = pipeline("text-generation", model="distilbert/distilgpt2")
generated_text = text_generator("Today is a rainy day in London",
                                truncation=True,
                                num_return_sequences = 2)
print("Generated_text:\n ", generated_text[0]['generated_text'])

## Question Answering

In [None]:
from transformers import pipeline

qa_model = pipeline("question-answering")
question = "What is my job?"
context = "I am a student at the University of Toronto studying Computer Science."
qa_model(question = question, context = context)

# Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
model_name2 = "nlptown/bert-base-multilingual-uncased-sentiment"
mymodel2 = AutoModelForSequenceClassification.from_pretrained(model_name2)
mytokenizer2 = AutoTokenizer.from_pretrained(model_name2)

classifier = pipeline("sentiment-analysis", model = mymodel2 , tokenizer = mytokenizer2)
res = classifier("I was so not happy with the Barbie Movie")
print(res)

In [None]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example text
text = "I was so not happy with the Barbie Movie"

# Tokenize the text
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)


In [None]:
# Convert tokens to input IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)


In [None]:

# Encode the text (tokenization + converting to input IDs)
encoded_input = tokenizer(text)
print("Encoded Input:", encoded_input)


In [None]:

# Decode the text
decoded_output = tokenizer.decode(input_ids)
print("Decode Output: ", decoded_output)


In [None]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Example text
text = "I was so not happy with the Barbie Movie"

# Tokenize the text
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Convert tokens to input IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)

# Encode the text (tokenization + converting to input IDs)
encoded_input = tokenizer(text)
print("Encoded Input:", encoded_input)

# Decode the text
decoded_output = tokenizer.decode(input_ids)
print("Decode Output: ", decoded_output)

**token_type_ids**<br>
These IDs are used to distinguish between different sequences in tasks that involve multiple sentences, such as question-answering and sentence-pair classification. BERT uses this mechanism to understand which tokens belong to which segment. For single-sequence tasks like sentiment analysis, token_type_ids are all zeros.

**attention_mask** <br>
The attention mask is used to differentiate between actual tokens and padding tokens (if any). It helps the model focus on non-padding tokens and ignore padding tokens. A value of 1 indicates that the token should be attended to, while a value of 0 indicates padding.

**Why Padding Tokens Are Used**<br>
Uniform Sequence Length: Deep learning models typically process input data in batches. To efficiently process these batches, all sequences in a batch must have the same length. Padding tokens ensure this by extending shorter sequences to match the length of the longest sequence in the batch.
Efficient Computation: Fixed-length sequences allow for more efficient use of hardware resources, as the model can process all sequences in parallel without needing to handle variable-length sequences individually.



# Fine Tunning IMDB

## Step 1: Install Necessary Libraries

In [None]:
pip install datasets

## Step 2: Load and Prepare the Dataset

In [None]:
from datasets import load_dataset
dataset = load_dataset('imdb')

In [None]:
dataset

In [None]:
dataset["train"][0]

## Step 3: Preprocess the Data
Tokenize the dataset using the tokenizer associated with the pre-trained model.

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [None]:
tokenized_datasets

In [None]:
tokenized_datasets["train"][0]

## Step 4: Set Up the Training Arguments
Specify the hyperparameters and training settings.

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    eval_strategy ="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=1,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
)
training_args

## Step 5: Initialize the Model
Load the pre-trained model and define the training procedure.

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer

# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)


## Step 6: Train the Model
Fine-tune the pre-trained model on your specific dataset.

In [None]:
# Train the model
trainer.train()

## Step 7: Evaluate the Model
Assess the model's performance on a validation set.

In [None]:
# Evaluate the model
results = trainer.evaluate()
print(results)

## Step 8: Save the Fine-Tuned Model
Save the fine-tuned model for later use.

In [None]:
# Save the model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-tokenizer')


# ArXiv Project

In [2]:
!pip install arxiv

Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py): started
  Building wheel for sgmllib3k (setup.py): finished with status 'done'
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6104 sha256=e89a1e5ed140ae38670af33d2e068faf5177fc43df491c3b2aa231cd71e9a20f
  Stored in directory: c:\users\lshre\appdata\local\pip\cache\wheels\03\f5\1a\23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246
Successfully built sgmllib3k
Installing collected 

In [3]:
import arxiv
import pandas as pd

In [4]:
# Query to fetch AI-related papers
query = 'ai OR artificial intelligence OR machine learning'
search = arxiv.Search(query=query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate)

# Fetch papers
papers = []
for result in search.results():
    papers.append({
      'published': result.published,
        'title': result.title,
        'abstract': result.summary,
        'categories': result.categories
    })

# Convert to DataFrame
df = pd.DataFrame(papers)

pd.set_option('display.max_colwidth', None)
df.head(10)

  for result in search.results():


Unnamed: 0,published,title,abstract,categories
0,2025-02-10 18:59:58+00:00,EVEv2: Improved Baselines for Encoder-Free Vision-Language Models,"Existing encoder-free vision-language models (VLMs) are rapidly narrowing the\nperformance gap with their encoder-based counterparts, highlighting the\npromising potential for unified multimodal systems with structural simplicity\nand efficient deployment. We systematically clarify the performance gap between\nVLMs using pre-trained vision encoders, discrete tokenizers, and minimalist\nvisual layers from scratch, deeply excavating the under-examined\ncharacteristics of encoder-free VLMs. We develop efficient strategies for\nencoder-free VLMs that rival mainstream encoder-based ones. After an in-depth\ninvestigation, we launch EVEv2.0, a new and improved family of encoder-free\nVLMs. We show that: (i) Properly decomposing and hierarchically associating\nvision and language within a unified model reduces interference between\nmodalities. (ii) A well-designed training strategy enables effective\noptimization for encoder-free VLMs. Through extensive evaluation, our EVEv2.0\nrepresents a thorough study for developing a decoder-only architecture across\nmodalities, demonstrating superior data efficiency and strong vision-reasoning\ncapability. Code is publicly available at: https://github.com/baaivision/EVE.","[cs.CV, cs.AI]"
1,2025-02-10 18:59:35+00:00,Visual Agentic AI for Spatial Reasoning with a Dynamic API,"Visual reasoning -- the ability to interpret the visual world -- is crucial\nfor embodied agents that operate within three-dimensional scenes. Progress in\nAI has led to vision and language models capable of answering questions from\nimages. However, their performance declines when tasked with 3D spatial\nreasoning. To tackle the complexity of such reasoning problems, we introduce an\nagentic program synthesis approach where LLM agents collaboratively generate a\nPythonic API with new functions to solve common subproblems. Our method\novercomes limitations of prior approaches that rely on a static, human-defined\nAPI, allowing it to handle a wider range of queries. To assess AI capabilities\nfor 3D understanding, we introduce a new benchmark of queries involving\nmultiple steps of grounding and inference. We show that our method outperforms\nprior zero-shot models for visual reasoning in 3D and empirically validate the\neffectiveness of our agentic framework for 3D spatial reasoning tasks. Project\nwebsite: https://glab-caltech.github.io/vadar/",[cs.CV]
2,2025-02-10 18:59:10+00:00,Matryoshka Quantization,"Quantizing model weights is critical for reducing the communication and\ninference costs of large models. However, quantizing models -- especially to\nlow precisions like int4 or int2 -- requires a trade-off in model quality;\nint2, in particular, is known to severely degrade model quality. Consequently,\npractitioners are often forced to maintain multiple models with different\nquantization levels or serve a single model that best satisfies the\nquality-latency trade-off. On the other hand, integer data types, such as int8,\ninherently possess a nested (Matryoshka) structure where smaller bit-width\nintegers, like int4 or int2, are nested within the most significant bits. This\npaper proposes Matryoshka Quantization (MatQuant), a novel multi-scale\nquantization technique that addresses the challenge of needing multiple\nquantized models. It allows training and maintaining just one model, which can\nthen be served at different precision levels. Furthermore, due to the\nco-training and co-distillation regularization provided by MatQuant, the int2\nprecision models extracted by MatQuant can be up to $10\%$ more accurate than\nstandard int2 quantization (using techniques like QAT or OmniQuant). This\nrepresents significant progress in model quantization, demonstrated by the fact\nthat, with the same recipe, an int2 FFN-quantized Gemma-2 9B model is more\naccurate than an int8 FFN-quantized Gemma-2 2B model.","[cs.LG, cs.AI]"
3,2025-02-10 18:58:52+00:00,DeepCrossAttention: Supercharging Transformer Residual Connections,"Transformer networks have achieved remarkable success across diverse domains,\nleveraging a variety of architectural innovations, including residual\nconnections. However, traditional residual connections, which simply sum the\noutputs of previous layers, can dilute crucial information. This work\nintroduces DeepCrossAttention (DCA), an approach that enhances residual\nlearning in transformers. DCA employs learnable, input-dependent weights to\ndynamically combine layer outputs, enabling the model to selectively focus on\nthe most relevant information in any of the previous layers. Furthermore, DCA\nincorporates depth-wise cross-attention, allowing for richer interactions\nbetween layers at different depths. Our language modeling experiments show that\nDCA achieves improved perplexity for a given training time. Moreover, DCA\nobtains the same model quality up to 3x faster while adding a negligible number\nof parameters. Theoretical analysis confirms that DCA provides an improved\ntrade-off between accuracy and model size when the ratio of collective layer\nranks to the ambient dimension falls below a critical threshold.",[cs.LG]
4,2025-02-10 18:58:40+00:00,RelGNN: Composite Message Passing for Relational Deep Learning,"Predictive tasks on relational databases are critical in real-world\napplications spanning e-commerce, healthcare, and social media. To address\nthese tasks effectively, Relational Deep Learning (RDL) encodes relational data\nas graphs, enabling Graph Neural Networks (GNNs) to exploit relational\nstructures for improved predictions. However, existing heterogeneous GNNs often\noverlook the intrinsic structural properties of relational databases, leading\nto modeling inefficiencies. Here we introduce RelGNN, a novel GNN framework\nspecifically designed to capture the unique characteristics of relational\ndatabases. At the core of our approach is the introduction of atomic routes,\nwhich are sequences of nodes forming high-order tripartite structures. Building\nupon these atomic routes, RelGNN designs new composite message passing\nmechanisms between heterogeneous nodes, allowing direct single-hop interactions\nbetween them. This approach avoids redundant aggregations and mitigates\ninformation entanglement, ultimately leading to more efficient and accurate\npredictive modeling. RelGNN is evaluated on 30 diverse real-world tasks from\nRelBench (Fey et al., 2024), and consistently achieves state-of-the-art\naccuracy with up to 25% improvement.","[cs.LG, cs.AI, cs.DB]"
5,2025-02-10 18:58:11+00:00,Lumina-Video: Efficient and Flexible Video Generation with Multi-scale Next-DiT,"Recent advancements have established Diffusion Transformers (DiTs) as a\ndominant framework in generative modeling. Building on this success,\nLumina-Next achieves exceptional performance in the generation of\nphotorealistic images with Next-DiT. However, its potential for video\ngeneration remains largely untapped, with significant challenges in modeling\nthe spatiotemporal complexity inherent to video data. To address this, we\nintroduce Lumina-Video, a framework that leverages the strengths of Next-DiT\nwhile introducing tailored solutions for video synthesis. Lumina-Video\nincorporates a Multi-scale Next-DiT architecture, which jointly learns multiple\npatchifications to enhance both efficiency and flexibility. By incorporating\nthe motion score as an explicit condition, Lumina-Video also enables direct\ncontrol of generated videos' dynamic degree. Combined with a progressive\ntraining scheme with increasingly higher resolution and FPS, and a multi-source\ntraining scheme with mixed natural and synthetic data, Lumina-Video achieves\nremarkable aesthetic quality and motion smoothness at high training and\ninference efficiency. We additionally propose Lumina-V2A, a video-to-audio\nmodel based on Next-DiT, to create synchronized sounds for generated videos.\nCodes are released at https://www.github.com/Alpha-VLLM/Lumina-Video.",[cs.CV]
6,2025-02-10 18:57:29+00:00,Exploring the Limit of Outcome Reward for Learning Mathematical Reasoning,"Reasoning abilities, especially those for solving complex math problems, are\ncrucial components of general intelligence. Recent advances by proprietary\ncompanies, such as o-series models of OpenAI, have made remarkable progress on\nreasoning tasks. However, the complete technical details remain unrevealed, and\nthe techniques that are believed certainly to be adopted are only reinforcement\nlearning (RL) and the long chain of thoughts. This paper proposes a new RL\nframework, termed OREAL, to pursue the performance limit that can be achieved\nthrough \textbf{O}utcome \textbf{RE}w\textbf{A}rd-based reinforcement\n\textbf{L}earning for mathematical reasoning tasks, where only binary outcome\nrewards are easily accessible. We theoretically prove that behavior cloning on\npositive trajectories from best-of-N (BoN) sampling is sufficient to learn the\nKL-regularized optimal policy in binary feedback environments. This formulation\nfurther implies that the rewards of negative samples should be reshaped to\nensure the gradient consistency between positive and negative samples. To\nalleviate the long-existing difficulties brought by sparse rewards in RL, which\nare even exacerbated by the partial correctness of the long chain of thought\nfor reasoning tasks, we further apply a token-level reward model to sample\nimportant tokens in reasoning trajectories for learning. With OREAL, for the\nfirst time, a 7B model can obtain 94.0 pass@1 accuracy on MATH-500 through RL,\nbeing on par with 32B models. OREAL-32B also surpasses previous 32B models\ntrained by distillation with 95.0 pass@1 accuracy on MATH-500. Our\ninvestigation also indicates the importance of initial policy models and\ntraining queries for RL. Code, models, and data will be released to benefit\nfuture research\footnote{https://github.com/InternLM/OREAL}.","[cs.CL, cs.LG]"
7,2025-02-10 18:56:14+00:00,KARST: Multi-Kernel Kronecker Adaptation with Re-Scaling Transmission for Visual Classification,"Fine-tuning pre-trained vision models for specific tasks is a common practice\nin computer vision. However, this process becomes more expensive as models grow\nlarger. Recently, parameter-efficient fine-tuning (PEFT) methods have emerged\nas a popular solution to improve training efficiency and reduce storage needs\nby tuning additional low-rank modules within pre-trained backbones. Despite\ntheir advantages, they struggle with limited representation capabilities and\nmisalignment with pre-trained intermediate features. To address these issues,\nwe introduce an innovative Multi-Kernel Kronecker Adaptation with Re-Scaling\nTransmission (KARST) for various recognition tasks. Specifically, its\nmulti-kernel design extends Kronecker projections horizontally and separates\nadaptation matrices into multiple complementary spaces, reducing parameter\ndependency and creating more compact subspaces. Besides, it incorporates extra\nlearnable re-scaling factors to better align with pre-trained feature\ndistributions, allowing for more flexible and balanced feature aggregation.\nExtensive experiments validate that our KARST outperforms other PEFT\ncounterparts with a negligible inference cost due to its re-parameterization\ncharacteristics. Code is publicly available at:\nhttps://github.com/Lucenova/KARST.","[cs.CV, cs.AI]"
8,2025-02-10 18:54:41+00:00,Learning an Optimal Assortment Policy under Observational Data,"We study the fundamental problem of offline assortment optimization under the\nMultinomial Logit (MNL) model, where sellers must determine the optimal subset\nof the products to offer based solely on historical customer choice data. While\nmost existing approaches to learning-based assortment optimization focus on the\nonline learning of the optimal assortment through repeated interactions with\ncustomers, such exploration can be costly or even impractical in many\nreal-world settings. In this paper, we consider the offline learning paradigm\nand investigate the minimal data requirements for efficient offline assortment\noptimization. To this end, we introduce Pessimistic Rank-Breaking (PRB), an\nalgorithm that combines rank-breaking with pessimistic estimation. We prove\nthat PRB is nearly minimax optimal by establishing the tight suboptimality\nupper bound and a nearly matching lower bound. This further shows that ""optimal\nitem coverage"" - where each item in the optimal assortment appears sufficiently\noften in the historical data - is both sufficient and necessary for efficient\noffline learning. This significantly relaxes the previous requirement of\nobserving the complete optimal assortment in the data. Our results provide\nfundamental insights into the data requirements for offline assortment\noptimization under the MNL model.","[stat.ML, cs.LG, math.OC, math.ST, stat.TH]"
9,2025-02-10 18:54:05+00:00,Towards Internet-Scale Training For Agents,"The predominant approach for training web navigation agents gathers human\ndemonstrations for a set of popular websites and hand-written tasks, but it is\nbecoming clear that human data are an inefficient resource. We develop a\npipeline to facilitate Internet-scale training for agents without laborious\nhuman annotations. In the first stage, an LLM generates tasks for 150k diverse\nwebsites. In the next stage, LLM agents complete tasks and produce\ntrajectories. In the final stage, an LLM reviews the trajectories and judges\ntheir success. Language models are competitive with human annotators, detecting\nand filtering out harmful content with an accuracy of 97%, generating feasible\ntasks with an 89% rate, and judging successful trajectories with an 82.6%\naccuracy. Scaling the pipeline, agents based on Llama 3.1 70B solve 16.7% of\ntasks for 150k sites. Training on the data generated by our pipeline is\ncompetitive with training on human demonstrations. In data-limited settings\nderived from Mind2Web and WebLINX, we improve Step Accuracy by up to +89.5% and\n+122.1% respectively for agents trained on mixtures of data from our pipeline,\nand human data. When training agents with all available human data from these\nbenchmarks, agents fail to generalize to diverse real sites, and adding our\ndata improves their generalization by +149.0% for WebLINX and +156.3% for\nMind2Web. Code will be available at: data-for-agents.github.io.","[cs.LG, cs.AI]"


In [13]:
from transformers import pipeline
# Example abstract from API
abstract = df['abstract'][9]

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarization
summarization_result = summarizer(abstract)

Device set to use cpu


In [14]:
summarization_result

[{'summary_text': 'The predominant approach for training web navigation agents gathers humandemonstrations for a set of popular websites and hand-written tasks. It is becoming clear that human data are an inefficient resource. We develop a Pipeline to facilitate Internet-scale training for agents without laborious human annotations. Code will be available at: data-for-agents.io.'}]