In [2]:
import pandas as pd

In [3]:
data_path = 'dataset/research'
train_df = pd.read_csv(data_path + '/train.csv')
test_df = pd.read_csv(data_path + '/test.csv')

print(train_df.head())
print(test_df.head())

   ID                                              TITLE  \
0   1        Reconstructing Subject-Specific Effect Maps   
1   2                 Rotation Invariance Neural Network   
2   3  Spherical polyharmonics and Poisson kernels fo...   
3   4  A finite element approximation for the stochas...   
4   5  Comparative study of Discrete Wavelet Transfor...   

                                            ABSTRACT  Computer Science  \
0    Predictive models allow subject-specific inf...                 1   
1    Rotation invariance and translation invarian...                 1   
2    We introduce and develop the notion of spher...                 0   
3    The stochastic Landau--Lifshitz--Gilbert (LL...                 0   
4    Fourier-transform infra-red (FTIR) spectra o...                 1   

   Physics  Mathematics  Statistics  Quantitative Biology  \
0        0            0           0                     0   
1        0            0           0                     0   
2        0 

In [4]:
train_text = train_df[['TITLE', 'ABSTRACT']]
train_labels = train_df[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]
test_text = test_df[['TITLE', 'ABSTRACT']]

print("Train Text and Labels Head:")
print(train_text.head())
print(train_labels.head())

print("\nTest Text Head:")
print(test_text.head())

Train Text and Labels Head:
                                               TITLE  \
0        Reconstructing Subject-Specific Effect Maps   
1                 Rotation Invariance Neural Network   
2  Spherical polyharmonics and Poisson kernels fo...   
3  A finite element approximation for the stochas...   
4  Comparative study of Discrete Wavelet Transfor...   

                                            ABSTRACT  
0    Predictive models allow subject-specific inf...  
1    Rotation invariance and translation invarian...  
2    We introduce and develop the notion of spher...  
3    The stochastic Landau--Lifshitz--Gilbert (LL...  
4    Fourier-transform infra-red (FTIR) spectra o...  
   Computer Science  Physics  Mathematics  Statistics  Quantitative Biology  \
0                 1        0            0           0                     0   
1                 1        0            0           0                     0   
2                 0        0            1           0               

In [32]:
# print the number of samples that have more than one label
train_labels = train_df[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]
# when axis=1, it means we are summing the number of labels in each row
num_labels = train_labels.sum(axis=1)
print("Number of samples with more than one label: ", len(num_labels[num_labels > 1]))

Number of samples with more than one label:  5044


# Basic Prompt

In [9]:
import pandas as pd
import ollama

# Load the data
data_path = 'dataset/research'
train_df = pd.read_csv(data_path + '/train.csv')
test_df = pd.read_csv(data_path + '/test.csv')

# print(train_df.head())
# print(test_df.head())

train_text = train_df[['TITLE', 'ABSTRACT']]
train_labels = train_df[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]
test_text = test_df[['TITLE', 'ABSTRACT']]

labels = []
for index, row in train_labels.iterrows():
    for i, label in enumerate(row):
        if label == 1:
            labels.append(train_labels.columns[i])

# Function to interact with the Ollama API
def get_prediction(title, abstract):
    response = ollama.chat(model='llama3', messages=[
        {
            'role': 'user',
            'content': f"Title: {title}\nAbstract: {abstract}\nPredict the category from one of the followings categories: [Computer Science, Physics, Mathematics, Statistics, Quantitative Biology, Quantitative Finance]. Only respond with the category name.",
        },
    ])
    return response['message']['content']

# Predict for the test set
test_predictions = []
for index, row in train_text.iterrows():
    if index % 100 == 0:
        print(f"Predicting for sample {index+1}...")
    # if index == 1000:
    #     break
    prediction = get_prediction(row['TITLE'], row['ABSTRACT'])
    test_predictions.append(prediction)

# Display some predictions
for i, prediction in enumerate(test_predictions[:5]):
    print(f"Test Sample {i+1}:")
    print(f"Title: {test_text.iloc[i]['TITLE']}")
    print(f"Abstract: {test_text.iloc[i]['ABSTRACT']}")
    print(f"Predicted Category: {prediction}\n")
    print(f"Actual Category: {labels[i]}\n")\
    
# calculate the accuracy
correct = 0
for i, prediction in enumerate(test_predictions):
    if prediction == labels[i]:
        correct += 1
accuracy = correct / len(test_predictions)
print(f"Accuracy: {accuracy}")

Predicting for sample 1...
Predicting for sample 101...
Predicting for sample 201...
Predicting for sample 301...
Predicting for sample 401...
Predicting for sample 501...
Predicting for sample 601...
Predicting for sample 701...
Predicting for sample 801...
Predicting for sample 901...
Predicting for sample 1001...
Predicting for sample 1101...
Predicting for sample 1201...
Predicting for sample 1301...
Predicting for sample 1401...
Predicting for sample 1501...
Predicting for sample 1601...
Predicting for sample 1701...
Predicting for sample 1801...
Predicting for sample 1901...
Predicting for sample 2001...
Predicting for sample 2101...
Predicting for sample 2201...
Predicting for sample 2301...
Predicting for sample 2401...
Predicting for sample 2501...
Predicting for sample 2601...
Predicting for sample 2701...
Predicting for sample 2801...
Predicting for sample 2901...
Predicting for sample 3001...
Predicting for sample 3101...
Predicting for sample 3201...
Predicting for sample 

In [28]:
import ollama

# Function to interact with the Ollama API
def chat_with_context(history):
    response = ollama.chat(model='llama3', messages=history)
    return response['message']['content']

# Initialize the chat history
chat_history = []

# Step 1: Add Initial Task Description
task_description = "We need to predict the research paper type into one of the following categories: (Computer Science, Physics, Mathematics, Statistics, Quantitative Biology, and Quantitative Finance) based on its title and abstract. Think step-by-step through how you would approach this task"
chat_history.append({
    'role': 'user',
    'content': task_description
})

# Step 2: Generate Samples
generate_samples_request = "Please generate a few samples that are likely to be identified as a certain label and a few samples that are likely to confuse the model into making wrong predictions."
chat_history.append({
    'role': 'user',
    'content': generate_samples_request
})

samples_response = chat_with_context(chat_history)
chat_history.append({
    'role': 'assistant',
    'content': samples_response
})
print("Generated Samples:")
print(samples_response)

# Step 3: Analyze Samples
analyze_samples_request = f"Here are some samples: {samples_response}\n  What keywords, phrases, or patterns distinguish each label? What pitfalls or sources of confusion did you encounter? "
chat_history.append({
    'role': 'user',
    'content': analyze_samples_request
})

analysis_response = chat_with_context(chat_history)
chat_history.append({
    'role': 'assistant',
    'content': analysis_response
})
print("Analysis of Samples:")
print(analysis_response)

# Step 4: Identify Criteria
identify_criteria_request = f"Based on the following analysis: {analysis_response}\n provide guidelines for accurately classifying sample into specific label. What features or cues should the model focus on?"
chat_history.append({
    'role': 'user',
    'content': identify_criteria_request
})

criteria_response = chat_with_context(chat_history)
chat_history.append({
    'role': 'assistant',
    'content': criteria_response
})
print("Criteria for Identification:")
print(criteria_response)

# Step 5: Generate Optimized Prompt
generate_prompt_request = f"Based on the following criteria: {criteria_response}\n Generate an optimized prompt for the model to predict the research paper type. Make sure the prompt highlights essential features and minimizes potential sources of confusion. Respond with only the content of optimized prompt."
chat_history.append({
    'role': 'user',
    'content': generate_prompt_request
})

optimized_prompt_response = chat_with_context(chat_history)
chat_history.append({
    'role': 'assistant',
    'content': optimized_prompt_response
})
print("Optimized Prompt:")
print(optimized_prompt_response)


Generated Samples:
Here are some sample research paper titles and abstracts, along with their intended categories:

**Samples that should be easily classified:**

1. **Computer Science:** "Efficient Algorithm for Solving Traveling Salesman Problems using Genetic Programming"
	* Abstract: This paper proposes a novel algorithm for solving the classic traveling salesman problem using genetic programming. The proposed approach is compared to existing methods and shows improved performance.
2. **Physics:** "Measurement of Quantum Entanglement in Superconducting Circuits"
	* Abstract: In this study, we investigate the measurement of quantum entanglement in superconducting circuits. Our results demonstrate a significant improvement in entanglement measurement precision using a novel technique.
3. **Mathematics:** "New Proof of the Four-Color Theorem using Topological Methods"
	* Abstract: This paper presents a new proof of the four-color theorem using topological methods. We show that our app

In [26]:
import ollama

# Function to interact with the Ollama API
def chat_with_context(history):
    response = ollama.chat(model='llama3', messages=history)
    return response['message']['content']

# Initialize the chat history
chat_history = []

# Step 1: Add Initial Task Description
task_description = "Classify research papers into their respective fields (Computer Science, Physics, Mathematics, Statistics, Quantitative Biology, Quantitative Finance) based on their titles and abstracts. Describe the task and relevant labels."
chat_history.append({
    'role': 'user',
    'content': task_description
})

# Step 2: Generate Samples
generate_samples_request = "Generate a series of thoughts to classify a research paper into its field. Start with the title and abstract, then identify key terms, concepts, and patterns, and finally, determine the paper's field. Show your thought process:"
chat_history.append({
    'role': 'user',
    'content': generate_samples_request
})

samples_response = chat_with_context(chat_history)
chat_history.append({
    'role': 'assistant',
    'content': samples_response
})
print("Generated Samples:")
print(samples_response)

# Step 3: Analyze Samples
analyze_samples_request = f"Reflect on your thought process. What were the key decisions and features that led you to classify the paper into its field? Provide guidelines for accurately classifying research papers..."
chat_history.append({
    'role': 'user',
    'content': analyze_samples_request
})

analysis_response = chat_with_context(chat_history)
chat_history.append({
    'role': 'assistant',
    'content': analysis_response
})
print("Analysis of Samples:")
print(analysis_response)

# # Step 4: Identify Criteria
# identify_criteria_request = f"Based on the following analysis: {analysis_response}\n provide guidelines for accurately classifying sample into specific label. What features or cues should the model focus on?"
# chat_history.append({
#     'role': 'user',
#     'content': identify_criteria_request
# })

# criteria_response = chat_with_context(chat_history)
# chat_history.append({
#     'role': 'assistant',
#     'content': criteria_response
# })
# print("Criteria for Identification:")
# print(criteria_response)

# Step 5: Generate Optimized Prompt
generate_prompt_request = f"Use your reflected thought process to generate an optimized prompt for classifying research papers into their fields. Make sure the prompt highlights essential features and minimizes potential sources of confusion. Output only the content of the optimized prompt."
chat_history.append({
    'role': 'user',
    'content': generate_prompt_request
})

optimized_prompt_response = chat_with_context(chat_history)
chat_history.append({
    'role': 'assistant',
    'content': optimized_prompt_response
})
print("Optimized Prompt:")
print(optimized_prompt_response)


Generated Samples:
Let's say we have a research paper with the following title and abstract:

**Title:** "Efficient Computation of Optimal Trading Strategies using Deep Reinforcement Learning"

**Abstract:** "We propose a novel approach to computing optimal trading strategies by combining deep reinforcement learning techniques with Monte Carlo methods. Our method leverages the power of neural networks to learn complex market dynamics, while also incorporating risk management constraints. We demonstrate the effectiveness of our approach through simulations and experiments on real-world financial data."

Here's my thought process to classify this paper:

**Step 1: Initial Impression**
The title immediately catches my attention with its mention of "deep reinforcement learning" and "optimal trading strategies". I get a sense that this paper might be related to computer science, possibly even machine learning or artificial intelligence.

**Step 2: Key Terms Extraction**
I extract the follow

In [27]:
import pandas as pd
import ollama

# Load the data
data_path = 'dataset/research'
train_df = pd.read_csv(data_path + '/train.csv')
test_df = pd.read_csv(data_path + '/test.csv')

# print(train_df.head())
# print(test_df.head())

train_text = train_df[['TITLE', 'ABSTRACT']]
train_labels = train_df[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]
test_text = test_df[['TITLE', 'ABSTRACT']]

labels = []
for index, row in train_labels.iterrows():
    for i, label in enumerate(row):
        if label == 1:
            labels.append(train_labels.columns[i])

# Function to interact with the Ollama API
def get_prediction(title, abstract):
    response = ollama.generate(model='llama3', prompt=f"Title: {title}\nAbstract:{abstract}\n {optimized_prompt_response} Only respond with the category name.")
    return response['response']

# Predict for the test set
test_predictions = []
for index, row in train_text.iterrows():
    if index % 100 == 0:
        print(f"Predicting for sample {index+1}...")
    if index == 1000:
        break
    prediction = get_prediction(row['TITLE'], row['ABSTRACT'])
    test_predictions.append(prediction)

# Display some predictions
for i, prediction in enumerate(test_predictions[:5]):
    print(f"Test Sample {i+1}:")
    print(f"Title: {test_text.iloc[i]['TITLE']}")
    print(f"Abstract: {test_text.iloc[i]['ABSTRACT']}")
    print(f"Predicted Category: {prediction}\n")
    print(f"Actual Category: {labels[i]}\n")\
    
# calculate the accuracy
correct = 0
for i, prediction in enumerate(test_predictions):
    if prediction == labels[i]:
        correct += 1
accuracy = correct / len(test_predictions)
print(f"Accuracy: {accuracy}")

Predicting for sample 1...
Predicting for sample 101...
Predicting for sample 201...
Predicting for sample 301...
Predicting for sample 401...
Predicting for sample 501...
Predicting for sample 601...
Predicting for sample 701...
Predicting for sample 801...
Predicting for sample 901...
Predicting for sample 1001...
Test Sample 1:
Title: Closed-form Marginal Likelihood in Gamma-Poisson Matrix Factorization
Abstract:   We present novel understandings of the Gamma-Poisson (GaP) model, a
probabilistic matrix factorization model for count data. We show that GaP can
be rewritten free of the score/activation matrix. This gives us new insights
about the estimation of the topic/dictionary matrix by maximum marginal
likelihood estimation. In particular, this explains the robustness of this
estimator to over-specified values of the factorization rank, especially its
ability to automatically prune irrelevant dictionary columns, as empirically
observed in previous work. The marginalization of the 

In [24]:
# print some misclassified samples
count = 0
for i, prediction in enumerate(test_predictions):
    if prediction != labels[i]:
        print(f"Test Sample {i+1}:")
        print(f"Title: {test_text.iloc[i]['TITLE']}")
        print(f"Abstract: {test_text.iloc[i]['ABSTRACT']}")
        print(f"Predicted Category: {prediction}\n")
        print(f"Actual Category: {labels[i]}\n")
        count += 1
    if count == 5:
        break

Test Sample 6:
Title: Pairwise Difference Estimation of High Dimensional Partially Linear Model
Abstract:   This paper proposes a regularized pairwise difference approach for estimating
the linear component coefficient in a partially linear model, with consistency
and exact rates of convergence obtained in high dimensions under mild scaling
requirements. Our analysis reveals interesting features such as (i) the
bandwidth parameter automatically adapts to the model and is actually
tuning-insensitive; and (ii) the procedure could even maintain fast rate of
convergence for $\alpha$-Hölder class of $\alpha\leq1/2$. Simulation studies
show the advantage of the proposed method, and application of our approach to a
brain imaging data reveals some biological patterns which fail to be recovered
using competing methods.

Predicted Category: Mathematics

Actual Category: Statistics

Test Sample 7:
Title: Dissecting the multivariate extremal index and tail dependence
Abstract:   A central issue in