In [13]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load your dataset
df = pd.read_csv('dataset/train.csv')

# Define a preprocessing function
def preprocess(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    
    return ' '.join(stemmed_tokens)

# Apply preprocessing to a specific column (adjust 'text_column' to your column name)
df['processed_context'] = df['context'].apply(preprocess)
df['processed_question'] = df['question'].apply(preprocess)
df['processed_answer'] = df['Answer'].apply(preprocess)


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>


In [12]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    # Use the pooled output for sentence-level representation
    embeddings = outputs.pooler_output
    return embeddings.detach().numpy()

# Apply BERT embeddings extraction to each preprocessed column
df['bert_context'] = df['processed_context'].apply(get_bert_embedding)
df['bert_question'] = df['processed_question'].apply(get_bert_embedding)
df['bert_answer'] = df['processed_answer'].apply(get_bert_embedding)


In [17]:
df['bert_answer'] = df['processed_answer'].apply(get_bert_embedding)

In [16]:
df

Unnamed: 0,context,question,Answer,Label,processed_context,processed_question,processed_answer,context_logical_markers,question_logical_markers,answer_logical_markers
0,"Some Cantonese don't like chili, so some south...",Which of the following can guarantee the above...,D.Some Cantonese like neither peppers nor sweets,0,"cantones n't like chili , southern n't like ch...",follow guarante argument ?,d.some cantones like neither pepper sweet,0,0,0
1,Continuous exposure to indoor fluorescent ligh...,Which of the following questions was the initi...,A.Can hospital light therapy be proved to prom...,1,continu exposur indoor fluoresc light benefici...,follow question initi motiv conduct experi ?,a.can hospit light therapi prove promot patien...,0,0,0
2,There is no doubt that minors should be prohib...,"In order to evaluate the above argument, which...",B.How much inconvenience does the ban on the u...,1,"doubt minor prohibit smoking.howev , explicitl...","order evalu argument , follow question import ?",b.how much inconveni ban use automat vend mach...,0,0,0
3,A research report states that a special educat...,Which of the following best illustrates the lo...,B.Establishing such education and training pro...,0,research report state special educ program chi...,follow best illustr logic loophol summar ?,b.establish educ train program nation basi req...,0,0,0
4,"The traitor is a traitor, so you are a traitor...",Which of the following makes the same logical ...,"C.The earth is a sphere, which can be proved f...",1,"traitor traitor , traitor , patriot.th word pa...",follow make logic mistak ?,"c.the earth sphere , prove fact stand height w...",0,0,0
...,...,...,...,...,...,...,...,...,...,...
7371,"A warehouse was stolen.After investigation, it...",Now suppose that only one of the four people s...,C.C is the criminal who steals the warehouse.,0,"warehous stolen.aft investig , ascertain zuomo...",suppos one four peopl spoke truth.then,c.c crimin steal warehous .,0,0,0
7372,"A warehouse was stolen.After investigation, it...",Now suppose that only one of the four people c...,A.A is a criminal who steals a warehouse.,0,"warehous stolen.aft investig , ascertain zuomo...",suppos one four peopl confess falsehood.then,a.a crimin steal warehous .,0,0,0
7373,"The three members A.B, and C discuss the meani...",The following conclusion is correct?,"D.B's opinion is correct, A and C's opinion is...",1,"three member a.b , c discuss mean principl `` ...",follow conclus correct ?,"d.b 's opinion correct , c 's opinion incorrect .",0,0,0
7374,"In a restaurant, all dishes are either Sichuan...",Which of the following can enhance the above a...,A.The restaurant stipulates that when ordering...,1,"restaur , dish either sichuan cuisin cantones ...",follow enhanc argument ?,"a.th restaur stipul order cantones cuisin , or...",0,0,0


In [14]:
def count_logical_markers(text):
    # Define a set of logical markers
    logical_markers = {
        'because', 'therefore', 'if', 'then', 'not', 'never'
    }
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Count the occurrences of each logical marker in the text
    return sum(token in logical_markers for token in tokens)

# Apply the function to each preprocessed text column
df['context_logical_markers'] = df['processed_context'].apply(count_logical_markers)
df['question_logical_markers'] = df['processed_question'].apply(count_logical_markers)
df['answer_logical_markers'] = df['processed_answer'].apply(count_logical_markers)


In [15]:
import numpy as np

df['combined_features'] = df.apply(lambda row: np.concatenate([
    row['bert_context'],
    row['bert_question'],
    row['bert_answer'],
    np.array([
        row['context_logical_markers'],
        row['question_logical_markers'],
        row['answer_logical_markers']
    ])
]), axis=1)

KeyError: 'bert_context'

In [None]:
def get_prompt(df):
    context = df.context
    question = df.question
    answer = df.Answer
    return f"""Given that "{context}", do you think this is logically correct: "{answer}".
Answer 1 if it is correct, otherwise answer 0. Your answer:"""

In [None]:
test = pd.read_csv('../dataset/test.csv')
# for i in range(len(test)):
for i in range(10):
    prompt = get_prompt(test.iloc[i])