In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Step 1: Load the answers table and reduce it to the columns used for modelling
df = (
    pd.read_csv('answers.csv')
    .loc[:, ['j', 'qid', 'as']]  # answerer user id, question id, answer score
    .dropna()  # discard rows missing any of the three fields
)

In [None]:
# Step 2: Data preparation and evaluation for each question
#
# For every question, fit an SVD model on that question's
# (answerer, question, score) rows and count a "hit" whenever the score the
# model estimates for an observed (answerer, question) pair reaches
# `similarity_threshold`. The per-question rate is written to df['HitRate'];
# the pooled rate over all rows is printed as the overall hit rate.
#
# NOTE(review): each model is scored on the very rows it was fitted on, so
# these are in-sample figures, not held-out estimates.
similarity_threshold = 0.8  # Minimum estimated score that counts as a hit
random_seed = 42  # Fixed seed so repeated runs of this cell give the same numbers

total_hit_count = 0
total_count = 0
hit_rate_by_question = {}

# groupby walks the frame once (in order of first appearance) instead of
# re-filtering the whole frame for every question id.
for question_id, question_df in df.groupby('qid', sort=False):
    # The rating scale is taken from this question's own score range
    reader = Reader(rating_scale=(question_df['as'].min(), question_df['as'].max()))
    data = Dataset.load_from_df(question_df[['j', 'qid', 'as']], reader)
    trainset = data.build_full_trainset()

    # Collaborative filtering (Matrix Factorization with SVD)
    model = SVD(random_state=random_seed)
    model.fit(trainset)

    # The trainset holds only this question, so its testset needs no filtering
    testset_question = trainset.build_testset()
    predictions = model.test(testset_question)

    # prediction.est is the estimate for the actual (answerer, question) pair.
    # The score column must not be passed to model.predict as an item id, and
    # user ids must not be compared against scores.
    count = len(predictions)
    hit_count = sum(1 for prediction in predictions if prediction.est >= similarity_threshold)

    hit_rate = hit_count / count if count > 0 else 0  # Guard against division by zero
    hit_rate_by_question[question_id] = hit_rate

    total_hit_count += hit_count
    total_count += count

# Assign the whole column in one pass; float dtype from the start avoids
# writing fractional rates into an integer column.
df['HitRate'] = df['qid'].map(hit_rate_by_question).astype(float)

overall_hit_rate = total_hit_count / total_count if total_count > 0 else 0
print("Overall Hit Rate:", overall_hit_rate)


Overall Hit Rate: 0.6513933181473045
