In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans


Item based - Recommends subjects to a user from item-to-item similarity

In [25]:
# Data Parsing
def parse_subject_data(row):
    """
    Parse one raw survey cell into a ``{subject_code: enjoyment}`` dictionary.

    The expected text is a comma-separated list of alternating fields, e.g.
    ``"Subject code: COMP10001, Enjoyment: 2, Subject code: COMP10002, Enjoyment: 9"``.

    Parameters
    ----------
    row : str
        Raw cell text. Non-string values (for example the NaN that pandas uses
        for an empty cell) and blank strings are treated as "nothing rated".

    Returns
    -------
    dict
        Maps each subject code (str) to its enjoyment rating (int). If a
        subject appears more than once, the last rating wins.

    Raises
    ------
    IndexError
        If a subject field has no matching enjoyment field, or a field has no
        ``:`` separator.
    ValueError
        If an enjoyment value is not an integer.
    """
    # Missing cells carry no ratings; return an empty mapping instead of crashing
    if not isinstance(row, str) or not row.strip():
        return {}

    # Split on the bare comma and strip, so "a, b" and "a,b" parse the same way
    fields = [field.strip() for field in row.split(",")]

    subjects = {}
    # Fields alternate: "Subject code: <code>", "Enjoyment: <rating>"
    for position in range(0, len(fields), 2):
        subject = fields[position].split(":", 1)[1].strip()
        enjoyment = int(fields[position + 1].split(":", 1)[1])
        subjects[subject] = enjoyment
    return subjects

# Read the raw survey responses from disk
data = pd.read_csv('form3.csv')

# Swap each raw `subjects` string for its {subject: enjoyment} dictionary
data['subjects'] = data['subjects'].apply(parse_subject_data)

# Build a student x subject ratings table; subjects a student did not rate become NaN
subject_ratings = pd.DataFrame(list(data['subjects']), index=data['student_id'])
print("original subject_ratings\n", subject_ratings)

# Fill every gap with that subject's mean rating (column-wise mean imputation)
imputed_df = subject_ratings.fillna(value=subject_ratings.mean(axis=0))

def get_itembased_scores(student_id, item, df, n=3):
    """
    Predict `student_id`'s rating for `item` from the `n` most similar rated items.

    Missing ratings in `df` are filled with each item's column mean, and items
    are compared on those filled columns using
    ``similarity = 1 / (1 + Euclidean distance)``. The prediction is the
    similarity-weighted average of the student's own ratings for the `n` items
    most similar to `item`, chosen among the items the student has rated.

    Parameters
    ----------
    student_id : label
        Row label of the student in `df`.
    item : label
        Column label of the item whose rating is predicted.
    df : pandas.DataFrame
        Ratings table (rows = students, columns = items) holding NaN where a
        student has not rated an item.
    n : int, default 3
        Number of most-similar rated items used in the weighted average.

    Returns
    -------
    float
        The predicted rating, or NaN if the student has not rated any item.
    """
    # Impute from `df` itself so the result never depends on a notebook global
    imputed = df.fillna(df.mean())

    # Only items the student has actually rated can contribute a rating
    current_ratings = df.loc[student_id, :]
    rated_items = current_ratings.dropna().index
    if len(rated_items) == 0:
        return np.nan

    # Euclidean distance between `item`'s column and every rated item's column.
    # skipna=False keeps NaN distances (item nobody rated) instead of treating them as 0.
    differences = imputed[rated_items].sub(imputed[item], axis=0)
    distances = np.sqrt((differences ** 2).sum(axis=0, skipna=False))
    similarity = 1 / (1 + distances)

    # Labels of the n rated items most similar to `item`
    top_n = similarity.sort_values(ascending=False).head(n).index

    # Similarity-weighted average of the student's ratings for those items
    weights = similarity[top_n]
    predicted_score = (current_ratings[top_n] * weights).sum() / weights.sum()

    return predicted_score

# Student to recommend for, and how many subjects to suggest
target_student_id = 1
n_recommendations = 3

# Predict a score for every subject the student has not rated yet
recommendations = []
for subject in subject_ratings.columns:
    if np.isnan(subject_ratings.loc[target_student_id, subject]):
        score = get_itembased_scores(target_student_id, subject, subject_ratings)
        # A NaN score cannot be ordered against the others, so leave it out
        if not np.isnan(score):
            recommendations.append((subject, score))

# Keep the subjects with the highest predicted scores
recommendations = sorted(recommendations, key=lambda pair: pair[1], reverse=True)[:n_recommendations]
print(f"\nTop recommendations for user {target_student_id}: ", recommendations)

original subject_ratings
             COMP10001  COMP10002  COMP10003  SWEN10003
student_id                                            
1                 2.0          2        2.0        NaN
2                 9.0          9        1.0        NaN
3                 8.0          8        NaN        1.0
4                 NaN          7        7.0        7.0
5                10.0         10       10.0        NaN
6                 1.0          1        1.0        NaN
7                 5.0          5        5.0        NaN
8                 3.0          3        3.0        NaN
imputed_df
             COMP10001  COMP10002  COMP10003  SWEN10003
student_id                                            
1            2.000000          2   2.000000        4.0
2            9.000000          9   1.000000        4.0
3            8.000000          8   4.142857        1.0
4            5.428571          7   7.000000        7.0
5           10.000000         10  10.000000        4.0
6            1.000000      

User based - Predicts a user's rating of a subject

In [8]:
# Load the raw survey responses (one row per student)
survey_df = pd.read_csv('form3.csv')

# Reshape each input row, e.g.
#   (1, "Subject code: COMP10001, Enjoyment: 2, Subject code: COMP10002, Enjoyment: 2")
# into one (item, user, rating) entry per subject that the student rated
ratings_dict = {
    "item" : [],
    "user" : [],
    "rating" : []
}

# Walk the two columns directly; the row index is not needed
for student_id, subjects_text in zip(survey_df["student_id"], survey_df["subjects"]):
    fields = subjects_text.split(", ")
    # Fields alternate: "Subject code: <code>", "Enjoyment: <rating>"
    for position in range(0, len(fields), 2):
        subject = fields[position].split(": ")[1]
        rating = int(fields[position + 1].split(": ")[1])
        ratings_dict['item'].append(subject)
        ratings_dict['rating'].append(rating)
        ratings_dict['user'].append(student_id)

print(ratings_dict)

# Long-format ratings table: one row per (user, item, rating)
df = pd.DataFrame(ratings_dict)

# Wrap the table as a Surprise dataset; ratings are declared to lie in 1..10
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

# User-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": True,
}
algo = KNNWithMeans(sim_options=sim_options)

# Fit on every available rating (no hold-out split)
training_set = data.build_full_trainset()
algo.fit(training_set)

# Estimate the rating of one (user, subject) pair
target_user = 7
target_item = "SWEN10003"
prediction = algo.predict(target_user, target_item)
print(prediction.est)

{'item': ['COMP10001', 'COMP10002', 'COMP10003', 'COMP10001', 'COMP10002', 'COMP10003', 'COMP10001', 'COMP10002', 'SWEN10003', 'COMP10002', 'SWEN10003', 'COMP10003', 'COMP10001', 'COMP10002', 'COMP10003', 'COMP10001', 'COMP10002', 'COMP10003', 'COMP10001', 'COMP10002', 'COMP10003', 'COMP10001', 'COMP10002', 'COMP10003'], 'user': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8], 'rating': [2, 2, 2, 9, 9, 1, 8, 8, 1, 7, 7, 7, 10, 10, 10, 1, 1, 1, 5, 5, 5, 3, 3, 3]}
Computing the cosine similarity matrix...
Done computing similarity matrix.
2.6666666666666665
