In [3]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.6.2-cp39-cp39-win_amd64.whl (647 kB)
     ------------------------------------ 647.2/647.2 kB 715.7 kB/s eta 0:00:00
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from implicit.bpr import BayesianPersonalizedRanking
from sklearn.preprocessing import OneHotEncoder


In [2]:

student_info = pd.read_csv('studentInfo.csv')
student_enrollments = student_info[['id_student', 'code_module']]


In [3]:
print(student_enrollments)

       id_student code_module
0           11391         AAA
1           28400         AAA
2           30268         AAA
3           31604         AAA
4           32885         AAA
...           ...         ...
32588     2640965         GGG
32589     2645731         GGG
32590     2648187         GGG
32591     2679821         GGG
32592     2684003         GGG

[32593 rows x 2 columns]


## This here counts the total number of unique courses a specific user is enrolled in 

In [17]:
student_counts = student_enrollments.groupby('id_student')['code_module'].nunique().reset_index(name='counts')
print(student_counts)
print(student_counts.shape)

       id_student  counts
0            3733       1
1            6516       1
2            8462       1
3           11391       1
4           23629       1
...           ...     ...
28780     2698591       1
28781     2702660       1
28782     2707979       1
28783     2710343       1
28784     2716795       1

[28785 rows x 2 columns]
(28785, 2)


In [19]:
students_filtered_counts = student_counts[student_counts['counts'] >= 3]
print(students_filtered_counts)
print(students_filtered_counts.shape)

       id_student  counts
2290       279883       3
4350       399863       3
5237       441540       3
5974       479607       3
6394       490297       3
6532       493793       3
6887       502004       3
8014       524431       3
8876       537811       3
10091      551528       3
10717      557085       3
12012      571950       3
12056      572154       3
15873      600814       3
16547      605180       3
20207      632074       3
22011      649230       3
26020     1554143       3
28279     2599883       3
28667     2681198       3
(20, 2)


## Duplicates here may happen because a student may have been enrolled in the same course at different presentations, which is not of our interest

In [21]:
student_enrollments_filtered = student_enrollments.drop_duplicates()

print(student_enrollments_filtered)

       id_student code_module
0           11391         AAA
1           28400         AAA
2           30268         AAA
3           31604         AAA
4           32885         AAA
...           ...         ...
32588     2640965         GGG
32589     2645731         GGG
32590     2648187         GGG
32591     2679821         GGG
32592     2684003         GGG

[31284 rows x 2 columns]


## There are twenty users with >= total courses of 3, the 60 observations below are each (student, course) pair

In [23]:
filtered_enrollments = student_enrollments_filtered[student_enrollments_filtered['id_student'].isin(students_filtered_counts['id_student'])]
print(filtered_enrollments)
print(filtered_enrollments.shape)

       id_student code_module
6803       557085         BBB
8834       279883         CCC
8983       399863         CCC
9052       441540         CCC
9103       479607         CCC
9138       490297         CCC
9150       493793         CCC
9369       551528         CCC
9488       572154         CCC
9771       600814         CCC
9823       605180         CCC
10250      632074         CCC
10584     2681198         CCC
11096      502004         CCC
11183      524431         CCC
11225      537811         CCC
11405      571950         CCC
12869     1554143         CCC
13062     2599883         CCC
13557      479607         DDD
14587      279883         DDD
14762      399863         DDD
14950      490297         DDD
15158      537811         DDD
15389      571950         DDD
16322     2681198         DDD
16609      493793         DDD
16626      502004         DDD
16892      600814         DDD
17415     1554143         DDD
18602      649230         DDD
19331     2599883         DDD
19537     

## We can use this to encode things the traditional way into np using scikit-learn's OneHotEncoder, but we'll ignore it for now

In [24]:
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
#we can turn this into a numpy array by calling np.array on the ct.fit_transform
filtered_enrollments_encoded = ct.fit_transform(filtered_enrollments)

## This Does Exactly What We Want

In [77]:
filtered_enrollments_encoded = pd.get_dummies(filtered_enrollments, columns=['code_module']).reset_index(drop=True)
print(filtered_enrollments_encoded)

    id_student  code_module_BBB  code_module_CCC  code_module_DDD  \
0       557085                1                0                0   
1       279883                0                1                0   
2       399863                0                1                0   
3       441540                0                1                0   
4       479607                0                1                0   
5       490297                0                1                0   
6       493793                0                1                0   
7       551528                0                1                0   
8       572154                0                1                0   
9       600814                0                1                0   
10      605180                0                1                0   
11      632074                0                1                0   
12     2681198                0                1                0   
13      502004                0   

In [28]:
print(filtered_enrollments_encoded.shape)


(60, 7)


## We split the data, we'll suffice for 5 test rows, because we have 60 rows total

In [42]:
#not using this for now
test_enrollments = filtered_enrollments_encoded.sample(5)
print(test_enrollments.shape)
train_enrollments = filtered_enrollments_encoded.drop(test_enrollments.index)
print(train_enrollments.shape)


(5, 7)
(55, 7)


In [43]:
#not using this for now
train_matrix = train_enrollments.set_index('id_student')
print(train_matrix)

test_matrix = test_enrollments.set_index('id_student')
print(test_matrix)

            code_module_BBB  code_module_CCC  code_module_DDD  \
id_student                                                      
557085                    1                0                0   
279883                    0                1                0   
441540                    0                1                0   
479607                    0                1                0   
490297                    0                1                0   
493793                    0                1                0   
551528                    0                1                0   
572154                    0                1                0   
600814                    0                1                0   
605180                    0                1                0   
632074                    0                1                0   
2681198                   0                1                0   
502004                    0                1                0   
524431                   

In [84]:
train_ready_filtered_enrollments = filtered_enrollments_encoded.drop(columns='id_student')
print(train_ready_filtered_enrollments)

    code_module_BBB  code_module_CCC  code_module_DDD  code_module_EEE  \
0                 1                0                0                0   
1                 0                1                0                0   
2                 0                1                0                0   
3                 0                1                0                0   
4                 0                1                0                0   
5                 0                1                0                0   
6                 0                1                0                0   
7                 0                1                0                0   
8                 0                1                0                0   
9                 0                1                0                0   
10                0                1                0                0   
11                0                1                0                0   
12                0                1  

## Now we train the model

In [85]:
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking
# initialize model
model = BayesianPersonalizedRanking()

# train model on training data
model.fit(csr_matrix(train_ready_filtered_enrollments))

  0%|          | 0/100 [00:00<?, ?it/s]

## Test Recommendations

In [None]:
# get recommendations for a test user
user = test_enrollments.loc['id_student' == 2599883]
print(user)
print(test_matrix.loc[2599883])

In [95]:
user_id = filtered_enrollments_encoded.iloc[0]['id_student']
print(user_id)
user_rows = filtered_enrollments_encoded[filtered_enrollments_encoded['id_student'] == user_id]
user_rows = user_rows.drop(columns='id_student')
print(user_rows)

557085
    code_module_BBB  code_module_CCC  code_module_DDD  code_module_EEE  \
0                 1                0                0                0   
52                0                0                0                0   
59                0                0                0                0   

    code_module_FFF  code_module_GGG  
0                 0                0  
52                1                0  
59                0                1  


In [82]:
print(csr_matrix(user_rows))

  (0, 1)	1
  (1, 2)	1
  (2, 4)	1


In [97]:
user_items = csr_matrix(user_rows)
num_recommendations = 3
recommended_courses = model.recommend(0, user_items, N=num_recommendations)


ValueError: user_items must contain 1 row for every user in userids

In [100]:
grouped_enrollments = filtered_enrollments_encoded.groupby('id_student').sum()
user_id3 = grouped_enrollments.index[0]
user_rows3 = grouped_enrollments.loc[user_id3]
print(grouped_enrollments)
print(user_id3)
print(user_rows3)


            code_module_BBB  code_module_CCC  code_module_DDD  \
id_student                                                      
279883                    0                1                1   
399863                    0                1                1   
441540                    0                1                0   
479607                    0                1                1   
490297                    0                1                1   
493793                    0                1                1   
502004                    0                1                1   
524431                    0                1                0   
537811                    0                1                1   
551528                    0                1                0   
557085                    1                0                0   
571950                    0                1                1   
572154                    0                1                0   
600814                   

In [104]:
user_rows3 = grouped_enrollments.loc[user_id3]
print(user_rows3)

[0 1 1 0 1 0]


In [105]:
user_items = csr_matrix(user_rows3)
num_recommendations = 3
recommended_courses = model.recommend(0, user_items, N=num_recommendations)

In [108]:
print(recommended_courses)

(array([5, 0, 3]), array([-0.00855518, -0.02924612, -0.06822795], dtype=float32))


In [111]:
print(recommended_courses[0])
print(recommended_courses[1])
print(grouped_enrollments.iloc[0])
print(grouped_enrollments.iloc[3])
print(grouped_enrollments.iloc[5])


[5 0 3]
[-0.00855518 -0.02924612 -0.06822795]
code_module_BBB    0
code_module_CCC    1
code_module_DDD    1
code_module_EEE    0
code_module_FFF    1
code_module_GGG    0
Name: 279883, dtype: uint8
code_module_BBB    0
code_module_CCC    1
code_module_DDD    1
code_module_EEE    1
code_module_FFF    0
code_module_GGG    0
Name: 479607, dtype: uint8
code_module_BBB    0
code_module_CCC    1
code_module_DDD    1
code_module_EEE    0
code_module_FFF    1
code_module_GGG    0
Name: 493793, dtype: uint8


In [107]:
recommended_indices = recommended_courses[0]
recommended_scores = recommended_courses[1]

recommended_courses_df = filtered_enrollments_encoded.iloc[recommended_indices][['code_module_BBB', 'code_module_CCC', 'code_module_DDD', 'code_module_EEE', 'code_module_FFF', 'code_module_GGG']]

# Print the recommended courses along with their scores
for index, row in recommended_courses_df.iterrows():
    print(f"Course: {row}\tScore: {recommended_scores[index]}")

IndexError: index 5 is out of bounds for axis 0 with size 3

Fix the train & test sample split, since every user is existent for the number of times he has enrolled, so if he enrolled in 4 courses, he'd be in the dataset 4 times, the problem is, when we sample randomly, we might get let's say 2 of those 4 times he's present in the data, and the data just becomes messed up, so i'll user the filtered_enrollments directly because it's pre seperation of train test, we might keep using that

## The logic for the recommendations is ready, side note: maybe grouped_enrollments is what needs to be fed to the training data, the sum of enrollments encoded, and each user appears once, so that's something to consider so keep it and try it later on, also write the code to display and visualize the recommended courses, also clean the other code cells and try other algorithms