In [1]:
import pandas as pd
import numpy as np

# Load the raw ratings, drop `course_id`, order rows by user, and expose
# `number_course_ID` under the shorter name `courseID`.
df_reviews_all = pd.read_csv("rating_coursera_final.csv")
df_reviews = (
    df_reviews_all
    .drop(columns=['course_id'])
    .sort_values(by='userID')
    .rename(columns={'number_course_ID': 'courseID'})
)

# Re-label users with consecutive integers starting at 1, numbered in the
# order the original ids appear in the sorted frame.
user_mapping = {
    old_user_id: new_user_id
    for new_user_id, old_user_id in enumerate(df_reviews['userID'].unique(), start=1)
}

# Apply the new ids, then remove rows that repeat the same
# (userID, courseID, rating) triple.
df_reviews = (
    df_reviews
    .assign(userID=lambda frame: frame['userID'].map(user_mapping))
    .drop_duplicates(subset=['userID', 'courseID', 'rating'])
)
df_reviews

Unnamed: 0,userID,courseID,rating
818688,1,255,5
301798,2,449,5
1127762,3,257,5
511834,4,560,5
745018,5,506,5
...,...,...,...
1236375,287806,172,1
732413,287807,300,4
1259484,287807,495,1
1262903,287807,112,1


In [2]:
# Every row whose (userID, courseID) pair occurs more than once; keep=False
# flags all members of such a group, not only the repeats. Identical triples
# were dropped earlier, so these rows differ in their rating.
duplicates = df_reviews.loc[
    df_reviews.duplicated(subset=['userID', 'courseID'], keep=False)
]
duplicates

Unnamed: 0,userID,courseID,rating
186820,18,211,2
197885,18,211,5
1210437,371,63,4
1210770,371,63,5
1289904,929,253,5
...,...,...,...
238467,282004,339,4
1166584,282432,459,3
1167218,282432,459,5
1167293,283876,459,5


In [3]:
# Collapse repeated (userID, courseID) pairs into a single row by averaging
# the remaining columns; as_index=False keeps the keys as ordinary columns.
df_reviews = (
    df_reviews
    .groupby(by=['userID', 'courseID'], as_index=False)
    .mean()
)
df_reviews

Unnamed: 0,userID,courseID,rating
0,1,255,5.0
1,2,449,5.0
2,3,257,5.0
3,4,560,5.0
4,5,506,5.0
...,...,...,...
506111,287806,172,1.0
506112,287807,112,1.0
506113,287807,300,4.0
506114,287807,495,1.0


In [114]:
# Persist the de-duplicated ratings. `index` is left at its default, so the
# frame's index is written as the first, unnamed CSV column.
# NOTE(review): if no consumer needs that column, pass index=False so the
# file does not come back with an "Unnamed: 0" column — confirm with readers.
df_reviews.to_csv('no_duplicates_ratings.csv')

In [4]:
# Wide table with one row per user and one column per course, holding the
# rating; (user, course) pairs without a rating are NaN after the pivot and
# are replaced by 0.
user_item_matrix = (
    df_reviews
    .pivot(index='userID', columns='courseID', values='rating')
    .fillna(0)
)
user_item_matrix

courseID,0,1,2,3,4,5,6,7,8,9,...,594,595,596,597,598,599,600,601,602,603
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Sanity check: count the users whose ratings add up to zero across all courses.
num_users_with_all_zeros = user_item_matrix.sum(axis=1).eq(0).sum()

print(f"Number of users with all zeros: {num_users_with_all_zeros}")

Number of users with all zeros: 0


In [5]:
# Inspect the Python type of the pivoted matrix.
type(user_item_matrix)

pandas.core.frame.DataFrame

In [6]:
def print_rating_details_for_a_pair_user_course(user_id, course_id, user_item_matrix=None):
    """Print the rating stored for one (user, course) pair.

    Parameters
    ----------
    user_id : hashable
        Row label (userID) to look up.
    course_id : hashable
        Column label (courseID) to look up.
    user_item_matrix : pandas.DataFrame, optional
        Matrix with users as rows and courses as columns. When omitted, the
        notebook-level ``user_item_matrix`` is looked up at call time.

    Returns
    -------
    None
        The result is only printed.

    Raises
    ------
    NameError
        If no matrix is passed and no global ``user_item_matrix`` exists.
    KeyError
        If ``user_id`` or ``course_id`` is not a label of the matrix.
    """
    if user_item_matrix is None:
        # Resolve the default on every call. A default written as
        # `user_item_matrix=user_item_matrix` is evaluated once, when the
        # `def` runs, so the function would keep reading an outdated matrix
        # after the matrix cell is re-executed.
        try:
            user_item_matrix = globals()["user_item_matrix"]
        except KeyError:
            raise NameError(
                "user_item_matrix is not defined; build it first or pass it explicitly"
            ) from None

    rating = user_item_matrix.at[user_id, course_id]
    print(f"The rating for userID {user_id} and courseID {course_id} is: {rating}")
    return

In [113]:
# Look up a single (userID, courseID) rating using the function's default matrix.
print_rating_details_for_a_pair_user_course(283876, 459)

The rating for userID 283876 and courseID 459 is: 4.5


## The idea is to find similar users based on their ratings. Using each user's ratings, we aim to suggest similar content to similar users.

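### Note that we should also set a similarity threshold (in the code below, `similarity_threshold` is compared against each neighbour's min-max-scaled course rating)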

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

def generate_recommendations(user_item_matrix, user_id_to_recommend_for, n_neighbors=5, similarity_threshold=0.2, top_n=10):
    """Recommend courses a user has not rated, based on the ratings of similar users.

    Every course column is min-max scaled, a cosine-distance nearest-neighbour
    model is fitted on all users, and the courses that the nearest users rated
    but the target user did not are collected and ranked.

    Parameters
    ----------
    user_item_matrix : pandas.DataFrame
        Users as rows, courses as columns. A raw value of ``0.0`` is treated
        as "not rated by this user".
    user_id_to_recommend_for : hashable
        Row label of the user to recommend for.
    n_neighbors : int, default 5
        Number of *other* users to consult.
    similarity_threshold : float, default 0.2
        Exclusive lower bound a neighbour's scaled rating must exceed for the
        course to be considered. Despite its name it is compared against the
        neighbour's scaled rating, not against a user-to-user similarity.
    top_n : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    list of tuple
        ``(course, score)`` pairs sorted by ``score`` in descending order,
        where ``score`` is the scaled rating given by the closest neighbour
        that rated the course. Ties keep neighbour order (closest first).

    Raises
    ------
    KeyError
        If ``user_id_to_recommend_for`` is not a row label of the matrix.
    """
    scaler = MinMaxScaler()
    normalized_user_item_matrix = pd.DataFrame(
        scaler.fit_transform(user_item_matrix),
        columns=user_item_matrix.columns,
        index=user_item_matrix.index,
    )

    # kNN Model
    # The target user is part of the fitted data and normally comes back as
    # their own closest match, so one extra neighbour is requested and the
    # user is filtered out below. The request is capped at the number of rows
    # because kneighbors cannot return more neighbours than fitted samples.
    n_to_query = min(n_neighbors + 1, len(normalized_user_item_matrix))
    knn_model = NearestNeighbors(n_neighbors=n_to_query, metric='cosine')
    knn_model.fit(normalized_user_item_matrix)

    user_data = normalized_user_item_matrix.loc[user_id_to_recommend_for].values.reshape(1, -1)
    _, neighbor_indices = knn_model.kneighbors(user_data)

    row_labels = normalized_user_item_matrix.index
    neighbor_positions = [
        position
        for position in neighbor_indices[0]
        if row_labels[position] != user_id_to_recommend_for
    ][:n_neighbors]

    print("Similar Users:")
    print(normalized_user_item_matrix.iloc[neighbor_positions].index)

    # Courses the target user has no rating for (raw value 0.0).
    not_yet_rated = user_item_matrix.loc[user_id_to_recommend_for] == 0.0

    # course -> score. Neighbours are visited closest first and only the first
    # score seen for a course is kept; a dict also makes the final ordering
    # deterministic for equal scores.
    score_by_course = {}
    for position in neighbor_positions:
        neighbor_courses = normalized_user_item_matrix.iloc[position]
        candidates = neighbor_courses[(neighbor_courses > similarity_threshold) & not_yet_rated]
        for course, similarity in candidates.items():
            score_by_course.setdefault(course, similarity)

    # NOTE: neighbours whose rated courses are all already rated by the target
    # user contribute nothing, so the result can legitimately be empty.
    sorted_recommendations = sorted(
        score_by_course.items(), key=lambda item: item[1], reverse=True
    )[:top_n]

    return sorted_recommendations

In [111]:
# Example run: recommendations for one user, consulting 10 neighbours and
# accepting any neighbour value above 0.
user_id_to_recommend_for = 300
recommendations = generate_recommendations(
    user_item_matrix,
    user_id_to_recommend_for,
    n_neighbors=10,
    similarity_threshold=0,
)

print(f"Top 10 recommended courses for user {user_id_to_recommend_for}:")
for course, similarity in recommendations:
    print(f"Course: {course}, Similarity: {similarity}")

Similar Users:
Index([115420, 75714, 136771, 97172, 253807, 52008, 262451, 132441, 101142,
       16569],
      dtype='int64', name='userID')
Top 10 recommended courses for user 300:


In [85]:
# Look up a single (userID, courseID) rating using the function's default matrix.
# NOTE(review): the matrix displayed in this notebook ends at userID 287807,
# so this lookup would raise KeyError against that matrix — the recorded
# output presumably comes from an earlier kernel state; confirm on a fresh run.
print_rating_details_for_a_pair_user_course(287808, 166)

The rating for userID 287808 and courseID 166 is: 1.0


In [112]:
# All rating rows of user 300 (the user queried in the recommendation example).
my_row_df_reviews_all = df_reviews.loc[df_reviews['userID'].eq(300)]
my_row_df_reviews_all

Unnamed: 0,userID,courseID,rating
427,300,86,5.0


In [97]:
# All rating rows of user 48749.
my_row_df_reviews_all = df_reviews.loc[df_reviews['userID'].eq(48749)]
my_row_df_reviews_all

Unnamed: 0,userID,courseID,rating
92382,48749,344,5.0
92383,48749,446,5.0
92384,48749,449,5.0


In [83]:
# All rating rows of user 199239.
my_row_df_reviews_all = df_reviews.loc[df_reviews['userID'].eq(199239)]
my_row_df_reviews_all

Unnamed: 0,userID,courseID,rating
362126,199239,339,5.0
362127,199239,344,5.0
362128,199239,388,5.0
362129,199239,446,5.0
362130,199239,449,4.5
362131,199239,457,4.0


In [None]:
# Read Coursera_courses.csv (presumably course metadata — verify its columns).
# NOTE(review): this rebinds `df_reviews_all`, which earlier in the notebook
# holds the raw ratings read from rating_coursera_final.csv; a separate name
# would avoid confusing the two tables — confirm nothing relies on the old binding.
df_reviews_all = pd.read_csv("Coursera_courses.csv")

In [11]:
# Display the user-item matrix again (pandas truncates the rendering).
user_item_matrix

courseID,0,1,2,3,4,5,6,7,8,9,...,594,595,596,597,598,599,600,601,602,603
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
