### User Based Traditional Colaborative Filtering 

In [2]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

#### Preprocess: 
Replace NaNs with mean of the user 

In [3]:
df = pd.read_excel('Course Recommendation System.xlsx')
df=df.drop(['ID','Start time','Completion time','Email','Name','Name(not mandatory)\n' ], axis=1)
df['mean'] = df.mean(axis=0,skipna=True,numeric_only=True)
df = df.apply(lambda x : x.fillna(x.mean()),axis=0)
df.drop(axis="columns",labels=["mean"],inplace=True)
df.head()

Unnamed: 0,Data Structures and Algorithms,Computer Architecture\n,Discrete Mathematics\n,Economics,Programming-2,Machine Learning\n,Mathematics For Machine Learning\n,Visual Recognition\n,Natural Language Processing\n,Reinforcement Learning,...,Digital CMOS VLSI Design,System design with FPGA,ASIC design,VLSI Architecture Design\n,High level synthesis and optimization of Digital Circuits,Digital Sociology,Privacy in the Digital Age,Technology Ethics and AI,Techno-economics of networks,The Web and the Mind
0,0.3,0.85,0.85,0.9,0.7,0.7,0.8,0.620098,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.6,0.423286,0.396622,0.510897
1,0.7,0.8,0.2,0.8,0.8,0.5,0.1,0.2,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.5,0.423286,0.396622,0.510897
2,0.7,0.0,1.0,1.0,1.0,1.0,1.0,0.8,0.9,1.0,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.4,0.423286,0.396622,0.510897
3,0.65,0.7,0.2,0.75,0.75,0.75,0.75,0.75,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.8,0.452708,0.423286,0.8,0.510897
4,0.7,0.6,0.2,0.0,0.8,0.5,0.6,0.4,0.4,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Constants 

In [4]:
tt_ratio = 0.05
seed = 42 

In [5]:
df 

Unnamed: 0,Data Structures and Algorithms,Computer Architecture\n,Discrete Mathematics\n,Economics,Programming-2,Machine Learning\n,Mathematics For Machine Learning\n,Visual Recognition\n,Natural Language Processing\n,Reinforcement Learning,...,Digital CMOS VLSI Design,System design with FPGA,ASIC design,VLSI Architecture Design\n,High level synthesis and optimization of Digital Circuits,Digital Sociology,Privacy in the Digital Age,Technology Ethics and AI,Techno-economics of networks,The Web and the Mind
0,0.30,0.85,0.85,0.90,0.70,0.70,0.80,0.620098,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.600000,0.423286,0.396622,0.510897
1,0.70,0.80,0.20,0.80,0.80,0.50,0.10,0.200000,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.500000,0.423286,0.396622,0.510897
2,0.70,0.00,1.00,1.00,1.00,1.00,1.00,0.800000,0.900000,1.000000,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.400000,0.423286,0.396622,0.510897
3,0.65,0.70,0.20,0.75,0.75,0.75,0.75,0.750000,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.800000,0.452708,0.423286,0.800000,0.510897
4,0.70,0.60,0.20,0.00,0.80,0.50,0.60,0.400000,0.400000,0.200000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,0.00,1.00,0.50,0.70,0.60,0.50,0.30,0.700000,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.000000,0.423286,0.396622,0.510897
158,1.00,0.00,0.00,0.70,0.70,0.00,0.00,0.000000,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.452708,0.423286,0.396622,0.200000
159,0.60,0.30,0.40,0.90,0.80,0.70,0.40,0.500000,0.400000,0.200000,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,1.000000,0.423286,0.396622,1.000000
160,0.80,0.60,0.20,0.40,0.70,0.70,0.80,0.620098,0.572638,0.700000,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.452708,0.423286,0.396622,0.510897


#### Train Test Split 

In [6]:
df_train,df_test = train_test_split(df,test_size=tt_ratio,shuffle=True,random_state=42)

### Implementation 
1. Find similar user based on ratings on common items 
2. Identify items rated by similar users for the user in question 
3. Calculated weighted average scores 
4. Rank them 

### Thresholds 

In [7]:
similarity_threshold = 0

#### Code Starts Here

##### Step 1 

In [8]:
df_test

Unnamed: 0,Data Structures and Algorithms,Computer Architecture\n,Discrete Mathematics\n,Economics,Programming-2,Machine Learning\n,Mathematics For Machine Learning\n,Visual Recognition\n,Natural Language Processing\n,Reinforcement Learning,...,Digital CMOS VLSI Design,System design with FPGA,ASIC design,VLSI Architecture Design\n,High level synthesis and optimization of Digital Circuits,Digital Sociology,Privacy in the Digital Age,Technology Ethics and AI,Techno-economics of networks,The Web and the Mind
158,1.0,0.0,0.0,0.7,0.7,0.0,0.0,0.0,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.452708,0.423286,0.396622,0.2
109,1.0,1.0,0.0,1.0,0.5,0.75,0.8,0.620098,0.572638,0.603495,...,0.5,1.0,0.7,0.5,0.168056,0.318243,0.452708,0.423286,0.396622,0.510897
131,0.9,0.7,0.9,0.85,0.8,0.6,0.7,0.7,0.5,0.6,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.452708,0.423286,0.9,0.510897
55,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.572638,0.603495,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
94,0.8,0.3,0.7,0.4,0.9,1.0,1.0,0.6,0.8,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.6
29,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,1.0,0.8,0.5,0.3,0.8,1.0,1.0,1.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.3
51,1.0,1.0,0.6,0.4,1.0,1.0,1.0,1.0,1.0,1.0,...,0.5,0.5,0.4,0.2,0.3,0.4,0.5,1.0,0.3,1.0
100,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
testpoint = np.array([1.0,	0.0,	0.0,	0.70,	0.7	])

In [10]:
#find cosine similarity between core courses of 2 users. One in train system and one in test 
def find_similar_users(users:np.array,testpoint:np.array)->np.array:
    val = []
    for user in users :
        mean = np.mean(user)
        similarity = cosine_similarity(user[0:5].reshape(1,-1)-mean,testpoint.reshape(1,-1))
        val.append(similarity)
 
    return val

In [11]:
users = df_train.to_numpy()
len(users)

153

In [12]:
cosine_similarity_values = find_similar_users(users,testpoint)

In [13]:
cosine_similarity_values[0]

array([[0.54152376]])

In [14]:
def make_user_similarity_map(user,cosine_similarity_values):
    return {
          tuple(user[i]):cosine_similarity_values[i]
            for i in range(len(user))}

In [15]:
us_map = make_user_similarity_map(users,cosine_similarity_values)

def ranked_users(users,map):
    #check with all users which is most similar to given and rank them 
    rank = sorted(users,key=lambda x: map[tuple(x)],reverse=True)
    return rank 

#### Step 2-4 

In [16]:
def get_ratings(users,testpoint,map): 
    rank = ranked_users(users,map)
    predicted_ratings = np.zeros(20)
    count = 0 
    for i in range(len(rank)):
        if(map[tuple(rank[i])][0][0]>similarity_threshold):
            predicted_ratings = np.add(predicted_ratings,map[tuple(rank[i])][0][0]*rank[i][5:])
            count+=1 
    predicted_ratings=predicted_ratings/count
    mean = np.ones(20)*np.mean(testpoint)
    predicted_ratings = np.add(predicted_ratings,mean)
    return predicted_ratings

In [17]:
testpoint

array([1. , 0. , 0. , 0.7, 0.7])

In [18]:
df_test

Unnamed: 0,Data Structures and Algorithms,Computer Architecture\n,Discrete Mathematics\n,Economics,Programming-2,Machine Learning\n,Mathematics For Machine Learning\n,Visual Recognition\n,Natural Language Processing\n,Reinforcement Learning,...,Digital CMOS VLSI Design,System design with FPGA,ASIC design,VLSI Architecture Design\n,High level synthesis and optimization of Digital Circuits,Digital Sociology,Privacy in the Digital Age,Technology Ethics and AI,Techno-economics of networks,The Web and the Mind
158,1.0,0.0,0.0,0.7,0.7,0.0,0.0,0.0,0.572638,0.603495,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.452708,0.423286,0.396622,0.2
109,1.0,1.0,0.0,1.0,0.5,0.75,0.8,0.620098,0.572638,0.603495,...,0.5,1.0,0.7,0.5,0.168056,0.318243,0.452708,0.423286,0.396622,0.510897
131,0.9,0.7,0.9,0.85,0.8,0.6,0.7,0.7,0.5,0.6,...,0.251538,0.246316,0.214805,0.234868,0.168056,0.318243,0.452708,0.423286,0.9,0.510897
55,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.572638,0.603495,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
94,0.8,0.3,0.7,0.4,0.9,1.0,1.0,0.6,0.8,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.6
29,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,1.0,0.8,0.5,0.3,0.8,1.0,1.0,1.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.3
51,1.0,1.0,0.6,0.4,1.0,1.0,1.0,1.0,1.0,1.0,...,0.5,0.5,0.4,0.2,0.3,0.4,0.5,1.0,0.3,1.0
100,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
ratings = get_ratings(users,testpoint,us_map)

In [20]:
names=df.columns 

In [21]:
def top_n_reccomendations(ratings,names,n=5):
    temp = {names[5:][i]:ratings[i] for i in range(20)}
    name = sorted(names[5:],key=lambda x: temp[x],reverse=True)
    return name[:n]

top_n_reccomendations(ratings,names)

['Machine Learning\n',
 'Mathematics For Machine Learning\n',
 'Software Production Engineering\n',
 'Visual Recognition\n',
 'Data Visualization\n']

In [22]:
def accuracy(df_test,n=5):
    test_set = df_test.to_numpy()
    correct = 0 
    total = 0 
    for user in test_set:
        core = user[0:5]
        csv = find_similar_users(users,core)
        us_map = make_user_similarity_map(users,csv)
        ratings = get_ratings(users,core,us_map)
        names = df_test.columns
        rec = top_n_reccomendations(ratings,names,n)
        elective = user[5:]
        actual_ratings = sorted(elective,reverse=True)
        actual_rec = top_n_reccomendations(actual_ratings,names,n)
        for i in range(len(actual_rec)):
            for j in range(len(actual_rec)):
                if(rec[i]==actual_rec[j]):
                    correct+=1
                    break 
        total+=n           
    return correct/total 

In [23]:
100*accuracy(df_test)

62.22222222222222