## Introduction to Dataset

In [None]:
# Importing libraries
from itertools import combinations

import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Reading the dataset
# Load the movie catalogue and the user ratings from CSV files in the working directory.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

# Print the column summary of both tables, separated by blank lines.
ratings.info()
print('\n')
movies.info()

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

## Data Preprocessing

In [None]:
# Reshape the ratings into a user x movie matrix: one row per userId,
# one column per movieId, each cell holding that user's rating for that movie.
final_dataset = ratings.pivot(index='userId',columns='movieId',values='rating')

In [None]:
# Display the user x movie rating matrix.
final_dataset

In [None]:
# Replace missing ratings with 0 (in place), so unrated movies fall below
# the 3.5 cut-off applied by hot_encode later on.
final_dataset.fillna(0,inplace=True)

In [None]:
# Number of ratings each movie received (indexed by movieId).
no_user_voted = ratings.groupby('movieId')['rating'].count()

# Number of ratings each user gave (indexed by userId).
no_movies_voted = ratings.groupby('userId')['rating'].count()

In [None]:
# Keep only the movies that received more than 10 ratings.
final_dataset = final_dataset.loc[:, no_user_voted[no_user_voted > 10].index]

In [None]:
# Keep only the users who gave more than 50 ratings
# (the count comes from the full ratings table, not from the filtered matrix).
final_dataset = final_dataset.loc[no_movies_voted[no_movies_voted > 50].index, :]

In [None]:
# Display the matrix after the movie and user filters.
final_dataset

In [None]:
def hot_encode(x):
    """Binarise a rating: return 0 when it is below 3.5, otherwise 1.

    Any value for which ``x < 3.5`` is false maps to 1; that includes NaN,
    because comparisons with NaN are always false.
    """
    return 0 if x < 3.5 else 1

In [None]:
# Binarise the matrix: 1 where the rating is at least 3.5, otherwise 0
# (the same cut-off as hot_encode). A vectorised comparison is used because
# DataFrame.applymap is deprecated in recent pandas and calls Python once per cell.
# Missing values compare as False and become 0, i.e. "not liked".
final_dataset = (final_dataset >= 3.5).astype(int)

In [None]:
# Display the binarised (liked / not liked) matrix.
final_dataset

In [None]:
# Map every movieId that is still a column of the matrix to its title.
# The id -> title lookup is built once instead of filtering `movies` for each column;
# if a movieId appears more than once in `movies`, its first title is used.
titleById = movies.drop_duplicates(subset="movieId").set_index("movieId")["title"]
movieIdToName = {mid: titleById.loc[mid] for mid in final_dataset.columns}

In [None]:
# Print the first five movieId -> title pairs as a sanity check.
for movieId, movieName in list(movieIdToName.items())[:5]:
    print(f"{movieId} -> {movieName}")

In [None]:
# Build one transaction per user: the list of movieIds whose cell is non-zero
# (i.e. the movies that user liked), in column order.
# Rows are read from the underlying array because the per-cell chained lookup
# final_dataset[j][i] is very slow on a wide matrix.
movieIds = final_dataset.columns.tolist()
finalLst = [
    [mid for mid, liked in zip(movieIds, row) if liked]
    for row in final_dataset.to_numpy()
]

In [None]:
# Show the liked movieIds of the first user as a sanity check.
print(finalLst[0])

In [None]:
# storing data to file
with open("dataset.txt", "w") as fp:
    for lst in finalLst:
        for x in lst:
            fp.write(str(x))
            fp.write(" ")
        fp.write("\n")

## Manual Implementation

In [None]:
# Every movieId is encoded as a fixed-width numeric string so that an itemset can be
# stored as one concatenated string and split back every `movieIdSize` characters.

# width (in characters) of one encoded movie id
movieIdSize = 6

# offset added to a movieId before it is converted to a string;
# str(movieId + encoder) is exactly movieIdSize characters long only for 0 <= movieId < 900000
encoder = 100000

# Total users (one transaction per user), taken from the data instead of being hard-coded
userCnt = len(finalLst)

In [None]:
# Minimum number of transactions (an absolute count, not a fraction) that must
# contain an itemset for prune() to keep it.
minSupport = 70

In [None]:
def generateKPlus1thSet(itemSet, itemSize=None):
    """Generate the (k+1)-item candidates from a list of k-itemsets (join step).

    Each itemset is a string made of fixed-width encoded ids. Two itemsets are
    joined when everything except their last id is identical; the candidate is
    the first itemset followed by the last id of the second one.

    Parameters:
        itemSet: list of encoded k-itemsets (strings of equal length).
        itemSize: width in characters of one encoded id; defaults to the
            module-level ``movieIdSize``.

    Returns:
        List of encoded (k+1)-itemsets, in the order the pairs are visited
        (each itemset paired with every later one).
    """
    if itemSize is None:
        itemSize = movieIdSize

    candidates = []
    # combinations() yields every pair (earlier, later) in list order
    for first, second in combinations(itemSet, 2):
        # join only itemsets that share their first (k - 1) ids
        if first[:-itemSize] == second[:-itemSize]:
            candidates.append(first + second[-itemSize:])

    return candidates

In [None]:
# Prune step
def prune(Ck, threshold=None):
    """Return the itemsets of ``Ck`` whose count reaches the support threshold.

    Parameters:
        Ck: dict mapping an itemset to the number of transactions containing it.
        threshold: minimum count required; defaults to the module-level
            ``minSupport``.

    Returns:
        List of the qualifying itemsets, in the iteration order of ``Ck``.
    """
    if threshold is None:
        threshold = minSupport

    return [item for item, count in Ck.items() if count >= threshold]

In [None]:
# calculating support for new itemset
def calculateSupport(candidates, transactions=None, itemSize=None, offset=None):
    """Count, for every candidate itemset, the transactions that contain it.

    Parameters:
        candidates: iterable of encoded itemsets (strings made of fixed-width ids).
        transactions: list of transactions, each a list of integer ids;
            defaults to the module-level ``finalLst``.
        itemSize: width in characters of one encoded id; defaults to the
            module-level ``movieIdSize``.
        offset: value added to an id before it is turned into a string;
            defaults to the module-level ``encoder``.

    Returns:
        Dict mapping each distinct candidate (first-seen order) to its count.
        Every candidate gets an entry, 0 when no transaction contains it.
    """
    if transactions is None:
        transactions = finalLst
    if itemSize is None:
        itemSize = movieIdSize
    if offset is None:
        offset = encoder

    # Split each candidate into its encoded ids once, not once per transaction.
    candidateItems = {
        candidate: [candidate[p: p + itemSize] for p in range(0, len(candidate), itemSize)]
        for candidate in candidates
    }
    Ck = dict.fromkeys(candidateItems, 0)

    for line in transactions:
        # a set gives constant-time membership tests for the encoded ids
        encodedLine = {str(x + offset) for x in line}

        for candidate, items in candidateItems.items():
            if encodedLine.issuperset(items):
                Ck[candidate] += 1

    return Ck

In [None]:
# Count how many transactions contain each single movie (keys are encoded ids).
C1 = dict()

for line in finalLst:
    for item in line:
        item = str(item + encoder)
        C1[item] = C1.get(item, 0) + 1

L1 = prune(C1)      # frequent 1-itemsets

print('====================================')
print('     Generating 1 itemset')
print('====================================')

# Start from the frequent 1-itemsets so that `frequentItemset` is always defined,
# even when no larger itemset reaches the support threshold.
frequentItemset = L1
L = generateKPlus1thSet(L1)
# print(L)

k = 2
while L:

    C = calculateSupport(L)

    Lk = prune(C)

    print('     Generating', k, 'itemset')
    print('====================================')

    # No frequent k-itemset: stop and keep the largest frequent itemsets found
    # so far instead of overwriting them with an empty list.
    if not Lk:
        break

    frequentItemset = Lk
    L = generateKPlus1thSet(Lk)

    k += 1

In [None]:
def decoder(frequentItemset):
    """Translate encoded itemsets into lists of movie titles.

    Each itemset string is cut into chunks of ``movieIdSize`` characters; every
    chunk is converted to an integer, reduced by ``encoder`` to recover the
    movieId, and looked up in ``movieIdToName``.

    Returns a list with one list of titles per itemset, in input order.
    """
    movieItemSet = []

    for encodedSet in frequentItemset:
        titles = []
        for start in range(0, len(encodedSet), movieIdSize):
            movieId = int(encodedSet[start: start + movieIdSize]) - encoder
            titles.append(movieIdToName[movieId])
        movieItemSet.append(titles)

    return movieItemSet

In [None]:
# Decode the final itemsets to titles and print them, one title per line,
# with blank lines after each itemset.
frequentItems = decoder(frequentItemset)

print("Final Frequent ItemSets\n\n")

for titles in frequentItems:
    for title in titles:
        print(title)

    print("\n")

In [None]:
# Formatting frequent itemsets to generate association rules.
# association_rules needs the support of each itemset and of every sub-itemset
# it is split into, so a table of single items alone can never yield a rule.
# Expand each final itemset into all of its non-empty sub-itemsets and count
# the support of each of them.
subsets = []
for itemSet in frequentItemset:
    ids = [itemSet[p: p + movieIdSize] for p in range(0, len(itemSet), movieIdSize)]
    for size in range(1, len(ids) + 1):
        subsets.extend("".join(combo) for combo in combinations(ids, size))

# Itemsets can share sub-itemsets: drop duplicates, keeping first-seen order.
subsets = list(dict.fromkeys(subsets))
supportCounts = calculateSupport(subsets)

freqItems = []
for encodedSet in subsets:
    # relative support = transactions containing the itemset / total users
    support = supportCounts.get(encodedSet, 0) / userCnt
    movieNames = frozenset(
        movieIdToName[int(encodedSet[p: p + movieIdSize]) - encoder]
        for p in range(0, len(encodedSet), movieIdSize)
    )
    freqItems.append([support, movieNames])

freqDf = pd.DataFrame(freqItems, columns=["support", "itemsets"])
print(freqDf)

In [None]:
# Derive association rules from the manually built itemset table, filtering on
# the confidence metric with a threshold of 1, then order the rules by
# confidence and lift (both descending) and display them.
rules = association_rules(freqDf, metric ="confidence", min_threshold = 1) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])

rules

## Generating rules from frequent itemsets using mlxtend

In [None]:
# Replace the movieId column labels with movie titles.
# rename() leaves labels that are not in the mapping untouched, so running this
# cell a second time is harmless instead of raising KeyError.
final_dataset = final_dataset.rename(columns=movieIdToName)

In [None]:
# Display the matrix with movie titles as column labels.
final_dataset

In [None]:
# Building the model 
frq_items = apriori(final_dataset, min_support = 0.3, use_colnames = True) 
print(frq_items)

# Collecting the inferred rules in a dataframe 
rules = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False]) 

In [None]:
# Print every rule as "(antecedents) --> (consequents)",
# truncating each movie title to its first 30 characters.
for antecedentSet, consequentSet in zip(rules.antecedents, rules.consequents):
    antecedents = [title[:30] for title in antecedentSet]
    consequents = [title[:30] for title in consequentSet]

    print(f'({",".join(antecedents)}) --> ({",".join(consequents)})\n')

In [None]:
# Recommendation for a particular movie
def getRecommendation(movie):
    similarMovies = []
    for movies in frequentItems:
        if movie in movies:
            similarMovies.extend(movies)
    return similarMovies

In [None]:
# Print the recommendations for one example movie.
movie = 'Star Wars: Episode IV - A New Hope (1977)'
print("The Recommended Movies are\n")
recommended_movies = getRecommendation(movie)
# The loop variable is deliberately not named `movies`: at top level that would
# overwrite the `movies` DataFrame loaded at the start of the notebook.
for title in recommended_movies:
    if title != movie:
        print(title)

## The End