-
Notifications
You must be signed in to change notification settings - Fork 0
/
MovieRecommander.R
91 lines (72 loc) · 2.51 KB
/
MovieRecommander.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Install only the required packages that are not yet available, so that
# re-running the script does not re-download everything (and still works
# offline once the packages are present).
required_packages <- c(
  "tidyverse",
  "ggplot2",
  "caret",
  "data.table",
  "lubridate",
  "dplyr",
  "ggthemes",
  "scales",
  "matrixStats" # used for matrix factorization
)
# system.file() returns "" for a package that is not installed.
package_available <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- required_packages[!package_available]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(dplyr) # we used for left_join
library(stringr)
library(lubridate)# we used
library(ggplot2)
library(matrixStats)
# MovieLens small dataset:
# https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# NOTE(review): `ratings` and `movies` are not created anywhere in this script;
# presumably they are read from ratings.csv / movies.csv of the archive above
# before this point -- confirm and add the loading step if needed.
if (!exists("ratings") || !exists("movies")) {
  stop(
    "Objects `ratings` and `movies` must be loaded before running this script.",
    call. = FALSE
  )
}

# One row per rating, enriched with the columns of the rated movie.
movielens <- left_join(ratings, movies, by = "movieId")

# Fixed seed so the split is reproducible; sample.kind = "Rounding" keeps the
# pre-R-3.6 sampling behaviour.
set.seed(100, sample.kind = "Rounding")

# 90% training set, 10% test set.
# (The former `as.data.frame(trainSet)` call was removed: it ran before
# `trainSet` existed and its result was discarded.)
sampleIndex <- sample(
  seq_len(nrow(movielens)),
  size = floor(0.9 * nrow(movielens))
)
trainSet <- movielens[sampleIndex, ]
testSet <- movielens[-sampleIndex, ]
# Quick look at the training data: structure, dimensions and first rows.
str(trainSet)
dim(trainSet) # original note: 90752 rows and 6 columns
head(trainSet)

# Number of ratings per genre combination.
summarise(group_by(trainSet, genres), n = n())

# Movies classified in more than one genre: count the "|" separators in each
# genre string and list the combinations with the most separators first.
tibble(
  count = str_count(trainSet$genres, fixed("|")),
  genres = trainSet$genres
) %>%
  group_by(count, genres) %>%
  summarise(n = n()) %>%
  arrange(desc(count)) %>%
  head(n = 6)
# Time span covered by the ratings (about 22.5 years per the original note):
# first and last rating date plus the elapsed duration between them.
tibble(
  `Initial Date` = date(
    as_datetime(min(trainSet$timestamp), origin = "1970-01-01")
  ),
  `Final Date` = date(
    as_datetime(max(trainSet$timestamp), origin = "1970-01-01")
  )
) %>%
  mutate(Period = duration(diff(range(trainSet$timestamp))))

# Training data with the calendar year of each rating, used for plotting.
visual <- mutate(
  trainSet,
  year = year(as_datetime(timestamp, origin = "1970-01-01"))
)

# Histogram of the rating years.
hist(
  x = visual$year,
  main = "Rating Distribution Per Year",
  xlab = "Year",
  ylab = "Number of ratings"
)
# Most active users: number of ratings per user, largest count first.
# (Sorting must be descending; an ascending sort lists the least active users.)
trainSet %>%
  group_by(userId) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head()

# Frequency of each rating value.
hist(
  trainSet$rating,
  main = "Frequency of ratings",
  xlab = "Rates"
)
head(trainSet)
library(caret) # provides R2(), RMSE() and MAE() used below

# Linear regression of rating on movieId, fitted on the training set.
model1 <- lm(formula = rating ~ movieId, data = trainSet)
model1
summary(model1)

# Predicted ratings for the held-out test set.
predictions <- predict(object = model1, newdata = testSet)
predictions

# Difference between predictions and real values.
R2(pred = predictions, obs = testSet$rating)
RMSE(pred = predictions, obs = testSet$rating)
MAE(pred = predictions, obs = testSet$rating)

# Distribution of the observed test ratings, for comparison with the errors.
summary(testSet$rating)
# Recommend up to five random movies from the genre combination that user 62
# rated most often in the training set.
df_user62 <- trainSet[trainSet$userId == 62, ]
if (nrow(df_user62) == 0) {
  stop("User 62 has no ratings in `trainSet`.", call. = FALSE)
}

# Number of ratings this user gave per genre combination.
df_user62 %>%
  group_by(genres) %>%
  summarise(n = n())

# Same counts as a table, most frequent genre combination first.
gentabl <- table(df_user62$genres)
popular_genre <- sort(gentabl, decreasing = TRUE)
popular_genre

# Take the top genre as a plain string from the table names, so the comparison
# below is character-to-character rather than factor-to-character.
top_genre <- names(popular_genre)[1]
popular_genre <- as.data.frame(popular_genre)

# All movies carrying exactly that genre combination.
pop <- movies[movies$genres == top_genre, ]

# Draw at most five of them; capping the size avoids an error when fewer than
# five movies match.
pop[sample.int(nrow(pop), size = min(5, nrow(pop))), ]