In [None]:
import pandas as pd
import sklearn
import string
import nltk
nltk.download('stopwords')
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


This Jupyter notebook builds tag-based movie recommendations.
The first step is to load the data set and check how often each tag appears.

In [None]:
# Read the zipped tag CSV into a DataFrame.
TAGS_CSV_PATH = "/content/tags.csv.zip"
df = pd.read_csv(TAGS_CSV_PATH)

The most popular tag is sci-fi.

In [None]:
# Number of occurrences of every raw tag; the last expression is displayed.
raw_tag_counts = df['tag'].value_counts()
raw_tag_counts

sci-fi              8330
atmospheric         6516
action              5907
comedy              5702
surreal             5326
                    ... 
teen sleuth            1
evil twins             1
paternity test         1
QVC                    1
cornetto triolgy       1
Name: tag, Length: 73050, dtype: int64

In [None]:
# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)

# Vocabulary of distinct lower-cased tags with punctuation stripped.
# Missing tags (NaN) are dropped first: str(nan) used to leak a phantom
# "nan" entry into the vocabulary.
raw_tags = df['tag'].dropna().astype(str).str.lower().unique()
tags = [t.translate(translator) for t in raw_tags]

# Display the rows whose tag is not a string (e.g. missing values).
df[~df['tag'].map(lambda value: isinstance(value, str))]

Unnamed: 0,userId,movieId,tag,timestamp
860902,121710,33826,,1221450908
976395,141727,123,,1199450867
976396,141727,346,,1199451946
976400,141727,1184,,1199452261
976407,141727,1785,,1199452006
976408,141727,2194,,1199450677
976410,141727,2691,,1199451002
976418,141727,4103,,1199451920
976420,141727,4473,,1199451040
976422,141727,4616,,1199452441


# Stemming
Stemming is used to reduce the number of distinct tags by collapsing inflected forms of a word into a common stem.

In [None]:
def transform(vector):
  """Normalize a single tag value and return its stem.

  Despite the parameter name, `vector` is one tag value (this function is
  applied element-wise with `Series.map`). The value is converted with
  `str()`, punctuation is removed using the notebook-level `translator`
  table, and the result is stemmed with the notebook-level `stemmer`.
  """
  without_punctuation = str(vector).translate(translator)
  return stemmer.stem(without_punctuation)

In [None]:
# Stemmer instance shared by the notebook; transform() reads this name.
stemmer = PorterStemmer()
# Distinct stems over the cleaned tag vocabulary.
stags = {stemmer.stem(tag) for tag in tags}

Tags which appear 200 times or fewer are dropped.

In [None]:
# A stemmed tag must occur more than this many times to be kept.
MIN_TAG_COUNT = 200

# Rows without a tag are removed before normalization; otherwise transform()
# would stringify NaN and count "nan" as if it were a real tag.
# assign() builds new frames instead of mutating the loaded data in place.
normalized = (
  df.dropna(subset=['tag'])
    .assign(tag=lambda d: d['tag'].astype(str).str.lower())
    .assign(ttag=lambda d: d['tag'].map(transform))
)

# Set of stemmed tags that are frequent enough to keep.
ttag_counts = normalized['ttag'].value_counts()
c = set(ttag_counts[ttag_counts > MIN_TAG_COUNT].index)

# `df` is deliberately rebound to the filtered rows because later cells read
# it; `filtered_df` is an independent copy of the same rows (the original
# code obtained it by applying the identical mask a second time).
df = normalized[normalized['ttag'].isin(c)]
filtered_df = df.copy()

# Creation of the movies' and users' profiles

## Movie profile

In [None]:
# Set of stemmed tags attached to each movie, indexed by movieId.
tags_per_movie = filtered_df.groupby('movieId')['ttag'].apply(set)

# Attach the tag set to every row (the overlapping column name gets the
# '_g' suffix), then keep the first row seen for each movie.
movies = (
  filtered_df
  .join(tags_per_movie, on='movieId', rsuffix='_g')
  .drop_duplicates('movieId')
)
movies_profile = movies[['movieId', 'ttag_g']]
print(movies_profile)

         movieId                                             ttag_g
0            260  {intellig, betamax, horror, feelgood, western,...
2           1732  {violenc, humor, funni, marijuana, atmospher, ...
4           7569  {70mm, assassin, so bad its good, submarin, sp...
6         115569  {car chas, intellig, creepi, satir, geniu, dar...
7         115713  {twist end, violenc, intellig, creepi, dystopi...
...          ...                                                ...
1092704   163458                                            {india}
1092707   163460                                            {india}
1092710   163462                                            {india}
1092801   167466                                     {music, drama}
1093357   189169                          {comedi, disabl, robberi}

[38832 rows x 2 columns]


## User profile

In [None]:
# Set of stemmed tags used by each user, indexed by userId.
tags_per_user = filtered_df.groupby('userId')['ttag'].apply(set)

# Attach the tag set to every row (the overlapping column name gets the
# '_u' suffix); the profile keeps the first row seen for each user.
users = filtered_df.join(tags_per_user, on='userId', rsuffix='_u')
users_profile = users.drop_duplicates('userId')[['userId', 'ttag_u']]
print(users_profile)

         userId                                             ttag_u
0             3                                   {scifi, classic}
2             4  {artificial intellig, music, tens, philosoph, ...
16           19  {adventur, anim, postapocalypt, fantasi, hayao...
24           43                                   {clint eastwood}
25           68                                            {music}
...         ...                                                ...
1093336  162462                                     {space, scifi}
1093344  162492                              {epic, classic scifi}
1093346  162495                         {nudity full frontal  not}
1093348  162501                                            {crime}
1093357  162534                          {comedi, disabl, robberi}

[12646 rows x 2 columns]


# Recommendation
The Jaccard index is used to recommend movies.

In [None]:
def jaccard(A, B):
  """Return the Jaccard index |A ∩ B| / |A ∪ B| of two sets.

  Parameters:
    A: a set (anything providing `union` and `intersection`).
    B: a set or other iterable accepted by those methods.

  Returns:
    A float in [0, 1]. When both inputs are empty the index is undefined;
    0.0 is returned so that two empty profiles are never treated as similar
    (the previous version raised ZeroDivisionError in that case).
  """
  union_size = len(A.union(B))
  if union_size == 0:
    return 0.0
  return len(A.intersection(B)) / union_size

## Movie recommendation for every user

In [None]:
# Minimum Jaccard similarity for a (user, movie) pair to be reported.
SIMILARITY_THRESHOLD = 0.5

# Materialize both profiles once as plain (id, tag-set) pairs. The previous
# version called movies_profile.iterrows() inside the user loop, so pandas
# built a new Series for every (user, movie) pair, and the inner loop
# variable `index` shadowed the outer one.
user_profiles = list(zip(users_profile['userId'], users_profile['ttag_u']))
movie_profiles = list(zip(movies_profile['movieId'], movies_profile['ttag_g']))

# NOTE: this is still an all-pairs comparison and prints every match.
for user_id, user_tags in user_profiles:
  for movie_id, movie_tags in movie_profiles:
    if jaccard(user_tags, movie_tags) >= SIMILARITY_THRESHOLD:
      print(user_id, user_tags)
      print(movie_id, movie_tags)
      print("======================")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
58434 {'suspens'}
6912 {'suspens', 'crime'}
64548 {'suspens'}
6912 {'suspens', 'crime'}
68597 {'suspens'}
6912 {'suspens', 'crime'}
69670 {'suspens'}
6912 {'suspens', 'crime'}
74064 {'suspens'}
6912 {'suspens', 'crime'}
78698 {'suspens'}
6912 {'suspens', 'crime'}
78838 {'suspens'}
6912 {'suspens', 'crime'}
80233 {'suspens'}
6912 {'suspens', 'crime'}
81195 {'suspens'}
6912 {'suspens', 'crime'}
82308 {'suspens'}
6912 {'suspens', 'crime'}
83227 {'suspens'}
6912 {'suspens', 'crime'}
91694 {'suspens'}
6912 {'suspens', 'crime'}
95650 {'suspens'}
6912 {'suspens', 'crime'}
95896 {'suspens'}
6912 {'suspens', 'crime'}
95975 {'suspens'}
6912 {'suspens', 'crime'}
97853 {'suspens'}
6912 {'suspens', 'crime'}
98004 {'suspens'}
6912 {'suspens', 'crime'}
98475 {'crime'}
6912 {'suspens', 'crime'}
100906 {'suspens'}
6912 {'suspens', 'crime'}
106401 {'suspens'}
6912 {'suspens', 'crime'}
110975 {'suspens'}
6912 {'suspens', 'crime'}
116130 {'s

KeyboardInterrupt: ignored

## Recommendation for a chosen user

In [None]:
def recomendation_for_user(user, treshold):
  """Print the movies whose tag profile is similar to the given user's.

  A movie is printed when the Jaccard index between the user's tag set and
  the movie's tag set is at least `treshold` and the user has not already
  tagged that movie.

  Parameters:
    user: userId to recommend for.
    treshold: minimum Jaccard index for a movie to be printed.

  Raises:
    IndexError: if `user` has no row in `users_profile`.

  Reads the notebook-level `df`, `users_profile`, `movies_profile` and
  `jaccard`.
  """
  # Movies the user already tagged. The previous check compared the whole
  # list with a single movieId (`f != movieId`), which is always true for
  # plain ints, so tagged movies were never excluded.
  seen_movies = set(df.loc[df['userId'] == user, 'movieId'])

  # The user's tag set does not change per movie, so look it up once.
  user_tags = users_profile.loc[users_profile['userId'] == user, 'ttag_u'].iloc[0]

  for movie_id, movie_tags in zip(movies_profile['movieId'], movies_profile['ttag_g']):
    if movie_id in seen_movies:
      continue
    if jaccard(user_tags, movie_tags) >= treshold:
      print(movie_id, movie_tags)
      print("======================")


Recommendations for selected users at a chosen similarity threshold

In [None]:
# Recommendations for user 19 with a Jaccard threshold of 0.5.
recomendation_for_user(user=19, treshold=0.5)

In [None]:
# Recommendations for user 91 with a looser Jaccard threshold of 0.2.
recomendation_for_user(user=91, treshold=0.2)

356 {'vietnam war', 'histori', 'intellig', 'feel good movi', 'reflect', 'oscar best supporting actor', 'feelgood', 'plot', 'goofi', 'slow', 'comedi', 'new york c', 'good act', 'unpredict', 'mental il', 'clich', 'forest', 'great act', 'good dialogu', 'happy end', 'bechdel testfail', 'drama', 'inspir', 'whimsic', 'bulli', 'gay', 'footbal', 'uplift', 'fun', 'dark comedi', 'memasas movi', 'tumeys dvd', 'biopic', 'vietnam', 'long', 'sweet', 'sentiment', 'heartwarm', 'funni', 'disappoint', 'box', 'act', 'based on novel or book', 'flashback', 'philosoph', 'thoughtprovok', 'poignant', 'based on book', 'biographi', 'action', 'watch', 'friendship', '1970', 'origin', 'oscar best pictur', 'los angel', 'nostalgia', 'beauti', 'interest', 'psycholog', 'steven spielberg', 'famili', 'overr', 'book', 'militari', 'underdog', 'clv', 'family relationship', 'good soundtrack', 'oscar best actor', 'masterpiec', 'humor', 'excellent script', 'cute', 'war', 'emot', 'special effect', 'storytel', '1960', 'silli', 

In [None]:
# Recommendations for user 3 with a Jaccard threshold of 0.5.
recomendation_for_user(user=3, treshold=0.5)

126579 {'scifi'}
5722 {'classic'}
81736 {'classic'}


In [None]:
# Distinct movies that user 3 tagged (within the filtered `df`).
movie_ids_user_3 = df.loc[df['userId'] == 3, 'movieId']
list(movie_ids_user_3.unique())

[260]