# Basic recommender for MovieLens data

## Imports

In [66]:
import pandas as pd
import numpy as np
import re

from matplotlib import pyplot as plt


## Read data

In [67]:
data_path = 'data/movielens_latest_small/{}.csv'

ratings_data = pd.read_csv(data_path.format('ratings'))
movies_data = pd.read_csv(data_path.format('movies'))
tags_data = pd.read_csv(data_path.format('tags'))
links_data = pd.read_csv(data_path.format('links'))

## Data preprocessing

In [68]:
def get_movie_year(title):
    title_re = re.compile(r'.+[\s,-]\(?(\d\d\d\d)\)')
    year = title_re.search(title)
    if year:
        return year.group(1)
    
def get_movie_title(full_title):
    title_re = re.compile(r'(.+)[\s,-]\(?(\d\d\d\d)\)')
    title = title_re.search(full_title)
    if title:
        return title.group(1)
    
movies_data["year"] = movies_data["title"].apply(get_movie_year).fillna("1990").astype(int)
movies_data["title"] = movies_data["title"].apply(get_movie_title)

movie_genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", 
                "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
for genre in movie_genres:
    movies_data[genre] = movies_data["genres"].apply(lambda x: int(genre in x))

movies_data = movies_data.drop("genres", 1)



In [75]:
ratings_data["time"] = ratings_data["timestamp"].apply(pd.to_datetime, unit='s')
ratings_data["year"] = ratings_data["time"].apply(lambda x: x.year)
ratings_data["month"] = ratings_data["time"].apply(lambda x: x.month)
ratings_data["day"] = ratings_data["time"].apply(lambda x: x.day)
ratings_data["hour"] = ratings_data["time"].apply(lambda x: x.hour)

ratings_data = movies_data.drop("timestamp", 1)
ratings_data = movies_data.drop("time", 1)


In [76]:
print(ratings_data.head())

   userId  movieId  rating  timestamp                time  year  month  day  \
0       1        1       5  847117005 1996-11-04 14:16:45  1996     11    4   
1       1        2       3  847642142 1996-11-10 16:09:02  1996     11   10   
2       1       10       3  847641896 1996-11-10 16:04:56  1996     11   10   
3       1       32       4  847642008 1996-11-10 16:06:48  1996     11   10   
4       1       34       4  847641956 1996-11-10 16:05:56  1996     11   10   

   hour  
0    14  
1    16  
2    16  
3    16  
4    16  


## Collaborative filtering