In [1]:
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import urllib.request
import zipfile

def download_data():
    """ DONE. Download and unzip data.
    """
    url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    zfile = zipfile.ZipFile('ml-latest-small.zip')
    zfile.extractall()
    zfile.close()


def tokenize_string(my_string):
    """ DONE. You should use this in your tokenize function.
    """
    return re.findall('[\w\-]+', my_string.lower())



In [14]:
download_data()
path = 'ml-latest-small'
ratings = pd.read_csv(path + os.path.sep + 'ratings.csv')
movies = pd.read_csv(path + os.path.sep + 'movies.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [13]:
movies.sort_values('title').head(100)

Unnamed: 0,movieId,title,genres
6647,51372,"""Great Performances"" Cats (1998)",Musical
7460,74486,$9.99 (2008),Animation
8169,97757,'Hellboy': The Seeds of Creation (2004),Action|Adventure|Comedy|Documentary|Fantasy
7280,70121,'Neath the Arizona Skies (1934),Western
5821,26564,'Round Midnight (1986),Drama|Musical
5938,27751,'Salem's Lot (2004),Drama|Horror|Mystery|Thriller
643,779,'Til There Was You (1997),Drama|Romance
1632,2072,"'burbs, The (1989)",Comedy
2504,3112,'night Mother (1986),Drama
7269,69757,(500) Days of Summer (2009),Comedy|Drama|Romance


In [5]:
len(ratings)

100004

In [6]:
len(movies)

9125

In [8]:
def tokenize(movies):
    """
    Append a new column to the movies DataFrame with header 'tokens'.
    This will contain a list of strings, one per token, extracted
    from the 'genre' field of each movie. Use the tokenize_string method above.

    Note: you may modify the movies parameter directly; no need to make
    a new copy.
    Params:
      movies...The movies DataFrame
    Returns:
      The movies DataFrame, augmented to include a new column called 'tokens'.

    >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
    >>> movies = tokenize(movies)
    >>> movies['tokens'].tolist()
    [['horror', 'romance'], ['sci-fi']]
    """
    ###TODO
    movies['tokens'] = movies['genres'].apply(lambda x: tokenize_string(x))
    return movies


In [15]:

movies = tokenize(movies)
movies.head()

Unnamed: 0,movieId,title,genres,tokens
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[adventure, children, fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[comedy, romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[comedy, drama, romance]"
4,5,Father of the Bride Part II (1995),Comedy,[comedy]


In [127]:
def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i

    Params:
      movies...The movies DataFrame
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int. Make sure the vocab is sorted alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    ###TODO
    
    vocab = defaultdict(lambda:len(vocab))
    tokens_list = []
    #Created the vocabulary
    for tokens in movies['tokens']:
        tokens_list += tokens
    for tokens in sorted(tokens_list):
        vocab[tokens]
    N = len(movies)
    df = defaultdict(int)
    #We need to create a dictionary for each term with frequency
    for term in vocab.keys():
        for tokens in movies['tokens']:
            if term in tokens:
                df[term] += 1
    #freq of term i in document d
    tf = defaultdict(lambda:Counter())
    for i, tokens in enumerate(movies['tokens']):
        tf[i].update(tokens)
    
    features = []
    for i, tokens in enumerate(movies['tokens']):
        feature_array = np.zeros(shape = (1, len(vocab)))
        for token in tokens:
            #print(token)
            feature_array[0, vocab[token]] = tf[i][token] / tf[i].most_common(1)[0][1] * math.log10(N / df[token])
        features.append(csr_matrix(feature_array))
    movies['features'] = features
    return movies , df
        
    
        


In [128]:
movies_example = movies.loc[:10]

In [129]:
movies_example

Unnamed: 0,movieId,title,genres,tokens
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[adventure, children, fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[comedy, romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[comedy, drama, romance]"
4,5,Father of the Bride Part II (1995),Comedy,[comedy]
5,6,Heat (1995),Action|Crime|Thriller,"[action, crime, thriller]"
6,7,Sabrina (1995),Comedy|Romance,"[comedy, romance]"
7,8,Tom and Huck (1995),Adventure|Children,"[adventure, children]"
8,9,Sudden Death (1995),Action,[action]
9,10,GoldenEye (1995),Action|Adventure|Thriller,"[action, adventure, thriller]"


In [130]:
movies_example_V1 , df= featurize(movies_example)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [124]:
movies_example_V1['features'][0].toarray()

array([[0.        , 0.43933269, 1.04139269, 0.56427143, 0.26324143,
        0.        , 0.        , 0.74036269, 0.        , 0.        ]])

In [134]:
sorted(df.items(), key = lambda x : x[1])

[('animation', 1),
 ('crime', 1),
 ('drama', 2),
 ('fantasy', 2),
 ('thriller', 2),
 ('action', 3),
 ('children', 3),
 ('adventure', 4),
 ('romance', 4),
 ('comedy', 6)]

In [132]:
vocab

defaultdict(<function __main__.featurize.<locals>.<lambda>>,
            {'action': 0,
             'adventure': 1,
             'animation': 2,
             'children': 3,
             'comedy': 4,
             'crime': 5,
             'drama': 6,
             'fantasy': 7,
             'romance': 8,
             'thriller': 9})