# Project 3: Weenkend movie trip
## Import Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt

## Loading the datasets and drop some unused rows
Source:https://grouplens.org/datasets/movielens/

In [2]:
movies = pd.read_csv("Data\\movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv("Data\\ratings.csv")
ratings = ratings.drop(['timestamp','userId'],axis = 1)
ratings = ratings.sort_values(by=['movieId'])
ratings.head()

Unnamed: 0,movieId,rating
0,1,4.0
81531,1,4.0
30517,1,3.5
81082,1,4.0
30601,1,3.0


In [4]:
tags = pd.read_csv("Data\\tags.csv")
tags = tags.drop(['timestamp','userId'],axis = 1)
tags = tags.sort_values(by=['movieId'])
tags['tag'] = preprocessing.LabelEncoder().fit_transform(tags['tag'])
tags.head()

Unnamed: 0,movieId,tag
2886,1,934
981,1,1244
629,1,1244
35,2,446
34,2,1117


## My ideas
The timestamp is not relevent to movie recommendation so I should drop it at the beginning. I can use tag and rating for building model.

## Merge movies, ratings and tag together
### calculate the average rating score for all movies

In [5]:
def AverageRating(rating):
    return ratings[['rating']].where(ratings[['movieId']].values == rating['movieId']).stack().mean()                  

movies['AvgRating'] = movies.apply(AverageRating, axis=1)
movies.head()

Unnamed: 0,movieId,title,genres,AvgRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429


In [6]:
def AverageTags(tag):
    return tags[['tag']].where(tags[['movieId']].values == tag['movieId']).stack().mean()

movies['tag'] = movies.apply(AverageTags, axis=1)
movies.head()

Unnamed: 0,movieId,title,genres,AvgRating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,1140.666667
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,851.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,1186.5
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,
4,5,Father of the Bride Part II (1995),Comedy,3.071429,1294.5


### Double check the empty set

In [7]:
df = movies
df.dropna(inplace=True)
df.head()

Unnamed: 0,movieId,title,genres,AvgRating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,1140.666667
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,851.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,1186.5
4,5,Father of the Bride Part II (1995),Comedy,3.071429,1294.5
6,7,Sabrina (1995),Comedy|Romance,3.185185,1320.0


In [8]:
movies.to_csv('Data\\CompleteMovieData.csv')

### Convert string feature to integer

In [9]:
dataset = df
dataset['genres'] = preprocessing.LabelEncoder().fit_transform(dataset['genres'])
dataset.head()

Unnamed: 0,movieId,title,genres,AvgRating,tag
0,1,Toy Story (1995),115,3.92093,1140.666667
1,2,Jumanji (1995),145,3.431818,851.0
2,3,Grumpier Old Men (1995),259,3.259615,1186.5
4,5,Father of the Bride Part II (1995),219,3.071429,1294.5
6,7,Sabrina (1995),259,3.185185,1320.0


## Training K-Means cluster
I will use K-means cluster to make prediction.

In [10]:
X = dataset.loc[:, ['genres','AvgRating','tag']].values
X

array([[ 115.        ,    3.92093023, 1140.66666667],
       [ 145.        ,    3.43181818,  851.        ],
       [ 259.        ,    3.25961538, 1186.5       ],
       ...,
       [  65.        ,    3.875     ,  699.33333333],
       [  12.        ,    3.9       ,  797.5       ],
       [  48.        ,    3.5       ,  912.25      ]])

### Collect the movies that are satisfy the certain requirements

In [11]:
def recommendation_list(movie):
    target = dataset.loc[dataset['title'] == movie]['cluster'][0]
    recommend = dataset.loc[(dataset['cluster']==target) & (dataset['AvgRating']>=3.8)]
    return pd.DataFrame(recommend['title'].values)

### At beginning, we just choose a random nubmer as k and to see how it works.

In [12]:
kmeans=KMeans(n_clusters=4, random_state= 0).fit_predict(X)
dataset['cluster']=kmeans
dataset.head()

Unnamed: 0,movieId,title,genres,AvgRating,tag,cluster
0,1,Toy Story (1995),115,3.92093,1140.666667,0
1,2,Jumanji (1995),145,3.431818,851.0,0
2,3,Grumpier Old Men (1995),259,3.259615,1186.5,2
4,5,Father of the Bride Part II (1995),219,3.071429,1294.5,2
6,7,Sabrina (1995),259,3.185185,1320.0,2


### Toy Story (1995) K=4
It recommend too much film and a lot films are not related to each other. we should increase the size of k to make the result more accurate.

In [13]:
recommendation_list('Toy Story (1995)')

Unnamed: 0,0
0,Toy Story (1995)
1,"City of Lost Children, The (Cité des enfants p..."
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
3,Braveheart (1995)
4,Apollo 13 (1995)
...,...
214,John Wick: Chapter Two (2017)
215,Logan (2017)
216,Black Mirror: White Christmas (2014)
217,Blade Runner 2049 (2017)


### Toy Story (1995) K = 15
I tried k as 15 at second time and the result is much better than k = 6. I think this is because there are too many differnt genres and they influence the result a lot.

In [14]:
kmeans=KMeans(n_clusters=15, random_state= 0).fit_predict(X)
dataset['cluster']=kmeans
recommendation_list('Toy Story (1995)')

Unnamed: 0,0
0,Toy Story (1995)
1,"City of Lost Children, The (Cité des enfants p..."
2,2001: A Space Odyssey (1968)
3,"African Queen, The (1951)"
4,Aliens (1986)
5,Star Wars: Episode VI - Return of the Jedi (1983)
6,"Grand Day Out with Wallace and Gromit, A (1989)"
7,"Terminator, The (1984)"
8,"Femme Nikita, La (Nikita) (1990)"
9,"French Connection, The (1971)"


### Toy Story (1995) K = 30
When I increase the clusters number to 30, the result is even better than last one. It provide 15 movies which has close relationship with Toy Story. I am satify with this result.

In [15]:
kmeans=KMeans(n_clusters=30, random_state= 0).fit_predict(X)
dataset['cluster']=kmeans
recommendation_list('Toy Story (1995)')

Unnamed: 0,0
0,Toy Story (1995)
1,"African Queen, The (1951)"
2,Star Wars: Episode VI - Return of the Jedi (1983)
3,"Grand Day Out with Wallace and Gromit, A (1989)"
4,"Matrix, The (1999)"
5,"Gold Rush, The (1925)"
6,City of God (Cidade de Deus) (2002)
7,Kill Bill: Vol. 1 (2003)
8,Hero (Ying xiong) (2002)
9,Touching the Void (2003)


## Conclusion
For this project, I loaded three files: movies, ratings and tags. I droped some unused rows and caculate the average data for certain values to prevent duplication. Then I use K-means to pick number of cluster and form centroids based on distance function. It is little bit tricky when we choose the number of clusters. In the end, I obtained pretty good result by increase the number of clusters. 