In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Print the current working directory.
!pwd

/content


# Collaborative Filtering

#### Dataset: MovieLens (open source) - https://grouplens.org/datasets/movielens/latest/

#### Technique: k-nearest neighbours (KNN) with cosine similarity.

In [None]:
# Work inside a dedicated dataset folder. The guard keeps the cell re-runnable:
# os.mkdir raised FileExistsError on a second execution, and repeating the
# chdir would otherwise nest movie_dataset inside itself.
dataset_dir = 'movie_dataset'
if os.path.basename(os.getcwd()) != dataset_dir:
    os.makedirs(dataset_dir, exist_ok=True)
    os.chdir(os.path.join(os.getcwd(), dataset_dir))
# Print the working directory to confirm the chdir above took effect.
!pwd

/content/movie_dataset


In [None]:
# Fetch the MovieLens CSVs when they are not in the working directory yet:
# nothing else in the notebook places them there, so reading them directly
# raised FileNotFoundError. Files that already exist (e.g. uploaded by hand)
# are left untouched.
import urllib.request
from zipfile import ZipFile

# NOTE(review): assumes the small "latest" MovieLens archive is the intended
# variant - confirm against https://grouplens.org/datasets/movielens/latest/
MOVIELENS_URL = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
movielens_files = ('movies.csv', 'ratings.csv')

if not all(os.path.exists(name) for name in movielens_files):
    archive_path, headers = urllib.request.urlretrieve(MOVIELENS_URL)
    with ZipFile(archive_path) as archive:
        for member in archive.namelist():
            # Members may sit in a folder inside the archive; write them flat
            # into the working directory under their bare file name.
            file_name = os.path.basename(member)
            if file_name in movielens_files:
                with open(file_name, 'wb') as target:
                    target.write(archive.read(member))

movies=pd.read_csv('movies.csv')
rating=pd.read_csv('ratings.csv')

FileNotFoundError: ignored

In [None]:
# Preview the first rows of the ratings table.
rating.head()

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# Report the (rows, columns) shape of each raw table.
print(f'ratings:{rating.shape}')
print(f'movies:{movies.shape}')

In [None]:
# Attach the movie metadata to every rating by joining on the shared movieId key.
df = rating.merge(movies, on='movieId')
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the merged frame.
df.info()

In [None]:
# Discard rows without a title, then count how many ratings each title received.
combine_movie_rating = df.dropna(axis=0, subset=['title'])
ratings_per_title = combine_movie_rating.groupby(by=['title'])['rating'].count()
movie_ratingCount = ratings_per_title.rename('totalRatingCount').reset_index()
movie_ratingCount.head()

# Final DataFrame with rating counts

In [None]:
# Left-join the per-title rating counts back onto the individual ratings.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head()

In [None]:
# Summary statistics of the numeric columns.
rating_with_totalRatingCount.describe()

In [None]:
# Render floats with two decimals, then summarise the ratings-per-movie counts.
pd.set_option('display.float_format', '{:.2f}'.format)
rating_count_summary = movie_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

In [None]:
# Minimum number of ratings a movie needs in order to be kept.
popularity_threshold = 50
has_enough_ratings = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = (
    rating_with_totalRatingCount[has_enough_ratings]
    .drop(columns=['genres', 'timestamp'])
)
rating_popular_movie.head()

In [None]:
# (rows, columns) left after the popularity filter.
rating_popular_movie.shape

In [None]:
# Build the title x user rating matrix; combinations without a rating become 0.
rating_pivot = pd.pivot_table(
    rating_popular_movie,
    index='title',
    columns='userId',
    values='rating',
)
movie_features = rating_pivot.fillna(0)
print(movie_features.shape)
movie_features.head()

In [None]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Hold the rating matrix in compressed sparse row form for the neighbour search.
features_matrix = csr_matrix(movie_features.to_numpy())

# Brute-force nearest-neighbour model using the cosine metric.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(features_matrix)

In [None]:
# Pick a random movie (by row position) and query its six nearest neighbours.
n_movies = movie_features.shape[0]
random_movie = np.random.choice(n_movies)
print(random_movie)
query_vector = movie_features.iloc[random_movie, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

In [None]:
# Shape of the distance array returned by kneighbors.
distances.shape

In [None]:
# Collect the neighbours returned by the KNN query into a small table.
# Fresh names are used so that the `movies` DataFrame loaded earlier and the
# built-in `dict` are not overwritten; clobbering them broke re-running the
# earlier cells that still expect `movies` to be a DataFrame.
print('Recommendations for {0}:\n'.format(movie_features.index[random_movie]))
neighbour_positions = indices.flatten()
neighbour_distances = distances.flatten()
display_dataframe = pd.DataFrame({
    'movies': movie_features.index[neighbour_positions],
    'distances': neighbour_distances,
})
display_dataframe.head()

# Content-Based Filtering


In [None]:
from google.colab import files

# Bind the return value to a name: files.upload() returns the uploaded files'
# contents, and leaving the call as the cell's last expression would echo the
# uploaded kaggle.json credentials into the saved notebook output.
uploaded_files = files.upload()

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the credentials file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Print the current working directory.
!pwd

In [None]:
# Download the tmdb/tmdb-movie-metadata dataset with the Kaggle CLI.
!kaggle datasets download -d tmdb/tmdb-movie-metadata

In [None]:
from zipfile import ZipFile

# Open the archive relative to the working directory: the Kaggle CLI saves it
# there by default and extractall() unpacks into the same place, so the cell no
# longer depends on the absolute Colab path /content/movie_dataset.
with ZipFile('tmdb-movie-metadata.zip') as z:
  z.extractall()
  print('Extracted Data')

In [None]:
# Read the two TMDB tables relative to the working directory (where the archive
# is extracted), matching how the MovieLens CSVs are read, instead of relying
# on a hard-coded absolute Colab path.
credits=pd.read_csv('tmdb_5000_credits.csv')
movies_df=pd.read_csv('tmdb_5000_movies.csv')

In [None]:
# Preview the first rows of the credits table.
credits.head()

In [None]:
# Preview the first rows of the TMDB movies table.
movies_df.head()

In [None]:
# Rename the credits key (movie_id -> id) so both tables share a join column,
# then merge them on it.
credits_column_renamed = credits.rename(columns={"movie_id": "id"}, index=str)
movies_df_merge = pd.merge(movies_df, credits_column_renamed, on='id')
movies_df_merge.head()

In [None]:
# Drop a handful of columns from the merged frame before modelling.
columns_to_drop = ['homepage', 'title_x', 'title_y', 'status', 'production_countries']
movies_cleaned_df = movies_df_merge.drop(columns=columns_to_drop)
movies_cleaned_df.head()

In [None]:
# (rows, columns) of the cleaned frame.
movies_cleaned_df.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the cleaned frame.
movies_cleaned_df.info()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over word 1- to 3-grams with accents stripped and English stop words
# removed; terms appearing in fewer than 3 documents are ignored.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    stop_words='english',
)

# Replace missing overviews with empty strings so the column holds no NaN
# values when it is vectorised.
movies_cleaned_df['overview'] = movies_cleaned_df['overview'].fillna('')

In [None]:
# Learn the vocabulary from the overviews and turn them into a TF-IDF matrix.
tfv_matrix = tfv.fit_transform(movies_cleaned_df['overview'])

In [None]:
# Show the TF-IDF matrix together with its shape.
tfv_matrix,tfv_matrix.shape

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel scores between the rows of the TF-IDF matrix.
sig = sigmoid_kernel(X=tfv_matrix, Y=tfv_matrix)

In [None]:
# Map each title to its row label in movies_cleaned_df. De-duplicate on the
# title labels, keeping the first occurrence: Series.drop_duplicates() compares
# the values (the row labels, which are already unique) and so left repeated
# titles in place, making indices[title] return a Series for them.
indices = pd.Series(movies_cleaned_df.index, index=movies_cleaned_df['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Preview the first 15 title -> row entries.
indices[:15]

In [None]:
#sorting sigmoid kernel

In [None]:
# Rank every movie by its kernel score against 'Newlyweds' and show only the
# ten best (position, score) pairs; the full ranking has one entry per movie
# and floods the cell output.
sorted(enumerate(sig[indices['Newlyweds']]), key=lambda x: x[1], reverse=True)[:10]


In [None]:
def recommend(title, sig=sig, top_n=5):
  """Return the titles of the movies most similar to `title`.

  Parameters
  ----------
  title : str
      A value of the `original_title` column, looked up in `indices`.
  sig : array-like, optional
      Pairwise similarity matrix whose rows/columns follow the row order of
      `movies_cleaned_df`. Defaults to the `sig` matrix that exists when this
      function is defined.
  top_n : int, optional
      Number of recommendations to return (default 5, the previous fixed size).

  Returns
  -------
  pandas.Series
      `original_title` values of the `top_n` highest-scoring movies, best
      match first, excluding the queried movie itself.

  Raises
  ------
  KeyError
      If `title` is not present in `indices`.
  """
  if title not in indices.index:
    raise KeyError('Unknown movie title: {!r}'.format(title))

  idx = indices[title]
  # Several movies can share a title, in which case the lookup yields a Series
  # instead of a scalar; fall back to the first match rather than failing later.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]

  # Similarity of the queried movie against every movie.
  scores = np.asarray(sig[idx]).ravel()

  # Stable descending order: ties keep their original row order, exactly like
  # sorted(..., reverse=True) on the enumerated scores.
  ranked = np.argsort(-scores, kind='stable')

  # Drop the queried movie explicitly instead of assuming it is ranked first.
  ranked = ranked[ranked != idx][:top_n]

  return movies_cleaned_df['original_title'].iloc[ranked]



In [None]:
# Example query.
recommend("Pirates of the Caribbean: Dead Man's Chest")