# **Movie Recommendation with Content-Based Filtering**

### **Import the library**

In [20]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display

### **Prepare the Dataset**

In [2]:
movie_df = pd.read_csv('dataset.csv')

In [3]:
movie_df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [5]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


### **Preprocessing the Data**

In [14]:
#check missing value
missing_values = movie_df.isnull().sum()

for col, missing in missing_values.items():
  if missing > 0:
    print(f"terdapat missing value di kolom '{col}' sebanyak {missing}")

if missing_values.sum() == 0:
  print("Tidak ada missing value")

terdapat missing value di kolom 'genre' sebanyak 3
terdapat missing value di kolom 'overview' sebanyak 13


In [23]:
#drop the missing value

movie_df = movie_df.dropna(subset=["genre", "overview"])

In [25]:
movie_df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genre,0
original_language,0
overview,0
popularity,0
release_date,0
vote_average,0
vote_count,0
features,0


In [26]:
#combine features

movie_df['features'] = movie_df['genre'] + ' ' + movie_df['overview']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df['features'] = movie_df['genre'] + ' ' + movie_df['overview']


### **CreateTF-IDF and Cosine Similarity**

In [27]:
# Compute similarity matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_df['features'])

In [28]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

### **Create input widgets, sliders, and recommendation functions**

In [36]:
# Widget untuk input judul
movie_input = widgets.Text(
    placeholder='Masukkan judul film',
    description='Film:',
    layout=widgets.Layout(width='50%')
)

rating_slider = widgets.FloatSlider(
    value=7.0,
    min=0,
    max=10,
    step=0.1,
    description='Min Rating:',
    continuous_update=False
)

output = widgets.Output()

In [45]:
# Event handler untuk mendapatkan rekomendasi
def get_recommendations():
    title = movie_input.value.lower()
    min_rating = rating_slider.value
    idx = movie_df[movie_df['title'].str.lower() == title].index
    if idx.empty:
        with output:
            output.clear_output()
            print("Film tidak ditemukan. Coba judul lain.")
    else:
        idx = idx[0]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]
        movie_indices = [i[0] for i in sim_scores]
        recommended_movies = movie_df.iloc[movie_indices]
        recommended_movies = recommended_movies[recommended_movies['vote_average'] >= min_rating]
        with output:
            output.clear_output()
            print(f"rekomendasi film dari dengan judul '{title}' adalah : \n")
            if recommended_movies.empty:
                print("Tidak ada film yang sesuai dengan kriteria.")
            else:
                # print("Rekomendasi Film:")
                display(recommended_movies[['title', 'genre', 'vote_average']])


movie_input.observe(lambda change: get_recommendations(), names='value')
rating_slider.observe(lambda change: get_recommendations(), names='value')

display(movie_input, rating_slider, output)

Text(value='Dilwale Dulhania Le Jayenge', description='Film:', layout=Layout(width='50%'), placeholder='Masukk…

FloatSlider(value=8.8, continuous_update=False, description='Min Rating:', max=10.0)

Output()