## Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Dataset

In [2]:
# Read the three CSV inputs: film metadata, the tag vocabulary, and the
# per-film tag relevance scores.
movies, tags, scores = (
    pd.read_csv(csv_name)
    for csv_name in ("movie.csv", "genome_tags.csv", "genome_scores.csv")
)

In [3]:
# Compute the counts first, then report them.
n_movies = len(movies["movieId"].unique())
n_tags = len(tags["tagId"].unique())
n_scores = len(scores)

print('Jumlah data film yang tersedia: ', n_movies)
print('Jumlah tag unik: ', n_tags)
print('Jumlah skor relevansi: ', n_scores)

Jumlah data film yang tersedia:  27278
Jumlah tag unik:  1128
Jumlah skor relevansi:  11709768


## EDA
Tahap eksplorasi penting untuk memahami variabel-variabel pada data serta korelasi antar variabel. Pemahaman terhadap variabel pada data dan korelasinya akan membantu kita dalam menentukan pendekatan atau algoritma yang cocok untuk data kita.

In [4]:
# Column names, dtypes and non-null counts of the film table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [5]:
# Preview the first rows of the film table (movieId, title, '|'-separated genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Column names, dtypes and non-null counts of the tag vocabulary table.
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB


In [7]:
# Column names, dtypes and memory footprint of the tag relevance table.
scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 268.0 MB


## Data Preparation
Pada tahap data preparation, dilakukan penggabungan dan transformasi data tag agar setiap film memiliki representasi teks yang merefleksikan kontennya berdasarkan tag relevan, yang kemudian digunakan untuk proses vektorisasi dan perhitungan kemiripan.


In [8]:
# Report the per-column null count of every input table.
print("\nCek missing value:")
for label, frame in (("Movies:", movies), ("Tags:", tags), ("Scores:", scores)):
    print(label)
    print(frame.isnull().sum())


Cek missing value:
Movies:
movieId    0
title      0
genres     0
dtype: int64
Tags:
tagId    0
tag      0
dtype: int64
Scores:
movieId      0
tagId        0
relevance    0
dtype: int64


### Mengecek apakah ada nilai kosong (missing value) dalam masing-masing dataset. Langkah ini penting sebelum proses pemodelan. Di sini tidak terdapat missing value pada ketiga dataset.

In [9]:
# Attach the tag text to every (movieId, tagId, relevance) row via tagId.
tag_data = pd.merge(scores, tags, on="tagId")

### Menggabungkan scores dan tags berdasarkan tagId sehingga setiap baris menyertakan movieId, tag, dan relevance. Ini adalah fondasi untuk representasi konten film.

In [10]:
# Number of highest-relevance tags kept per film.
TOP_N_TAGS = 20

# Build one text string per film from its TOP_N_TAGS most relevant tags.
# The table is sorted once (film ascending, relevance descending) so the first
# TOP_N_TAGS rows of each group are that film's top tags. This avoids running a
# Python-level sort per film inside groupby.apply, which is slow on ~11.7M rows
# and makes apply touch the grouping column (deprecated in recent pandas).
# Ties in relevance keep their original row order, so the result is deterministic.
film_tag_repr = (
    tag_data
    .sort_values(["movieId", "relevance"], ascending=[True, False])
    .groupby("movieId")
    .head(TOP_N_TAGS)
    .groupby("movieId")["tag"]
    .agg(" ".join)
    .reset_index(name="tags")  # columns: movieId, tags
)

### Membuat representasi teks (tag) untuk setiap film:
- Mengambil 20 tag teratas berdasarkan relevansi.
- Menggabungkannya menjadi satu string per film.
- Output: DataFrame dengan movieId dan teks tags.

In [11]:
# Attach each film's tag text to the film table.
# A "tags" column left over from a previous run of this cell is dropped first,
# so re-running does not create tags_x / tags_y and break movies['tags'] later.
# The inner join keeps only films that have tag text; validate= fails fast if
# movieId is unexpectedly duplicated on either side.
movies = (
    movies
    .drop(columns="tags", errors="ignore")
    .merge(film_tag_repr, on="movieId", how="inner", validate="one_to_one")
)

### Menambahkan kolom tags hasil dari proses sebelumnya ke dalam DataFrame movies. Sekarang setiap film memiliki deskripsi berbasis tag.



In [12]:
# Turn each film's tag text into a TF-IDF vector, dropping scikit-learn's
# built-in English stop words. Row i of tfidf_matrix corresponds to row i of
# `movies`.
# NOTE(review): the tags were joined with plain spaces, so multi-word tags are
# presumably tokenized into separate words here — confirm that is intended.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies['tags'])

### Mengubah kolom tags menjadi matriks TF-IDF (representasi numerik) yang akan digunakan untuk menghitung kemiripan antar film.

In [13]:
# Pairwise cosine similarity between every pair of films; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [15]:
# Label both axes of the similarity matrix with film titles and preview the
# top-left 5x5 corner.
title_labels = movies['title']
cosine_df = pd.DataFrame(data=cosine_sim, index=title_labels, columns=title_labels)
cosine_df.head(5).iloc[:, :5]

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),1.0,0.191271,0.036026,0.059826,0.050819
Jumanji (1995),0.191271,1.0,0.037201,0.037412,0.084219
Grumpier Old Men (1995),0.036026,0.037201,1.0,0.041308,0.413571
Waiting to Exhale (1995),0.059826,0.037412,0.041308,1.0,0.248635
Father of the Bride Part II (1995),0.050819,0.084219,0.413571,0.248635,1.0


### Menghitung matriks kemiripan kosinus antar semua film berdasarkan vektor TF-IDF mereka. Semakin besar nilai, semakin mirip dua film tersebut.

## Membuat Fungsi Rekomendasi
### Fungsi utama sistem rekomendasi:
- Menerima judul film dan jumlah rekomendasi.
- Mencari film paling mirip berdasarkan nilai kemiripan kosinus.

In [16]:
# Map each title to its row label in `movies` (used to index cosine_sim).
# Series.drop_duplicates() compares *values* (the row labels, which are always
# unique), so it never removed repeated titles. De-duplicate on the index
# instead, keeping the first occurrence, so indices[title] is always a single
# integer rather than a Series.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def rekomendasikan_film(judul, n=5):
    """Return the titles of the ``n`` films most similar to ``judul``.

    Similarity is read from the module-level ``cosine_sim`` matrix; the row to
    use is looked up through the module-level ``indices`` Series, and titles
    are taken from ``movies['title']`` by position.

    Parameters
    ----------
    judul : str
        Exact title of the query film.
    n : int, default 5
        Maximum number of recommendations; values <= 0 yield an empty list.

    Returns
    -------
    list of str
        Titles ordered from most to least similar, never including the query
        film itself.
    str
        An explanatory message when ``judul`` is not present in ``indices``.
    """
    if judul not in indices:
        return f"Film '{judul}' tidak ditemukan dalam dataset."

    idx = indices[judul]
    if isinstance(idx, pd.Series):
        # Several films share this title: fall back to the first one instead of
        # indexing cosine_sim with a whole Series.
        idx = idx.iloc[0]
    idx = int(idx)

    if n <= 0:
        return []

    similarities = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: equal scores keep ascending row order, matching
    # what a stable sorted(..., reverse=True) produced.
    ranked = np.argsort(-similarities, kind="stable")
    # Drop the query film by position rather than assuming it sorts first; a
    # film with an identical tag vector can tie with it at the top.
    ranked = ranked[ranked != idx][:n]
    return movies['title'].iloc[ranked].tolist()


In [17]:
# Smoke-test the recommender on a single well-known title.
rekomendasi_toy_story = rekomendasikan_film("Toy Story (1995)")
print("Rekomendasi untuk 'Toy Story (1995)':")
print(rekomendasi_toy_story)

Rekomendasi untuk 'Toy Story (1995)':
['Monsters, Inc. (2001)', 'Toy Story 2 (1999)', "Bug's Life, A (1998)", 'Toy Story 3 (2010)', 'Incredibles, The (2004)']


### Menguji fungsi rekomendasi dengan film “Toy Story (1995)” untuk melihat film-film yang paling mirip berdasarkan kontennya.



## Evaluasi
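- Evaluasi dilakukan secara kualitatif dengan melihat apakah hasil rekomendasi memang mirip secara konten.
- Uji coba beberapa film dan lihat apakah genre/tag-nya relevan terhadap input.
- Selain itu, dihitung Precision@K berbasis genre sebagai ukuran kuantitatif: sebuah rekomendasi dianggap relevan jika memiliki minimal satu genre yang sama dengan film input.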

In [28]:
# Preview only the two columns the genre-based evaluation relies on.
movies.head()[['title', 'genres']]


Unnamed: 0,title,genres
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy


In [29]:
def _genre_set(genre_string):
    """Split a '|'-separated genre string into a set of lower-cased genre names."""
    return set(genre_string.lower().split('|'))


# One set of genres per film, used for overlap checks during evaluation.
movies['genre_set'] = movies['genres'].map(_genre_set)


In [36]:
def evaluate_precision_at_k(top_k=5, sample_size=100):
    """Estimate Precision@K of the recommender using genre overlap as relevance.

    A fixed-seed random sample of films is drawn from the module-level
    ``movies`` frame. For each sampled film, ``rekomendasikan_film`` is asked
    for ``top_k`` titles, and a recommendation counts as relevant when it
    shares at least one genre with the sampled film. Per-film precision is
    ``relevant / top_k``; the mean over all evaluated films is printed and
    returned.

    Parameters
    ----------
    top_k : int, default 5
        Number of recommendations requested per film; must be positive.
    sample_size : int, default 100
        Number of films to sample; capped at the number of available films.

    Returns
    -------
    float
        Mean precision, or ``nan`` when no film could be evaluated.
    """
    # Build the title -> genre-set lookup once instead of scanning the whole
    # frame for every recommended title. setdefault keeps the first row of a
    # repeated title, which is what the former `.values[0]` lookup selected.
    genres_by_title = {}
    for title, genre_set in zip(movies['title'], movies['genre_set']):
        genres_by_title.setdefault(title, genre_set)

    # DataFrame.sample raises when asked for more rows than exist, so cap it.
    sampled = movies.sample(min(sample_size, len(movies)), random_state=42)

    total_precision = 0.0
    count = 0
    for judul, true_genres in zip(sampled['title'], sampled['genre_set']):
        rekomendasi = rekomendasikan_film(judul, n=top_k)

        if isinstance(rekomendasi, str):  # a string means the film was not found
            continue

        overlap_count = sum(
            1
            for rec_title in rekomendasi
            if true_genres & genres_by_title.get(rec_title, set())
        )

        total_precision += overlap_count / top_k
        count += 1

    # With nothing evaluated the mean is undefined; report nan instead of
    # dividing by zero.
    avg_precision = total_precision / count if count else float("nan")
    print(f"Average Precision@{top_k} (based on genre overlap): {avg_precision:.4f}")
    return avg_precision


In [37]:
# Genre-overlap Precision@5 over a fixed-seed sample of 100 films.
evaluate_precision_at_k(top_k=5, sample_size=100)


Average Precision@5 (based on genre overlap): 0.8520


0.8519999999999998

In [38]:
# Qualitative check: print the recommendations for a few hand-picked titles.
test_titles = ["Toy Story (1995)", "Jumanji (1995)", "Heat (1995)"]

for title in test_titles:
    hasil_rekomendasi = rekomendasikan_film(title)
    print(f"\nRekomendasi untuk '{title}':")
    print(hasil_rekomendasi)


Rekomendasi untuk 'Toy Story (1995)':
['Monsters, Inc. (2001)', 'Toy Story 2 (1999)', "Bug's Life, A (1998)", 'Toy Story 3 (2010)', 'Incredibles, The (2004)']

Rekomendasi untuk 'Jumanji (1995)':
['Super Mario Bros. (1993)', 'In the Name of the King: A Dungeon Siege Tale (2008)', 'Prince of Persia: The Sands of Time (2010)', 'Resident Evil: Degeneration (Baiohazâdo: Dijenerêshon) (2008)', 'Mortal Kombat (1995)']

Rekomendasi untuk 'Heat (1995)':
['Boyz N the Hood (1991)', 'Town, The (2010)', 'Crash (2004)', 'Die Hard (1988)', 'Takers (2010)']
