In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

df = pd.read_csv("movie_data.csv")


In [67]:
# 1. Data Cleaning
    # - Handling Missing Value
print("Missing Values:\n", df.isna().sum())
    # - Handling Duplicate Value
print("\nJumlah Duplikat:", df.duplicated().sum())

# Konversi kolom duration menjadi menit(int)
def convert_duration(d):
    if isinstance(d, str):
        parts = d.split()
        minutes = 0
        for p in parts:
            if "h" in p:
                minutes += int(p.replace("h", "")) * 60
            elif "min" in p:
                minutes += int(p.replace("min", ""))
        return minutes
    return np.nan

df["duration_minutes"] = df["duration"].apply(convert_duration)

# Isi missing value di kolom durasi dengan median
imputer = SimpleImputer(strategy="median")
df["duration_minutes"] = imputer.fit_transform(df[["duration_minutes"]])

    # - Handling outlier pada rating dengan metode IQR
Q1 = df["rating"].quantile(0.25)
Q3 = df["rating"].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df = df[(df["rating"] >= lower) & (df["rating"] <= upper)]

print("\nData setelah cleaning:", df.shape)

Missing Values:
 name           0
year           0
duration       0
genre          0
rating         0
description    0
director       0
stars          0
dtype: int64

Jumlah Duplikat: 0

Data setelah cleaning: (239, 9)


In [68]:
# 2. Normalisasi/Standarisasi
scaler = StandardScaler()
df[["year_scaled", "rating_scaled", "duration_scaled"]] = scaler.fit_transform(
    df[["year", "rating", "duration_minutes"]]
)

In [69]:
# 3. Encoding kategorikal
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
genre_encoded = encoder.fit_transform(df[["genre"]])

# dataframe hasil encoding
genre_encoded_df = pd.DataFrame(genre_encoded, columns=encoder.get_feature_names_out(["genre"]))

#  hasil encoding dengan dataframe utama
df = pd.concat([df.reset_index(drop=True), genre_encoded_df.reset_index(drop=True)], axis=1)

In [70]:
# 4. Feature Engineering

# Menghitung usia film
CURRENT_YEAR = 2025
df['movie_age'] = CURRENT_YEAR - df['year']
df.drop('year', axis=1, inplace=True)
numeric_cols = ['rating', 'duration_min', 'movie_age']

In [71]:
# 5. SPLIT DATA

# Target yang ingin diprediksi adalah rating
X = df.drop(columns=["rating", "duration", "description", "name", "director", "stars"])
y = df["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nShape X_train:", X_train.shape)
print("Shape X_test :", X_test.shape)


Shape X_train: (191, 102)
Shape X_test : (48, 102)


In [72]:
# Dataset hasil preprocessing

df.head()

Unnamed: 0,name,duration,genre,rating,description,director,stars,duration_minutes,year_scaled,rating_scaled,...,genre_Fantasy Horror,genre_Fantasy Horror Thriller,genre_Horror,genre_Horror Mystery,genre_Horror Mystery Sci-Fi,genre_Horror Mystery Thriller,genre_Horror Sci-Fi,genre_Horror Sci-Fi Thriller,genre_Horror Thriller,movie_age
0,Going Overboard,1h 39min,Comedy,1.8,A struggling young comedian takes a menial job...,Valerie Breiman,Adam Sandler Billy Bob Thornton Billy Zane,99.0,-1.706101,-2.760586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36
1,Radhe,1h 53min,Action Thriller,1.8,An honest cop is determined to bring down a co...,Prabhu Deva,Salman Khan Disha Patani Randeep Hooda,113.0,1.145354,-2.760586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,Disaster Movie,1h 27min,Comedy Sci-Fi,1.9,"Over the course of one evening, an unsuspectin...",Jason Friedberg,Carmen Electra Vanessa Lachey Nicole Parker,87.0,-0.013049,-2.613544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17
3,From Justin to Kelly,1h 21min,Comedy Music Musical,1.9,A waitress from Texas and a college student fr...,Robert Iscove,Kelly Clarkson Justin Guarini Katherine Bailess,81.0,-0.458589,-2.613544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
4,Race 3,2h 40min,Action Thriller,1.9,Relationships and loyalties of a criminal fami...,Remo D'Souza,Anil Kapoor Salman Khan Bobby Deol,160.0,0.878031,-2.613544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7


# Kesimpulan Analisis Data
1. Data cleaning
   - Untuk missing values pada nilai null dan duplikat tidak ada
   - Konversi Durasi dari teks "1h 39m" menjadi kolom numerik dengan label duration_minutes dalam numerik
   - Outlier pada rating dihapus menggunakan IQR, setelah pembersihan data berkurang menjadi 239.
2. Normalisasi/Standarisasi
   Kolom numerik year, rating, dan duration_minutes diubah skalanya menggunakan StandardScaler. mentransformasi data memiliki rata" nol dan deviasi  satu. Hasil standarisasi disimpan di kolom year_scaled, rating_scaled, dan duration_scaled.
3. Encoding Kategorikal
    genre yang bersifat kategorikal diubah menjadi representasi numerik menggunakan one-hot encoding
   - setiap kategori unik dalam kolom genre(misalnya "Action Thriller") menjadi kolom biner baru.
   - Nilai 1.0 berarti genre tersebut terdapat pada film dan 0.0 menandakan ketidakberadaan.
   - Output encoding digabung ke dataframe utama.
4. Feature Engineering
    movie_age dibuat dengan menghitung selisih antara tahun acuan, yaitu 2025 dan tahun rilis film. Untuk mendapatkan informasi umur film.
5. Split Data
    Data disiapkan untuk pelatihan model machine learning
    - Target(y): kolom yang diprediksi "rating"
    - Fitur (x): Semua kolom lain kecuali kolom yang tidak relevan(rating, duration, description, name, director, stars) dan termasuk semua kolom hasil preprocessing(duration_minutes, year_scaled, rating_scaled, duration_scaled, kolom hasil encoding dan movie_age)
    - Data dibagi menjadi data training set dan data test set dengan rasio 80:20
    - Xtrain:(191, 102 kolom)
    - xtest:(48 baris, 102 kolom)