## **Import Library**

In [226]:
import re

import pandas as pd

## **Load data JSON**

In [227]:
# Read the film dataset from its JSON file and preview the first five rows.
df = pd.read_json(path_or_buf="/content/data_film.json")
df.head(n=5)

Unnamed: 0,judul,tags,deskripsi,sutradara,bintang,negara,durasi,rating,rilis_date
0,Superman,"Canada, Australia, United States, New Zealand,...",Dalam versi reimajinasi yang epik dari Superma...,James Gunn,"Nicholas Hoult, Alan Tudyk, Rachel Brosnahan, ...","Canada, Australia, United States, New Zealand",00:30,2.3,2006-02-26
1,A Gilded Game,"China, Action",Seorang mahasiswa keuangan berbakat mendapatka...,Herman Yau,"Andy Lau, Hao Ou, James Filbird",China,00:37,1.7,2002-03-02
2,Night Carnage,"United States, Action, Romance, Horror",Seorang blogger manusia serigala bertemu denga...,Sadie Katz,United States,,01:03,9.5,2009-11-12
3,Blood Brothers: Bara Naga,"Malaysia, Drama, Action, Crime",Sekelompok pria yang bekerja di sebuah organis...,Sharnaaz Ahmad,Malaysia,,00:54,9.6,2021-01-30
4,High Five aka Hi Five,"South Korea, Comedy, Action, Adventure","Lima orang biasa, yang mengembangkan kekuatan ...",Yoo Ah-In,South Korea,,02:09,0.7,2011-04-04


In [228]:
# Print a structural overview of the dataset (columns, non-null counts, dtypes).
info_heading = "\nInformasi Dataset:"
print(info_heading)
df.info()


Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56276 entries, 0 to 56275
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   judul       56276 non-null  object 
 1   tags        51458 non-null  object 
 2   deskripsi   49358 non-null  object 
 3   sutradara   51340 non-null  object 
 4   bintang     51277 non-null  object 
 5   negara      28202 non-null  object 
 6   durasi      56276 non-null  object 
 7   rating      56276 non-null  float64
 8   rilis_date  56276 non-null  object 
dtypes: float64(1), object(8)
memory usage: 3.9+ MB


## **Eksplorasi awal**

In [229]:
# Report how many values are missing in each column before any cleaning.
print("\nJumlah Nilai yang Hilang Sebelum Pembersihan:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Report how many rows are exact duplicates of an earlier row.
print("\nJumlah Baris Duplikat:")
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)


Jumlah Nilai yang Hilang Sebelum Pembersihan:
judul             0
tags           4818
deskripsi      6918
sutradara      4936
bintang        4999
negara        28074
durasi            0
rating            0
rilis_date        0
dtype: int64

Jumlah Baris Duplikat:
0


## **Data Cleaning**

In [230]:
# Normalise column names: trim surrounding whitespace and lower-case them.
df.columns = df.columns.str.strip().str.lower()
text_cols = ['tags', 'deskripsi', 'sutradara', 'bintang', 'negara']
# Strings that stand for "no value"; compared case-insensitively after trimming.
missing_markers = {'nan', 'null', ''}

for col in text_cols:
    if col not in df.columns:
        continue

    # Trim string cells BEFORE looking for placeholders. Stripping only at the
    # end let whitespace-only cells and padded placeholders (e.g. " nan ")
    # bypass the missing-value handling and survive as "" or "Nan".
    trimmed = df[col].map(lambda value: value.strip() if isinstance(value, str) else value)
    is_placeholder = trimmed.map(
        lambda value: isinstance(value, str) and value.lower() in missing_markers
    ).astype(bool)
    is_missing = trimmed.isna() | is_placeholder

    # Categorical-like columns fall back to 'Unknown'; the description column
    # gets its own placeholder text.
    default = 'Unknown' if col in ['tags', 'sutradara', 'bintang', 'negara'] else 'Tidak ada deskripsi'

    # NOTE(review): title-casing is applied to every text column, including the
    # free-text 'deskripsi' (and its default, which becomes 'Tidak Ada Deskripsi').
    # Kept as-is to preserve the existing output format - confirm this is intended.
    df[col] = trimmed.mask(is_missing, default).astype(str).str.strip().str.title()


In [231]:
# Rating: coerce to numeric; values that cannot be parsed become NaN and are
# then replaced with 0.
if 'rating' in df.columns:
    numeric_rating = pd.to_numeric(df['rating'], errors='coerce')
    df['rating'] = numeric_rating.fillna(0)

# Release date: parse to datetime; values that cannot be parsed become NaT.
if 'rilis_date' in df.columns:
    parsed_release_date = pd.to_datetime(df['rilis_date'], errors='coerce')
    df['rilis_date'] = parsed_release_date

    # Discard rows whose release date could not be parsed (NaT).
    df = df.dropna(subset=['rilis_date'])

In [232]:
def convert_to_minutes(duration):
    """Convert a raw duration value to a whole number of minutes.

    Recognised formats (case-insensitive, surrounding whitespace ignored):

    * ``"HH:MM"`` or ``"HH:MM:SS"`` -> ``hours * 60 + minutes`` (seconds are
      dropped).
    * Text containing ``"min"``: ``"90 min"`` -> 90. When an hour part comes
      before the minute part, both are combined: ``"1h 30min"`` -> 90.
    * A bare integer string, taken to be minutes already: ``"120"`` -> 120.

    Args:
        duration: Raw duration value; it is converted with ``str()``.

    Returns:
        int: The duration in minutes, or 0 when the value is missing, empty,
        or not in one of the recognised formats.
    """
    if pd.isnull(duration):
        return 0

    text = str(duration).lower().strip()
    if not text:
        return 0

    if "min" in text:
        # "<hours> h... <minutes> min": the hour part must precede the minutes,
        # so a trailing remark such as "120 min (2h)" is not double counted.
        combined = re.search(r'(\d+)\s*h[a-z]*\W*(\d+)\s*min', text)
        if combined:
            return int(combined.group(1)) * 60 + int(combined.group(2))
        # Minutes only: use the first number found in the text.
        digits = re.findall(r'\d+', text)
        return int(digits[0]) if digits else 0

    if ":" in text:
        parts = text.split(":")
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as "²" that int() rejects with ValueError.
        if len(parts) in (2, 3) and all(part.isdecimal() for part in parts):
            return int(parts[0]) * 60 + int(parts[1])
        return 0

    if text.isdecimal():
        return int(text)
    return 0
# Convert every duration to whole minutes; values are stringified first so the
# converter always receives text.
duration_text = df['durasi'].astype(str)
df['durasi'] = duration_text.map(convert_to_minutes)


## **Validasi Hasil Cleaning**

In [233]:
# Preview the first five rows of the cleaned dataset.
print("\nData setelah cleaning:")
df.head(n=5)


Data setelah cleaning:


Unnamed: 0,judul,tags,deskripsi,sutradara,bintang,negara,durasi,rating,rilis_date
0,Superman,"Canada, Australia, United States, New Zealand,...",Dalam Versi Reimajinasi Yang Epik Dari Superma...,James Gunn,"Nicholas Hoult, Alan Tudyk, Rachel Brosnahan, ...","Canada, Australia, United States, New Zealand",30,2.3,2006-02-26
1,A Gilded Game,"China, Action",Seorang Mahasiswa Keuangan Berbakat Mendapatka...,Herman Yau,"Andy Lau, Hao Ou, James Filbird",China,37,1.7,2002-03-02
2,Night Carnage,"United States, Action, Romance, Horror",Seorang Blogger Manusia Serigala Bertemu Denga...,Sadie Katz,United States,Unknown,63,9.5,2009-11-12
3,Blood Brothers: Bara Naga,"Malaysia, Drama, Action, Crime",Sekelompok Pria Yang Bekerja Di Sebuah Organis...,Sharnaaz Ahmad,Malaysia,Unknown,54,9.6,2021-01-30
4,High Five aka Hi Five,"South Korea, Comedy, Action, Adventure","Lima Orang Biasa, Yang Mengembangkan Kekuatan ...",Yoo Ah-In,South Korea,Unknown,129,0.7,2011-04-04


In [234]:
print("\nInfo setelah cleaning:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
df.info()

# Per-column count of remaining missing values; every count should be 0.
print("\nMissing value setelah cleaning:")
print(df.isnull().sum())


Info setelah cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56276 entries, 0 to 56275
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   judul       56276 non-null  object        
 1   tags        56276 non-null  object        
 2   deskripsi   56276 non-null  object        
 3   sutradara   56276 non-null  object        
 4   bintang     56276 non-null  object        
 5   negara      56276 non-null  object        
 6   durasi      56276 non-null  int64         
 7   rating      56276 non-null  float64       
 8   rilis_date  56276 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 3.9+ MB
None

Missing value setelah cleaning:
judul         0
tags          0
deskripsi     0
sutradara     0
bintang       0
negara        0
durasi        0
rating        0
rilis_date    0
dtype: int64


## **Simpan Data ke File CSV**

In [235]:
# Write the cleaned dataset to a UTF-8 CSV file, leaving out the index column.
cleaned_file_path = "data_film_cleaned.csv"
df.to_csv(path_or_buf=cleaned_file_path, encoding="utf-8", index=False)
print(f"\n Data hasil cleaning berhasil disimpan ke {cleaned_file_path}")


 Data hasil cleaning berhasil disimpan ke data_film_cleaned.csv
