In [40]:
import pandas as pd
import ast

# Recleaning Columns

In [41]:
# Keep the raw export under its own name; every step below derives a new
# frame from it instead of mutating in place.
movies_poster_raw_df = pd.read_csv('data/movies_poster.csv')

# Placeholder values that appear in the data in place of a real overview / poster.
PLACEHOLDER_OVERVIEWS = ['No overview found.', 'No Overview', '']
MISSING_POSTER_URL = 'https://image.tmdb.org/t/p/original/None'

overview_col = movies_poster_raw_df['overview']
poster_url_col = movies_poster_raw_df['poster_url']

# A row is usable only when it has a real overview and a real poster URL.
has_overview = overview_col.notna() & ~overview_col.str.strip().isin(PLACEHOLDER_OVERVIEWS)
# Missing poster URLs are rejected here, *before* deduplication, so that a
# poster-less row can never win a dedupe and then be dropped afterwards
# (which would also lose the valid duplicate it shadowed).
has_poster = poster_url_col.notna() & (poster_url_col != MISSING_POSTER_URL)

movies_poster_df = (
    movies_poster_raw_df[has_overview & has_poster]
    # Drop unused columns
    .drop(columns=['original_language', 'popularity'])
    # Drop duplicates, one key at a time; the first occurrence is kept.
    # Note: drop_duplicates treats missing keys as equal to each other, so at
    # most one row with a missing imdb_id (or title) survives.
    .drop_duplicates(subset=['imdb_id'])
    .drop_duplicates(subset=['title'])
    .drop_duplicates(subset=['overview'])
    .drop_duplicates(subset=['poster_url'])
)

In [42]:
# Preview the cleaned frame; the rendered table elides the middle rows.
movies_poster_df

Unnamed: 0,genres,imdb_id,overview,title,vote_average,poster_url
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",tt0114709,"Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,https://image.tmdb.org/t/p/original//uXDfjJbdP...
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",tt0113497,When siblings Judy and Peter discover an encha...,Jumanji,6.9,https://image.tmdb.org/t/p/original//vgpXmVaVy...
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",tt0113228,A family wedding reignites the ancient feud be...,Grumpier Old Men,6.5,https://image.tmdb.org/t/p/original//1FSXpj5e8...
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",tt0114885,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,6.1,https://image.tmdb.org/t/p/original//qJU6rfil5...
4,"[{'id': 35, 'name': 'Comedy'}]",tt0113041,Just when George Banks has recovered from his ...,Father of the Bride Part II,5.7,https://image.tmdb.org/t/p/original//rj4LBtwQ0...
...,...,...,...,...,...,...
44484,"[{'id': 878, 'name': 'Science Fiction'}]",tt0112613,It's the year 3000 AD. The world's most danger...,Caged Heat 3000,3.5,https://image.tmdb.org/t/p/original//un9pLM0Ku...
44486,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",tt6209470,Rising and falling between a man and woman.,Subdue,4.0,https://image.tmdb.org/t/p/original//cp6OyeDkP...
44487,"[{'id': 18, 'name': 'Drama'}]",tt2028550,An artist struggles to finish his work while a...,Century of Birthing,9.0,https://image.tmdb.org/t/p/original//5RuNHleRz...
44489,[],tt0008536,"In a small town live two brothers, one a minis...",Satan Triumphant,0.0,https://image.tmdb.org/t/p/original//kmPJ4iJn3...


# Tidy genres Column

In [None]:
def parse_and_extract(val):
    """Return the genre names held in one raw ``genres`` cell.

    Parameters
    ----------
    val : str, list, tuple, None or float
        Normally the string form of a list of records such as
        ``"[{'id': 16, 'name': 'Animation'}]"``. An already-parsed list or
        tuple is accepted too, so applying this function to its own output
        (e.g. re-running the cell) leaves the values unchanged.

    Returns
    -------
    list
        One entry per record: the record's ``"name"`` value (``None`` when a
        dict has no ``"name"`` key). String items are kept as they are,
        because they are taken to be names extracted earlier. Missing input
        (``None``, NaN, blank string) and values that are not a list or tuple
        yield ``[]``.

    Raises
    ------
    ValueError, SyntaxError
        If ``val`` is a non-blank string that is not a valid Python literal.
    """
    # Missing cell: None, or float NaN (the only float not equal to itself).
    if val is None or (isinstance(val, float) and val != val):
        return []

    if isinstance(val, str):
        if not val.strip():
            return []
        # literal_eval only evaluates Python literals; it does not run code.
        val = ast.literal_eval(val)

    if not isinstance(val, (list, tuple)):
        return []

    names = []
    for item in val:
        if isinstance(item, dict):
            names.append(item.get("name"))
        elif isinstance(item, str):
            # Already a plain name from a previous extraction pass.
            names.append(item)
    return names

In [44]:
# Swap each raw genres cell for the list of genre names it contains.
genre_names = movies_poster_df["genres"].apply(parse_and_extract)
movies_poster_df["genres"] = genre_names

In [45]:
# Preview the frame again to check the genres column after extraction.
movies_poster_df

Unnamed: 0,genres,imdb_id,overview,title,vote_average,poster_url
0,"[Animation, Comedy, Family]",tt0114709,"Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,https://image.tmdb.org/t/p/original//uXDfjJbdP...
1,"[Adventure, Fantasy, Family]",tt0113497,When siblings Judy and Peter discover an encha...,Jumanji,6.9,https://image.tmdb.org/t/p/original//vgpXmVaVy...
2,"[Romance, Comedy]",tt0113228,A family wedding reignites the ancient feud be...,Grumpier Old Men,6.5,https://image.tmdb.org/t/p/original//1FSXpj5e8...
3,"[Comedy, Drama, Romance]",tt0114885,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,6.1,https://image.tmdb.org/t/p/original//qJU6rfil5...
4,[Comedy],tt0113041,Just when George Banks has recovered from his ...,Father of the Bride Part II,5.7,https://image.tmdb.org/t/p/original//rj4LBtwQ0...
...,...,...,...,...,...,...
44484,[Science Fiction],tt0112613,It's the year 3000 AD. The world's most danger...,Caged Heat 3000,3.5,https://image.tmdb.org/t/p/original//un9pLM0Ku...
44486,"[Drama, Family]",tt6209470,Rising and falling between a man and woman.,Subdue,4.0,https://image.tmdb.org/t/p/original//cp6OyeDkP...
44487,[Drama],tt2028550,An artist struggles to finish his work while a...,Century of Birthing,9.0,https://image.tmdb.org/t/p/original//5RuNHleRz...
44489,[],tt0008536,"In a small town live two brothers, one a minis...",Satan Triumphant,0.0,https://image.tmdb.org/t/p/original//kmPJ4iJn3...


In [46]:
# Persist the cleaned frame without the index column.
# The genres lists are written as their string form, so a reader of this file
# has to parse that column back into lists.
cleaned_csv_path = 'data/movies_cleaned.csv'
movies_poster_df.to_csv(path_or_buf=cleaned_csv_path, index=False)