### Cleaning

In [108]:
# import pandas (DataFrames) and re (regular expressions)
import pandas as pd 
import re


##### Cleaning for Movie Data

In [109]:
# Load the raw movie table and discard the columns this notebook does not keep.
df = pd.read_csv("../data/digital/movies.csv")
df = df.drop(columns=["certificate", "votes", "ACTOR 2", "metascore", "ACTOR 1", "DIRECTOR ", "ACTOR 3", "ACTOR 4", "GROSS COLLECTION"])

# Positional rename: this relies on exactly seven columns remaining, in this order.
df.columns = ["ranking", "name", "year", "runtime", "genre", "rating", "summary"]

# Number of characters in each summary (NaN where the summary is missing).
df["summary_length"] = df["summary"].str.len()

# Convert the whole column at once: values that cannot be parsed become NaN,
# and abs() turns any negative value into its positive counterpart.
df["year"] = pd.to_numeric(df["year"], errors="coerce").abs()

# "Crime, Drama" -> ["Crime", "Drama"]. A missing genre is left as NaN rather
# than raising, which calling str.split on a float NaN would do.
df["genre"] = df["genre"].str.split(",").apply(
    lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts
)

# to_csv runs with its defaults, so the row index is written as the first column.
df.to_csv("../data/digital/cleaned_movies.csv")
df

Unnamed: 0,ranking,name,year,runtime,genre,rating,summary,summary_length
0,1,Jai Bhim,2021.0,164 min,"[Crime, Drama]",9.4,When a tribal man is arrested for a case of al...,121
1,2,The Shawshank Redemption,1994.0,142 min,[Drama],9.3,Two imprisoned men bond over a number of years...,118
2,3,The Godfather,1972.0,175 min,"[Crime, Drama]",9.2,"The Godfather follows Vito Corleone, Don of th...",119
3,4,The Dark Knight,2008.0,152 min,"[Action, Crime, Drama]",9.0,When the menace known as the Joker wreaks havo...,189
4,5,The Godfather: Part II,1974.0,202 min,"[Crime, Drama]",9.0,The early life and career of Vito Corleone in ...,164
...,...,...,...,...,...,...,...,...
245,246,Mr. Smith Goes to Washington,1939.0,129 min,"[Comedy, Drama]",8.1,A naive youth leader is appointed to fill a va...,239
246,247,Gone with the Wind,1939.0,238 min,"[Drama, History, Romance]",8.1,The manipulative daughter of a Georgia plantat...,167
247,248,It Happened One Night,1934.0,105 min,"[Comedy, Romance]",8.1,A renegade reporter trailing a young runaway h...,212
248,249,The Passion of Joan of Arc,1928.0,114 min,"[Biography, Drama, History]",8.1,"In 1431, Jeanne d'Arc is placed on trial on ch...",151


##### Cleaning for TV shows

In [113]:
# Load the raw series table and discard the vote counts.
df = pd.read_csv("../data/digital/series.csv")
df = df.drop(columns=["VOTES"])

# Positional rename: this relies on exactly twelve columns remaining, in this order.
df.columns = ["ranking", "name", "year", "certificate", "runtime_mins", "genre", "rating", "details", "actor_1", "actor_2", "actor_3", 'actor_4']

# "Crime, Drama" -> ["Crime", "Drama"]. A missing genre is left as NaN rather
# than raising, which calling str.split on a float NaN would do.
df['genre'] = df['genre'].str.split(",").apply(
    lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts
)

# Ranges look like "(2008–2013)" (en dash). The end year is optional so that an
# open-ended range such as "(2016– )" still yields its start year instead of
# being lost; every string the stricter two-year pattern matched still matches
# with the same captures.
# NOTE(review): the open-ended format is assumed from the range format — confirm against series.csv.
year_span = df['year'].str.extract(r'\((\d{4})–\s*(\d{4})?\s*\)')
df['start_year'] = pd.to_numeric(year_span[0])
df['end_year'] = pd.to_numeric(year_span[1])

# Rows without a range fall back to the year value itself: parse it as a number
# (NaN if that fails) and take abs() so a negative value becomes a positive year.
single_year = pd.to_numeric(df['year'], errors='coerce').abs()
df['start_year'] = df['start_year'].fillna(single_year)

# Keep the leading token of the runtime text as an integer number of minutes;
# missing runtimes are skipped by na_action and stay NaN in the float column.
df['runtime_mins'] = df['runtime_mins'].map(
    lambda value: int(str(value).split(' ')[0]), na_action='ignore'
).astype(float)

# The raw year text is now fully represented by start_year / end_year.
df = df.drop(columns=['year'])

# to_csv runs with its defaults, so the row index is written as the first column.
df.to_csv("../data/digital/cleaned_data.csv")
df

Unnamed: 0,ranking,name,certificate,runtime_mins,genre,rating,details,actor_1,actor_2,actor_3,actor_4,start_year,end_year
0,1,Breaking Bad,TV-MA,49.0,"[Crime, Drama, Thriller]",9.4,A high school chemistry teacher diagnosed with...,Bryan Cranston,Aaron Paul,Anna Gunn,Betsy Brandt,2008.0,2013.0
1,2,Game of Thrones,TV-MA,57.0,"[Action, Adventure, Drama]",9.2,Nine noble families fight for control over the...,Emilia Clarke,Peter Dinklage,Kit Harington,Lena Headey,2011.0,2019.0
2,3,Chernobyl,TV-MA,330.0,"[Drama, History, Thriller]",9.4,"In April 1986, an explosion at the Chernobyl n...",Jessie Buckley,Jared Harris,Stellan Skarsgård,Adam Nagaitis,2019.0,
3,4,Band of Brothers,TV-MA,594.0,"[Drama, History, War]",9.4,The story of Easy Company of the U.S. Army 101...,Scott Grimes,Damian Lewis,Ron Livingston,Shane Taylor,2001.0,
4,5,Sherlock,TV-14,88.0,"[Crime, Drama, Mystery]",9.1,A modern update finds the famous sleuth and hi...,Benedict Cumberbatch,Martin Freeman,Una Stubbs,Rupert Graves,2010.0,2017.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,246,Utopia,TV-MA,50.0,"[Drama, Mystery, Sci-Fi]",8.4,"After a group of people, who meet online, disc...",Adeel Akhtar,Paul Higgins,Neil Maskell,Fiona O'Shaughnessy,2013.0,2014.0
246,247,The Adventures of Sherlock Holmes,TV-PG,60.0,"[Crime, Drama, Mystery]",8.7,Sherlock Holmes and Dr Watson solve the myster...,Jeremy Brett,David Burke,Rosalie Williams,Eric Porter,1984.0,1985.0
247,248,Carnivàle,TV-MA,55.0,"[Drama, Fantasy, Mystery]",8.4,"During the Great Depression, an Oklahoma farm ...",Michael J. Anderson,Adrienne Barbeau,Clancy Brown,Debra Christofferson,2003.0,2005.0
248,249,Castlevania,TV-MA,23.0,"[Animation, Action, Adventure]",8.3,A vampire hunter fights to save a besieged cit...,Richard Armitage,James Callis,Alejandra Reynoso,Theo James,2017.0,2021.0


##### Cleaning for video games

In [111]:
# Load the raw video game table and discard the vote counts
# (the source header really is "VOTES " with a trailing space).
df = pd.read_csv("../data/digital/video_games.csv")
df = df.drop(columns=["VOTES "])

# Positional rename: this relies on exactly twelve columns remaining, in this order.
df.columns = ["ranking", "name", "year", "genre", "rating", "summary", "director", "actor_1", "actor_2", "actor_3", "actor_4", "certificate"]

# Pull the four digits that follow the opening parenthesis. expand=False returns
# a Series rather than a one-column DataFrame. astype(int) still raises if any
# row has no match, so a malformed year is not silently dropped.
df['year'] = df['year'].str.extract(r'\((\d{4})', expand=False).astype(int)

# "Action, Adventure" -> ["Action", "Adventure"]. Each name is trimmed so the
# lists match the movie and series cells (a plain split leaves " Adventure"
# with a leading space); a missing genre is left as NaN.
df['genre'] = df['genre'].str.split(",").apply(
    lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts
)

# to_csv runs with its defaults, so the row index is written as the first column.
df.to_csv("../data/digital/cleaned_video_games.csv")
df

Unnamed: 0,ranking,name,year,genre,rating,summary,director,actor_1,actor_2,actor_3,actor_4,certificate
0,1,The Witcher 3: Wild Hunt - Blood and Wine,2016,"[Action, Adventure, Drama]",9.8,Geralt is in the southern province of Toussain...,Konrad Tomaszkiewicz,Doug Cockle,MyAnna Buring,Antonia Bernath,Mark Noble,
1,2,Red Dead Redemption II,2018,"[Action, Adventure, Crime]",9.7,Amidst the decline of the Wild West at the tur...,Benjamin Byron Davis,Roger Clark,Rob Wiethoff,Cali Elizabeth Moore,,M
2,3,The Witcher 3: Wild Hunt,2015,"[Action, Adventure, Drama]",9.7,A monster hunter for hire embarks on an epic j...,Konrad Tomaszkiewicz,Doug Cockle,Denise Gough,Jo Wyatt,Jaimi Barbakoff,M
3,4,The Last of Us,2013,"[Action, Adventure, Drama]",9.7,"In a hostile, post-pandemic world, Joel and El...",Neil Druckmann,Jeffrey Pierce,Ashley Johnson,Troy Baker,Hana Hayes,M
4,5,Mass Effect: Legendary Edition,2021,"[Action, Adventure, Sci-Fi]",9.7,A compilation of the first three Mass Effect g...,Casey Hudson,Seth Green,Steven Barr,Kimberly Brooks,Keith David,M
...,...,...,...,...,...,...,...,...,...,...,...,...
245,246,The Legend of Zelda: A Link Between Worlds,2013,"[Action, Adventure, Fantasy]",8.7,Link is yet again tasked with restoring peace ...,Hiromasa Shikata,Mitsuki Saiga,Ayumi Fujimura,Seiro Ogino,Kei Hayami,E
246,247,Black Mesa,2012,"[Action, Sci-Fi, Thriller]",8.7,Scientist Gordon Freeman navigates through the...,Carlos Montero,Adam Dravean,Cris Mertens,Michael Tsarouhas,Kevin Sisk,
247,248,Assassin's Creed IV: Black Flag,2013,"[Action, Adventure, History]",8.7,"A video game where you play as sailor, private...",Ashraf Ismail,Kate Fleetwood,Ralph Ineson,Stuart Martin,Michael McElhatton,M
248,249,Ni no Kuni: Wrath of the White Witch,2011,"[Adventure, Fantasy]",8.7,"Oliver discovers that his mother is not dead, ...",Kentaro Motomura,Adam Wilson,Lauren Mote,Louis Tamone,Steffan Rhodri,E10+
