In [1]:
# Import the Pandas library for data manipulation
import pandas as pd

In [2]:
# Load the dataset from a CSV file
df = pd.read_csv("movie.csv")

In [3]:
# Display the first few rows of the dataset to get an overview
df

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [4]:
# Get the number of rows and columns in the DataFrame
nrows, ncols = df.shape
print(f"Number of rows: {nrows} \nNumber of columns: {ncols}")

Number of rows: 4916 
Number of columns: 28


In [5]:
# Display the data types of each column
df.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [6]:
# Count the occurrences of each data type in the DataFrame
df.dtypes.value_counts()

float64    13
object     12
int64       3
Name: count, dtype: int64

In [7]:
# Check for missing values in each column
df.isnull().sum()

color                         19
director_name                102
num_critic_for_reviews        49
duration                      15
director_facebook_likes      102
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        862
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                152
movie_imdb_link                0
num_user_for_reviews          21
language                      14
country                        5
content_rating               300
budget                       484
title_year                   106
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 326
movie_facebook_likes           0
dtype: int64

In [8]:
# Step 1: Handle Missing Values
# Fill numerical columns with the median value and categorical columns with the mode
for column in df.columns:
    if df[column].dtype == 'float64' or df[column].dtype == 'int64': # Check for numerical columns
        df[column].fillna(df[column].median(), inplace=True) # Fill missing values with median
    else:
        df[column].fillna(df[column].mode()[0], inplace=True) # Fill missing values with mode

In [9]:
# Verify that there are no more missing values
df.isnull().sum()

color                        0
director_name                0
num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
gross                        0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
movie_imdb_link              0
num_user_for_reviews         0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

In [10]:
# Step 2: Filter Data
# Filtering the dataset to only include movies with an IMDb score greater than 7.0
filtered_df = df[df['imdb_score'] > 7.0]

In [11]:
# Convert all column names to uppercase for consistency
filtered_df.rename(columns= lambda label: label.upper(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.rename(columns= lambda label: label.upper(), inplace=True)


In [12]:
# Select the "MOVIE_TITLE" column to view movie titles
filtered_df["MOVIE_TITLE"]

0                                           Avatar
1         Pirates of the Caribbean: At World's End
3                            The Dark Knight Rises
4       Star Wars: Episode VII - The Force Awakens
7                                          Tangled
                           ...                    
4900                                    The Circle
4902                                      The Cure
4909                               The Mongol King
4911                       Signed Sealed Delivered
4912                                 The Following
Name: MOVIE_TITLE, Length: 1546, dtype: object

In [13]:
# Set "MOVIE_TITLE" as the index of the DataFrame
filtered_df.set_index('MOVIE_TITLE',inplace=True)

In [14]:
# Display the DataFrame with "MOVIE_TITLE" as the index
filtered_df

Unnamed: 0_level_0,COLOR,DIRECTOR_NAME,NUM_CRITIC_FOR_REVIEWS,DURATION,DIRECTOR_FACEBOOK_LIKES,ACTOR_3_FACEBOOK_LIKES,ACTOR_2_NAME,ACTOR_1_FACEBOOK_LIKES,GROSS,GENRES,...,NUM_USER_FOR_REVIEWS,LANGUAGE,COUNTRY,CONTENT_RATING,BUDGET,TITLE_YEAR,ACTOR_2_FACEBOOK_LIKES,IMDB_SCORE,ASPECT_RATIO,MOVIE_FACEBOOK_LIKES
MOVIE_TITLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,Color,Doug Walker,108.0,103.0,131.0,366.0,Rob Walker,131.0,25043962.0,Documentary,...,153.0,English,USA,R,19850000.0,2005.0,12.0,7.1,2.35,0
Tangled,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Circle,Color,Jafar Panahi,64.0,90.0,397.0,0.0,Nargess Mamizadeh,5.0,673780.0,Drama,...,26.0,Persian,Iran,Not Rated,10000.0,2000.0,0.0,7.5,1.85,697
The Cure,Color,Kiyoshi Kurosawa,78.0,111.0,62.0,6.0,Anna Nakagawa,89.0,94596.0,Crime|Horror|Mystery|Thriller,...,50.0,Japanese,Japan,R,1000000.0,1997.0,13.0,7.4,1.85,817
The Mongol King,Color,Anthony Vallone,108.0,84.0,2.0,2.0,John Considine,45.0,25043962.0,Crime|Drama,...,1.0,English,USA,PG-13,3250.0,2005.0,44.0,7.8,2.35,4
Signed Sealed Delivered,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,25043962.0,Comedy|Drama,...,6.0,English,Canada,R,19850000.0,2013.0,470.0,7.7,2.35,84


In [15]:
# Access data for the movie titled "Avatar"
filtered_df.loc["Avatar"]

COLOR                                                                    Color
DIRECTOR_NAME                                                    James Cameron
NUM_CRITIC_FOR_REVIEWS                                                   723.0
DURATION                                                                 178.0
DIRECTOR_FACEBOOK_LIKES                                                    0.0
ACTOR_3_FACEBOOK_LIKES                                                   855.0
ACTOR_2_NAME                                                  Joel David Moore
ACTOR_1_FACEBOOK_LIKES                                                  1000.0
GROSS                                                              760505847.0
GENRES                                         Action|Adventure|Fantasy|Sci-Fi
ACTOR_1_NAME                                                       CCH Pounder
NUM_VOTED_USERS                                                         886204
CAST_TOTAL_FACEBOOK_LIKES                           

In [16]:
# Display the index of the DataFrame, which should now be "MOVIE_TITLE"
filtered_df.index

Index(['Avatar', 'Pirates of the Caribbean: At World's End',
       'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens',
       'Tangled', 'Avengers: Age of Ultron',
       'Harry Potter and the Half-Blood Prince',
       'Pirates of the Caribbean: Dead Man's Chest', 'Man of Steel',
       'The Avengers',
       ...
       'The Last Waltz', 'Clerks', 'In the Company of Men', 'Slacker',
       'Stories of Our Lives', 'The Circle', 'The Cure', 'The Mongol King',
       'Signed Sealed Delivered', 'The Following'],
      dtype='object', name='MOVIE_TITLE', length=1546)

In [17]:
# Display the column names of the DataFrame
filtered_df.columns

Index(['COLOR', 'DIRECTOR_NAME', 'NUM_CRITIC_FOR_REVIEWS', 'DURATION',
       'DIRECTOR_FACEBOOK_LIKES', 'ACTOR_3_FACEBOOK_LIKES', 'ACTOR_2_NAME',
       'ACTOR_1_FACEBOOK_LIKES', 'GROSS', 'GENRES', 'ACTOR_1_NAME',
       'NUM_VOTED_USERS', 'CAST_TOTAL_FACEBOOK_LIKES', 'ACTOR_3_NAME',
       'FACENUMBER_IN_POSTER', 'PLOT_KEYWORDS', 'MOVIE_IMDB_LINK',
       'NUM_USER_FOR_REVIEWS', 'LANGUAGE', 'COUNTRY', 'CONTENT_RATING',
       'BUDGET', 'TITLE_YEAR', 'ACTOR_2_FACEBOOK_LIKES', 'IMDB_SCORE',
       'ASPECT_RATIO', 'MOVIE_FACEBOOK_LIKES'],
      dtype='object')

In [18]:
# Rename the "MOVIE_IMDB_LINK" column to "IMDB_LINK" for simplicity
filtered_df.rename(columns={"MOVIE_IMDB_LINK" : "IMDB_LINK"})

Unnamed: 0_level_0,COLOR,DIRECTOR_NAME,NUM_CRITIC_FOR_REVIEWS,DURATION,DIRECTOR_FACEBOOK_LIKES,ACTOR_3_FACEBOOK_LIKES,ACTOR_2_NAME,ACTOR_1_FACEBOOK_LIKES,GROSS,GENRES,...,NUM_USER_FOR_REVIEWS,LANGUAGE,COUNTRY,CONTENT_RATING,BUDGET,TITLE_YEAR,ACTOR_2_FACEBOOK_LIKES,IMDB_SCORE,ASPECT_RATIO,MOVIE_FACEBOOK_LIKES
MOVIE_TITLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,Color,Doug Walker,108.0,103.0,131.0,366.0,Rob Walker,131.0,25043962.0,Documentary,...,153.0,English,USA,R,19850000.0,2005.0,12.0,7.1,2.35,0
Tangled,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Circle,Color,Jafar Panahi,64.0,90.0,397.0,0.0,Nargess Mamizadeh,5.0,673780.0,Drama,...,26.0,Persian,Iran,Not Rated,10000.0,2000.0,0.0,7.5,1.85,697
The Cure,Color,Kiyoshi Kurosawa,78.0,111.0,62.0,6.0,Anna Nakagawa,89.0,94596.0,Crime|Horror|Mystery|Thriller,...,50.0,Japanese,Japan,R,1000000.0,1997.0,13.0,7.4,1.85,817
The Mongol King,Color,Anthony Vallone,108.0,84.0,2.0,2.0,John Considine,45.0,25043962.0,Crime|Drama,...,1.0,English,USA,PG-13,3250.0,2005.0,44.0,7.8,2.35,4
Signed Sealed Delivered,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,25043962.0,Comedy|Drama,...,6.0,English,Canada,R,19850000.0,2013.0,470.0,7.7,2.35,84


In [19]:
# Display the column names again to verify renaming
filtered_df.columns

Index(['COLOR', 'DIRECTOR_NAME', 'NUM_CRITIC_FOR_REVIEWS', 'DURATION',
       'DIRECTOR_FACEBOOK_LIKES', 'ACTOR_3_FACEBOOK_LIKES', 'ACTOR_2_NAME',
       'ACTOR_1_FACEBOOK_LIKES', 'GROSS', 'GENRES', 'ACTOR_1_NAME',
       'NUM_VOTED_USERS', 'CAST_TOTAL_FACEBOOK_LIKES', 'ACTOR_3_NAME',
       'FACENUMBER_IN_POSTER', 'PLOT_KEYWORDS', 'MOVIE_IMDB_LINK',
       'NUM_USER_FOR_REVIEWS', 'LANGUAGE', 'COUNTRY', 'CONTENT_RATING',
       'BUDGET', 'TITLE_YEAR', 'ACTOR_2_FACEBOOK_LIKES', 'IMDB_SCORE',
       'ASPECT_RATIO', 'MOVIE_FACEBOOK_LIKES'],
      dtype='object')

In [20]:
# Remove "_FACEBOOK" suffix from any column names that have it
filtered_df.rename(columns=lambda label:label.replace("_FACEBOOK", ""))

Unnamed: 0_level_0,COLOR,DIRECTOR_NAME,NUM_CRITIC_FOR_REVIEWS,DURATION,DIRECTOR_LIKES,ACTOR_3_LIKES,ACTOR_2_NAME,ACTOR_1_LIKES,GROSS,GENRES,...,NUM_USER_FOR_REVIEWS,LANGUAGE,COUNTRY,CONTENT_RATING,BUDGET,TITLE_YEAR,ACTOR_2_LIKES,IMDB_SCORE,ASPECT_RATIO,MOVIE_LIKES
MOVIE_TITLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,Color,Doug Walker,108.0,103.0,131.0,366.0,Rob Walker,131.0,25043962.0,Documentary,...,153.0,English,USA,R,19850000.0,2005.0,12.0,7.1,2.35,0
Tangled,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Circle,Color,Jafar Panahi,64.0,90.0,397.0,0.0,Nargess Mamizadeh,5.0,673780.0,Drama,...,26.0,Persian,Iran,Not Rated,10000.0,2000.0,0.0,7.5,1.85,697
The Cure,Color,Kiyoshi Kurosawa,78.0,111.0,62.0,6.0,Anna Nakagawa,89.0,94596.0,Crime|Horror|Mystery|Thriller,...,50.0,Japanese,Japan,R,1000000.0,1997.0,13.0,7.4,1.85,817
The Mongol King,Color,Anthony Vallone,108.0,84.0,2.0,2.0,John Considine,45.0,25043962.0,Crime|Drama,...,1.0,English,USA,PG-13,3250.0,2005.0,44.0,7.8,2.35,4
Signed Sealed Delivered,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,25043962.0,Comedy|Drama,...,6.0,English,Canada,R,19850000.0,2013.0,470.0,7.7,2.35,84


In [21]:
# Display the index to confirm that "_FACEBOOK" suffix is removed from any column names that have it
filtered_df.index

Index(['Avatar', 'Pirates of the Caribbean: At World's End',
       'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens',
       'Tangled', 'Avengers: Age of Ultron',
       'Harry Potter and the Half-Blood Prince',
       'Pirates of the Caribbean: Dead Man's Chest', 'Man of Steel',
       'The Avengers',
       ...
       'The Last Waltz', 'Clerks', 'In the Company of Men', 'Slacker',
       'Stories of Our Lives', 'The Circle', 'The Cure', 'The Mongol King',
       'Signed Sealed Delivered', 'The Following'],
      dtype='object', name='MOVIE_TITLE', length=1546)

In [22]:
# Sort the DataFrame by index (alphabetical order of movie titles)
filtered_df.sort_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.sort_index(inplace=True)


In [23]:
# Display the sorted DataFrame
filtered_df

Unnamed: 0_level_0,COLOR,DIRECTOR_NAME,NUM_CRITIC_FOR_REVIEWS,DURATION,DIRECTOR_FACEBOOK_LIKES,ACTOR_3_FACEBOOK_LIKES,ACTOR_2_NAME,ACTOR_1_FACEBOOK_LIKES,GROSS,GENRES,...,NUM_USER_FOR_REVIEWS,LANGUAGE,COUNTRY,CONTENT_RATING,BUDGET,TITLE_YEAR,ACTOR_2_FACEBOOK_LIKES,IMDB_SCORE,ASPECT_RATIO,MOVIE_FACEBOOK_LIKES
MOVIE_TITLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Cloverfield Lane,Color,Dan Trachtenberg,411.0,104.0,16.0,82.0,John Gallagher Jr.,14000.0,71897215.0,Drama|Horror|Mystery|Sci-Fi|Thriller,...,440.0,English,USA,PG-13,15000000.0,2016.0,338.0,7.3,2.35,33000
10 Days in a Madhouse,Color,Timothy Hines,1.0,111.0,0.0,247.0,Kelly LeBrock,1000.0,14616.0,Drama,...,10.0,English,USA,R,12000000.0,2015.0,445.0,7.5,1.85,26000
10 Things I Hate About You,Color,Gil Junger,133.0,97.0,19.0,835.0,Heath Ledger,23000.0,38176108.0,Comedy|Drama|Romance,...,549.0,English,USA,PG-13,16000000.0,1999.0,13000.0,7.2,1.85,10000
"10,000 B.C.",Color,Christopher Barnard,108.0,22.0,0.0,366.0,Morgan Freeman,5.0,25043962.0,Comedy,...,153.0,English,USA,R,19850000.0,2005.0,593.0,7.2,2.35,0
11:14,Color,Greg Marcks,68.0,85.0,9.0,407.0,Barbara Hershey,861.0,25043962.0,Comedy|Crime|Drama,...,133.0,English,USA,R,6000000.0,2003.0,618.0,7.2,1.85,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Yours, Mine and Ours",Color,Melville Shavelson,8.0,111.0,5.0,559.0,Tom Bosley,6000.0,25043962.0,Comedy|Family,...,61.0,English,USA,Unrated,2500000.0,1968.0,584.0,7.2,1.85,0
Zero Dark Thirty,Color,Kathryn Bigelow,558.0,157.0,0.0,304.0,Harold Perrineau,1000.0,95720716.0,Drama|History|Thriller,...,640.0,English,USA,R,40000000.0,2012.0,1000.0,7.4,1.85,39000
Zodiac,Color,David Fincher,377.0,162.0,21000.0,495.0,Jake Gyllenhaal,21000.0,33048353.0,Crime|Drama|History|Mystery|Thriller,...,589.0,English,USA,R,65000000.0,2007.0,15000.0,7.7,2.35,12000
Zombieland,Color,Ruben Fleischer,445.0,88.0,181.0,11.0,Bill Murray,15000.0,75590286.0,Adventure|Comedy|Horror|Sci-Fi,...,553.0,English,USA,R,23600000.0,2009.0,13000.0,7.7,2.35,26000


In [24]:
# Step 3: Add a New Column
# Adding a 'PROFIT' column, calculated as the difference between 'gross' and 'budget'
filtered_df = filtered_df.assign(PROFIT=filtered_df['GROSS'] - filtered_df['BUDGET'])

In [25]:
# Display the first few rows of the cleaned and modified dataset
filtered_df.head()

Unnamed: 0_level_0,COLOR,DIRECTOR_NAME,NUM_CRITIC_FOR_REVIEWS,DURATION,DIRECTOR_FACEBOOK_LIKES,ACTOR_3_FACEBOOK_LIKES,ACTOR_2_NAME,ACTOR_1_FACEBOOK_LIKES,GROSS,GENRES,...,LANGUAGE,COUNTRY,CONTENT_RATING,BUDGET,TITLE_YEAR,ACTOR_2_FACEBOOK_LIKES,IMDB_SCORE,ASPECT_RATIO,MOVIE_FACEBOOK_LIKES,PROFIT
MOVIE_TITLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Cloverfield Lane,Color,Dan Trachtenberg,411.0,104.0,16.0,82.0,John Gallagher Jr.,14000.0,71897215.0,Drama|Horror|Mystery|Sci-Fi|Thriller,...,English,USA,PG-13,15000000.0,2016.0,338.0,7.3,2.35,33000,56897215.0
10 Days in a Madhouse,Color,Timothy Hines,1.0,111.0,0.0,247.0,Kelly LeBrock,1000.0,14616.0,Drama,...,English,USA,R,12000000.0,2015.0,445.0,7.5,1.85,26000,-11985384.0
10 Things I Hate About You,Color,Gil Junger,133.0,97.0,19.0,835.0,Heath Ledger,23000.0,38176108.0,Comedy|Drama|Romance,...,English,USA,PG-13,16000000.0,1999.0,13000.0,7.2,1.85,10000,22176108.0
"10,000 B.C.",Color,Christopher Barnard,108.0,22.0,0.0,366.0,Morgan Freeman,5.0,25043962.0,Comedy,...,English,USA,R,19850000.0,2005.0,593.0,7.2,2.35,0,5193962.0
11:14,Color,Greg Marcks,68.0,85.0,9.0,407.0,Barbara Hershey,861.0,25043962.0,Comedy|Crime|Drama,...,English,USA,R,6000000.0,2003.0,618.0,7.2,1.85,0,19043962.0


In [26]:
# Display concise summary information about the DataFrame, including data types and memory usage
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1546 entries, 10 Cloverfield Lane to [Rec]
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   COLOR                      1546 non-null   object 
 1   DIRECTOR_NAME              1546 non-null   object 
 2   NUM_CRITIC_FOR_REVIEWS     1546 non-null   float64
 3   DURATION                   1546 non-null   float64
 4   DIRECTOR_FACEBOOK_LIKES    1546 non-null   float64
 5   ACTOR_3_FACEBOOK_LIKES     1546 non-null   float64
 6   ACTOR_2_NAME               1546 non-null   object 
 7   ACTOR_1_FACEBOOK_LIKES     1546 non-null   float64
 8   GROSS                      1546 non-null   float64
 9   GENRES                     1546 non-null   object 
 10  ACTOR_1_NAME               1546 non-null   object 
 11  NUM_VOTED_USERS            1546 non-null   int64  
 12  CAST_TOTAL_FACEBOOK_LIKES  1546 non-null   int64  
 13  ACTOR_3_NAME               1546 no