In [1]:
# task1.ipynb

# Task 1.1: Load the data
import pandas as pd

def load_data(file_path):
    """Load the dataset from the given file path."""
    return pd.read_csv(file_path)

# File path (local file path)
file_path = "movie_metadata.csv"

# Load data
data = load_data(file_path)
data.head()


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0$,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0$,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0$,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0$,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,0.0$,,12.0,7.1,,0


In [2]:
# Task 1.2: Count missing values in 'duration' column
def count_missing_values(data, column):
    """Count missing values in the specified column."""
    return data[column].isnull().sum()

missing_duration_values = count_missing_values(data, 'duration')
print(f"Missing values in 'duration' column: {missing_duration_values}")


Missing values in 'duration' column: 15


In [3]:
# Task 1.3: Replace missing values in 'duration' column with median value
def replace_missing_values(data, column):
    """Replace missing values in the specified column with the median value."""
    median_value = data[column].median()
    data[column] = data[column].fillna(median_value)

replace_missing_values(data, 'duration')


In [4]:
# Task 1.4: Calculate the average film length
def calculate_average(data, column):
    """Calculate the average of the specified column, rounded to two decimal places."""
    return round(data[column].mean(), 2)

average_duration = calculate_average(data, 'duration')
print(f"Average film length: {average_duration}")


Average film length: 107.19


In [5]:
# Task 1.5: Categorize movie duration
def categorize_movie_duration(data):
    """Create a movie_duration_category column based on the duration."""
    conditions = [
        (data['duration'] < 90),
        (data['duration'] >= 90) & (data['duration'] <= 120),
        (data['duration'] > 120)
    ]
    categories = ["1. <90", "2. 90–120", "3. >120"]
    data['movie_duration_category'] = pd.cut(data['duration'], bins=[0, 90, 120, data['duration'].max()], labels=categories, right=False)

categorize_movie_duration(data)
data[['duration', 'movie_duration_category']].head()


Unnamed: 0,duration,movie_duration_category
0,178.0,3. >120
1,169.0,3. >120
2,148.0,3. >120
3,164.0,3. >120
4,103.0,2. 90–120


In [6]:
# Task 1.6: Build a summary table for films released after 2000
def build_summary_table(data):
    """Build a summary table for films released after 2000."""
    filtered_data = data[data['title_year'] >= 2000]
    summary_table = pd.crosstab(filtered_data['title_year'], filtered_data['movie_duration_category'])
    return summary_table

summary_table = build_summary_table(data)
print("Summary table for films released after 2000:")
print(summary_table)


Summary table for films released after 2000:
movie_duration_category  1. <90  2. 90–120  3. >120
title_year                                         
2000.0                       25        108       38
2001.0                       29        118       41
2002.0                       36        145       28
2003.0                       31        104       34
2004.0                       30        134       50
2005.0                       31        141       49
2006.0                       40        142       57
2007.0                       31        126       47
2008.0                       29        157       39
2009.0                       42        174       44
2010.0                       26        169       35
2011.0                       36        149       40
2012.0                       41        139       41
2013.0                       38        141       58
2014.0                       36        170       46
2015.0                       34        142       50
2016.0             

In [7]:
# Task 1.7: Count the number of films in 2008 between 90 minutes and two hours long
def count_films_in_category(summary_table, year, category):
    """Count the number of films in the specified year and category."""
    return summary_table.loc[year, category]

films_2008_90_120 = count_films_in_category(summary_table, 2008, "2. 90–120")
print(f"Number of films between 90 minutes and two hours long released in 2008: {films_2008_90_120}")


Number of films between 90 minutes and two hours long released in 2008: 157


In [8]:
# Task 1.8: Categorize plot keywords
def categorize_plot_keywords(data):
    """Create a movie_plot_category column based on the plot_keywords."""
    def categorize_plot(plot_keywords):
        if pd.isnull(plot_keywords):
            return "other"
        plot_keywords_lower = plot_keywords.lower()
        if "love" in plot_keywords_lower and "death" in plot_keywords_lower:
            return "love_and_death"
        elif "love" in plot_keywords_lower:
            return "love"
        elif "death" in plot_keywords_lower:
            return "death"
        else:
            return "other"

    data['movie_plot_category'] = data['plot_keywords'].apply(categorize_plot)

categorize_plot_keywords(data)
data[['plot_keywords', 'movie_plot_category']].head()


Unnamed: 0,plot_keywords,movie_plot_category
0,avatar|future|marine|native|paraplegic,other
1,goddess|marriage ceremony|marriage proposal|pi...,other
2,bomb|espionage|sequel|spy|terrorist,other
3,deception|imprisonment|lawlessness|police offi...,other
4,,other


In [9]:
# Task 1.9: Build a table for average ratings based on movie_plot_category
def build_rating_table(data):
    """Build a table for average ratings based on movie_plot_category."""
    return data.groupby('movie_plot_category')['imdb_score'].mean().round(2)

average_rating_table = build_rating_table(data)
print("Average ratings based on movie_plot_category:")
print(average_rating_table)


Average ratings based on movie_plot_category:
movie_plot_category
death             6.54
love              6.58
love_and_death    6.51
other             6.43
Name: imdb_score, dtype: float64


In [10]:
# Task 1.10: Get the average rating of films in the 'love' category
average_rating_love = average_rating_table['love']
print(f"Average rating of films in the 'love' category: {average_rating_love}")


Average rating of films in the 'love' category: 6.58


In [11]:
# Task 1.11: Clean the budget column and calculate the median budget
def clean_budget_column(data):
    """Clean the budget column by removing non-numeric characters and converting to float."""
    data['budget'] = data['budget'].replace('[$,]', '', regex=True).astype(float)

clean_budget_column(data)
median_budget = data['budget'].median()
print(f"Median budget for all films listed: {median_budget}")


Median budget for all films listed: 15000000.0
