Exercise: Working with Pandas DataFrames (Book Ratings Example)

In [1]:
# Import libraries and prepare the raw data
import pandas as pd
import numpy as np

# DO NOT CHANGE THE VARIABLE NAMES

# Render floats with one decimal place so ratings and column means
# print compactly. This is a display option only.
pd.set_option('display.precision', 1)

# Book titles and their authors, kept as two Series in the same order:
# position i of `authors` is the author of position i of `books`.
books = pd.Series(data=[
    'Great Expectations',
    'Of Mice and Men',
    'Romeo and Juliet',
    'The Time Machine',
    'Alice in Wonderland'
])

authors = pd.Series(data=[
    'Charles Dickens',
    'John Steinbeck',
    'William Shakespeare',
    'H. G. Wells',  # no leading space: it would leak into the Author column
    'Lewis Carroll'
])

# One Series of ratings per user; position i is the rating for books[i].
# np.nan marks a book the user did not rate. Some Series are shorter than
# `books`: those users simply have no entry for the trailing titles, which
# shows up as NaN once the Series are combined by index into a DataFrame.
user_1 = pd.Series(data=[3.2, np.nan, 2.5])
user_2 = pd.Series(data=[5.0, 1.3, 4.0, 3.8])
user_3 = pd.Series(data=[2.0, 2.3, np.nan, 4.0])
user_4 = pd.Series(data=[4.0, 3.5, 4.0, 5.0, 4.2])

Organising the data using a dictionary

In [2]:
# Gather every Series into one dictionary keyed by column label.
# pd.DataFrame turns each key into a column name and each value into
# that column's data, so the key order here is the column order.
dat = {
    'Book Title': books,
    'Author': authors,
    'User 1': user_1,
    'User 2': user_2,
    'User 3': user_3,
    'User 4': user_4
}

### Notebook grading (used to verify correctness during assessment)
# Reference answer built from the same source Series.
dat1 = {
    'Book Title': books,
    'Author': authors,
    'User 1': user_1,
    'User 2': user_2,
    'User 3': user_3,
    'User 4': user_4
}

# Compare column by column with Series.equals instead of `dat1 != dat`.
# A plain dict comparison only works while both dicts hold the very same
# Series objects; for equal-but-distinct Series it falls back to
# Series.__eq__ and raises "The truth value of a Series is ambiguous".
# Series.equals returns a single bool and treats NaNs in matching
# positions as equal.
dat_is_correct = (
    isinstance(dat, dict)
    and dat.keys() == dat1.keys()
    and all(dat1[column].equals(dat[column]) for column in dat1)
)

if not dat_is_correct:
    print("dat is not correct")
else:
    print("dat is correct")

dat is correct


Creating the DataFrame

In [3]:
# Build the ratings table from the dictionary: each key becomes a column
# and the rows carry the integer index of the input Series.
# Unrated books are still NaN here; they are handled in a later step.
book_ratings = pd.DataFrame(data=dat)

### Notebook grading
# The reference frame comes from the reference dictionary; DataFrame.equals
# yields one bool and counts NaNs in the same position as equal.
book_ratings1 = pd.DataFrame(data=dat1)
print(
    "book_ratings is correct"
    if book_ratings1.equals(book_ratings)
    else "book_ratings is not correct"
)

book_ratings is correct


Handling missing values using column averages

In [4]:
# Fill every missing rating with the mean of its own column.
# mean(numeric_only=True) yields one value per rating column and skips the
# text columns; fillna matches those means to columns by label.
# The result is assigned back rather than using inplace=True: inplace does
# not save a copy in general, and plain reassignment keeps the step easy
# to chain and safe to re-run.
book_ratings = book_ratings.fillna(
    book_ratings.mean(axis=0, numeric_only=True)
)

### Notebook grading
# Build the expected frame from the reference dictionary dat1 (as the
# DataFrame-creation check does), not from the dat being graded.
book_ratings1 = pd.DataFrame(dat1)
book_ratings1 = book_ratings1.fillna(
    book_ratings1.mean(axis=0, numeric_only=True)
)

if not book_ratings1.equals(book_ratings):
    print("book_ratings has not been correctly updated.")
else:
    print("book_ratings was correctly updated")

book_ratings was correctly updated
