In [7]:
# Step 1: Import pandas and create a sample dataset
import pandas as pd
import numpy as np

# Create sample data with missing values and incorrect data types
data = {
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D', 'Movie E'],
    'Rating': [8.2, np.nan, 7.5, np.nan, 9.0],        # Some missing ratings
    'Release_Year': ['2010', '2012', np.nan, '2015', '2018']  # Some missing years, and as strings
}

df = pd.DataFrame(data)

print("üéûÔ∏è Original Movie Dataset:")
print(df)
print("-" * 60)



üéûÔ∏è Original Movie Dataset:
     Title  Rating Release_Year
0  Movie A     8.2         2010
1  Movie B     NaN         2012
2  Movie C     7.5          NaN
3  Movie D     NaN         2015
4  Movie E     9.0         2018
------------------------------------------------------------


In [2]:
# Step 2: Check for missing values
print("üîç Missing Values in Each Column:")
print(df.isnull().sum())
print("-" * 60)

üîç Missing Values in Each Column:
Title           0
Rating          2
Release_Year    1
dtype: int64
------------------------------------------------------------


In [3]:

# Step 3: Handle Missing Data
# Replace missing 'Rating' with median rating
df['Rating'].fillna(df['Rating'].median(), inplace=True)

# Drop rows where 'Release_Year' is missing
df.dropna(subset=['Release_Year'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(df['Rating'].median(), inplace=True)


In [4]:
# Step 4: Correct Data Type
# Convert 'Release_Year' from string to integer
df['Release_Year'] = df['Release_Year'].astype(int)


In [5]:
# Step 5: Output
print("‚úÖ Cleaned Movie Dataset:")
print(df)
print("-" * 60)

print("‚ÑπÔ∏è DataFrame Info After Cleaning:")
print(df.info())

‚úÖ Cleaned Movie Dataset:
     Title  Rating  Release_Year
0  Movie A     8.2          2010
1  Movie B     8.2          2012
3  Movie D     8.2          2015
4  Movie E     9.0          2018
------------------------------------------------------------
‚ÑπÔ∏è DataFrame Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 4
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         4 non-null      object 
 1   Rating        4 non-null      float64
 2   Release_Year  4 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 128.0+ bytes
None
