In [5]:
from pathlib import Path

import pandas as pd

In [6]:
# Load the raw IMDb CSV export into a DataFrame.
raw_csv_path = "../data/raw/imdb_raw.csv"
df = pd.read_csv(raw_csv_path)

In [7]:
# Tally the null cells in each column and report them.
null_counts = df.isnull().sum()
print("Missing values per column:\n")
print(null_counts)
print("\n")

# Count rows that are exact duplicates of an earlier row.
duplicate_total = df.duplicated().sum()
print("Duplicated Rows Count : ", duplicate_total)

Missing values per column:

title           0
director        0
release_year    0
runtime         0
genre           0
rating          0
metascore       0
gross           0
dtype: int64


Duplicated Rows Count :  0


In [None]:
# Show the first raw row so the original column layout and string formats are visible.
print("Original DataFrame : \n", df.head(1))

# Normalize column names: trim surrounding whitespace, lowercase, and turn spaces
# and hyphens into underscores. regex=False keeps the replacements literal
# regardless of the pandas version's default for that argument.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
)

# Confirm the column names after normalization.
print("Updated Column Names : \n", df.columns)

# Pull the numeric part out of the text columns.
# astype(str) keeps this cell re-runnable: after a first run these columns are
# already numeric, and the .str accessor raises AttributeError on non-string data.
# expand=False makes extract() return a Series instead of a one-column DataFrame.

# First run of four digits in 'release_year' (e.g. "(1994)" -> "1994").
df["release_year"] = df["release_year"].astype(str).str.extract(r"(\d{4})", expand=False)

# First run of digits in 'runtime' (e.g. "142 min" -> "142").
df["runtime"] = df["runtime"].astype(str).str.extract(r"(\d+)", expand=False)

# Strip '$', ',' and 'M' from 'gross' (e.g. "$28.34M" -> "28.34").
df["gross"] = df["gross"].astype(str).str.replace(r"[\$,M]", "", regex=True)

# Convert the cleaned columns to numeric dtypes. errors="coerce" turns anything
# that still cannot be parsed into NaN so it can be dropped below.
numeric_columns = ["release_year", "runtime", "rating", "metascore", "gross"]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Show one sample row after cleaning and conversion (before rows are dropped).
print("Cleaned DataFrame : \n", df.head(1))

# Drop rows where any of the numeric columns is NaN; other columns are not checked.
df = df.dropna(subset=numeric_columns)

# Persist the cleaned frame. The target directory is created first because
# to_csv() raises OSError when the parent directory does not exist.
output_path = Path("../data/processed/imdb_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Final check: display the first 5 cleaned records.
print(df.head(5))

Original DataFrame : 
                       title        director release_year  runtime  genre  \
0  The Shawshank Redemption  Frank Darabont       (1994)  142 min  Drama   

   rating  metascore    gross  
0     9.3         82  $28.34M  
Updated Column Names : 
 Index(['title', 'director', 'release_year', 'runtime', 'genre', 'rating',
       'metascore', 'gross'],
      dtype='object')
Cleaned DataFrame : 
                       title        director  release_year  runtime  genre  \
0  The Shawshank Redemption  Frank Darabont          1994      142  Drama   

   rating  metascore  gross  
0     9.3         82  28.34  
                      title              director  release_year  runtime  \
0  The Shawshank Redemption        Frank Darabont          1994      142   
1             The Godfather  Francis Ford Coppola          1972      175   
2           The Dark Knight     Christopher Nolan          2008      152   
3          Schindler's List      Steven Spielberg          1993     