1. Importing Libraries and Dataset

In [1]:
# Importing Libraries
import pandas as pd

In [2]:
# Read the critic reviews CSV into the working DataFrame `df`
file_path = "critic_reviews.csv"  # Update the path if necessary
df = pd.read_csv(filepath_or_buffer=file_path)

2. Inspecting the Dataset

In [3]:
# First look at the loaded data. Results are printed explicitly because a cell
# only auto-displays its last expression.
print(df.head())          # View the first few rows
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
df.info()                 # Check column data types and non-null values
print(df.describe())      # Summary statistics for numerical columns
print(df.isnull().sum())  # Count of missing values per column

   reviewId creationDate      criticName            criticPageUrl reviewState  \
0   1912176   1966-07-30  Jennie Kermode  /critics/jennie-kermode       fresh   
1   1895073   1998-08-21      Scott Nash      /critics/scott-nash       fresh   
2     25507   2000-01-01     Roger Ebert     /critics/roger-ebert       fresh   
3     25508   2000-01-01             NaN                      NaN      rotten   
4     25509   2000-01-01       Joe Brown       /critics/joe-brown       fresh   

   isFresh  isRotten isRtUrl  isTopCritic        publicationUrl  \
0     True     False   False        False  /critics/source/1869   
1     True     False   False        False  /critics/source/1465   
2     True     False   False         True    /critics/source/67   
3    False      True   False         True   /critics/source/148   
4     True     False   False         True   /critics/source/474   

     publicationName                                          reviewUrl  \
0       Eye for Film    http://www.

3. Data Cleaning

In [4]:
# Handling missing values
# Critic identity fields: replace missing entries with the text placeholder "N/A".
# NOTE(review): "N/A" is one of the strings pandas.read_csv parses as NaN by
# default, so this placeholder will read back as missing after a CSV round-trip
# unless the file is loaded with keep_default_na=False.
for critic_column in ('criticName', 'criticPageUrl'):
    df[critic_column] = df[critic_column].fillna("N/A")

# Using -1 as a placeholder for missing originalScore values
df['originalScore'] = df['originalScore'].fillna(value=-1)

# Dropping irrelevant columns with missing values; errors='ignore' skips any of
# these columns that are not present in the frame
df = df.drop(
    columns=['isRtUrl', 'reviewUrl', 'quote'],
    errors='ignore',
)

# Sanity check: print the remaining count of missing values per column
print(df.isna().sum())

reviewId           0
creationDate       0
criticName         0
criticPageUrl      0
reviewState        0
isFresh            0
isRotten           0
isTopCritic        0
publicationUrl     0
publicationName    0
scoreSentiment     0
originalScore      0
movieId            0
dtype: int64


In [5]:
# Checking for duplicates: count rows whose reviewId repeats an earlier row's reviewId
print(df['reviewId'].duplicated().sum())

0


In [6]:
# Standardizing columns
# Lowercase the critic and publication names for uniformity. This lowercases
# every string in these columns, including any text placeholder values.
for name_column in ('criticName', 'publicationName'):
    df[name_column] = df[name_column].str.lower()

# Parse creationDate, then write it back as MM-DD-YYYY text. Because of the
# strftime step the column ends up holding strings, not datetime values.
# NOTE(review): if datetime operations are needed later, consider dropping the
# strftime step -- confirm nothing downstream relies on the MM-DD-YYYY strings.
df['creationDate'] = pd.to_datetime(df['creationDate']).dt.strftime('%m-%d-%Y')

print(df['creationDate'])

0        07-30-1966
1        08-21-1998
2        01-01-2000
3        01-01-2000
4        01-01-2000
            ...    
27137    06-14-2024
27138    06-15-2024
27139    06-17-2024
27140    06-24-2024
27141    06-24-2024
Name: creationDate, Length: 27142, dtype: object


In [7]:
# Exporting the cleaned dataset to CSV; index=False leaves the row index out of the file.
# NOTE(review): the output name has a dot ("cleaned_critic.reviews.csv") while the
# input file is "critic_reviews.csv" -- confirm the name is intended before changing it.
df.to_csv(path_or_buf="cleaned_critic.reviews.csv", index=False)