In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from textblob import TextBlob

In [2]:
# Load the dataset
df = pd.read_csv('final_merged_with_reviews_data.csv')

In [3]:
print(df.head())
print(df.info())
print(df.describe())

             title                    id  audienceScore  tomatoMeter rating  \
0       love, lies             love_lies           43.0          NaN    NaN   
1       love, lies             love_lies           43.0          NaN    NaN   
2  dinosaur island  dinosaur_island_2015           23.0          NaN    NaN   
3           adrift           adrift_2018           65.0         69.0  PG-13   
4           adrift           adrift_2018           65.0         69.0  PG-13   

                                      ratingContents releaseDateTheaters  \
0                                                NaN                 NaN   
1                                                NaN                 NaN   
2                                                NaN                 NaN   
3  ['Injury Images', 'Brief Drug Use', 'Thematic ...          2018-06-01   
4  ['Injury Images', 'Brief Drug Use', 'Thematic ...          2018-06-01   

  releaseDateStreaming  runtimeMinutes                      genre  .

In [4]:
df.drop(columns=['scoreSentiment'], inplace=True)
df.drop(columns=['boxOffice'], inplace=True)
df.drop(columns=['reviewUrl'], inplace=True)
df.drop(columns=['originalScore'], inplace=True)
df.drop(columns=['soundMix'], inplace=True)

In [5]:
# Drop duplicates
df.drop_duplicates(subset=['title', 'id'], inplace=True)

In [6]:
# Fill missing values in numerical columns with their mean
numerical_columns = ['audienceScore', 'tomatoMeter', 'runtimeMinutes']
for column in numerical_columns:
    df[column].fillna(df[column].mean(), inplace=True)

# Verify that missing values have been filled
missing_values = df.isnull().sum()
print("Missing values after filling:")
print(missing_values)

Missing values after filling:
title                       0
id                          0
audienceScore               0
tomatoMeter                 0
rating                  57845
ratingContents          57845
releaseDateTheaters     44119
releaseDateStreaming    21426
runtimeMinutes              0
genre                    6036
originalLanguage         6970
director                  823
writer                  22419
distributor             48619
reviewId                    0
creationDate                0
criticName                  0
isTopCritic                 0
reviewState                 0
publicatioName              0
reviewText               3772
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)


In [7]:
# Fill missing categorical values with 'Unknown'
df['rating'].fillna('Unknown', inplace=True)
df['ratingContents'].fillna('Unknown', inplace=True)
df['genre'].fillna('Unknown', inplace=True)
df['director'].fillna('Unknown', inplace=True)
df['writer'].fillna('Unknown', inplace=True)
df['distributor'].fillna('Unknown', inplace=True)
df['originalLanguage'].fillna('unknown', inplace=True)
df['reviewText'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ratingContents'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [8]:
# Fill missing values in date columns with a placeholder date or median
df['releaseDateTheaters'].fillna(pd.to_datetime('1900-01-01'), inplace=True)
df['releaseDateStreaming'].fillna(pd.to_datetime('1900-01-01'), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['releaseDateTheaters'].fillna(pd.to_datetime('1900-01-01'), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['releaseDateStreaming'].fillna(pd.to_datetime('1900-01-01'), inplace=True)


In [9]:
# Convert dates to datetime format
df['releaseDateTheaters'] = pd.to_datetime(df['releaseDateTheaters'], errors='coerce')
df['releaseDateStreaming'] = pd.to_datetime(df['releaseDateStreaming'], errors='coerce')
df['creationDate'] = pd.to_datetime(df['creationDate'], errors='coerce')

In [10]:
# Strip whitespace and convert text to lowercase for text columns
text_columns = ['title', 'id', 'genre', 'originalLanguage', 'director', 'writer', 'publicatioName', 'reviewText']
for col in text_columns:
    df[col] = df[col].str.lower().str.strip()

In [11]:
# Extract year from releaseDateTheaters
df['releaseYear'] = pd.to_datetime(df['releaseDateTheaters']).dt.year

In [12]:
# Define a function to compute sentiment polarity
def get_sentiment(review):
    try:
        # Compute sentiment polarity using TextBlob
        blob = TextBlob(review)
        return blob.sentiment.polarity
    except:
        return None

# Apply the sentiment analysis function to the reviewText column
df['sentiment_score'] = df['reviewText'].apply(get_sentiment)

In [13]:
# Check final dataframe information
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 69077 entries, 0 to 2316757
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   title                 69077 non-null  object        
 1   id                    69077 non-null  object        
 2   audienceScore         69077 non-null  float64       
 3   tomatoMeter           69077 non-null  float64       
 4   rating                69077 non-null  object        
 5   ratingContents        69077 non-null  object        
 6   releaseDateTheaters   69077 non-null  datetime64[ns]
 7   releaseDateStreaming  69077 non-null  datetime64[ns]
 8   runtimeMinutes        69077 non-null  float64       
 9   genre                 69077 non-null  object        
 10  originalLanguage      69077 non-null  object        
 11  director              69077 non-null  object        
 12  writer                69077 non-null  object        
 13  distributor        

In [14]:
# Save the DataFrame to a CSV file
df.to_csv('cleaned_data.csv', index=False)

print("DataFrame saved to 'cleaned_data.csv'.")

DataFrame saved to 'cleaned_data.csv'.
