In [1]:
import re
from pprint import pprint, pformat

import pandas as pd
import plotly.express as px
from google.colab import drive
# Mount Google Drive under /content/modules; force_remount=True is passed so
# the mount is redone even when the cell is executed again.
drive.mount(
    '/content/modules',
    force_remount=True,
)

Mounted at /content/modules


In [None]:
# Load the IMDB dataset from the mounted Drive folder.
db = pd.read_csv("/content/modules/My Drive/IMDB1.csv")

# --- Preliminary checkup -------------------------------------------------
pprint(['shape', db.shape])
pprint(['columns', db.columns])

# Shorten the revenue column name for convenience. rename() skips labels
# that are not present by default, so no membership check is required.
db = db.rename(columns={"Revenue(Millions)": "Revenue"})

pprint(['dtypes', db.dtypes])
db.info()

# Text columns: number of distinct values and the first three non-null ones.
categorical_columns = ["Title", "Genre", "Description", "Director", "Actors"]
for col in categorical_columns:
    distinct_values = db[col].dropna().unique()
    pprint([col, db[col].nunique(), distinct_values[:3]])

# Numeric columns: summary statistics.
numeric_columns = ["Year", "Runtime", "Votes", "Rating", "Revenue", "Metascore"]
pprint(db[numeric_columns].describe())

# Missing values per column.
pprint(db.isna().sum())

In [None]:
# Convert comma-separated strings into Python lists.
# The pattern captures each item without its surrounding spaces; an item ends
# at a comma or at the end of the string.
re_name = r'[ ]*([^,]+?)[ ]*(?:,|$)'
_name_finder = re.compile(re_name)


def _split_names(value):
    """Return the names found in a comma-separated string.

    Anything that is not a string (missing values, or a list produced by an
    earlier run of this cell) is returned unchanged. This keeps the cell safe
    to re-run: `.str.findall` would turn already-converted lists into NaN.
    """
    return _name_finder.findall(value) if isinstance(value, str) else value


for list_col in ("Genre", "Actors", "Director"):
    db[list_col] = db[list_col].map(_split_names)
db.head()

In [None]:
# Explode each list-valued column so that every (title, item) pair gets its own row.
genres = db.loc[:, ['Title', 'Genre']].explode('Genre')
actors = db.loc[:, ['Title', 'Actors']].explode('Actors')
directors = db.loc[:, ['Title', 'Director']].explode('Director')

# Frequency of each genre / actor / director across the exploded rows.
genre_counts = genres['Genre'].value_counts()
actor_counts = actors['Actors'].value_counts()
director_counts = directors['Director'].value_counts()

pprint(genre_counts)
pprint(actor_counts)
pprint(director_counts)

In [None]:
# Per-genre summary. Genre is exploded first, so a row exists for every
# (movie, genre) pair (Genre is expected to hold lists at this point).
genre_columns = ['Title', 'Genre', 'Rating', 'Revenue', 'Votes', 'Runtime', 'Year']
genres = db[genre_columns].explode('Genre')

# Output column name -> (source column, aggregation).
genre_aggregations = {
    'num_movies': ('Title', 'nunique'),
    'avg_rating': ('Rating', 'mean'),
    'total_revenue': ('Revenue', 'sum'),
    'avg_runtime': ('Runtime', 'mean'),
    'total_votes': ('Votes', 'sum'),
    'earliest_year': ('Year', 'min'),
    'latest_year': ('Year', 'max'),
}
genres_agg = genres.groupby('Genre').agg(**genre_aggregations).reset_index()

# Most frequent genres first.
pprint(genres_agg.sort_values('num_movies', ascending=False))

In [None]:
# Average movie rating per Director by Year.
directors = db[['Title', 'Director', 'Year', 'Rating', 'Revenue', 'Votes', 'Runtime']].explode('Director')
director_year_rating = pd.pivot_table(
    directors,
    index='Director',
    columns='Year',
    values='Rating',
    aggfunc='mean',
    # Deliberately no fill_value: a (director, year) pair without any rated
    # movie stays NaN. Filling with 0 would read as "average rating 0.0" and
    # would distort any later statistic computed from this table.
)
pprint(director_year_rating.head(10))

In [None]:
# Per-director summary and a bar chart of the 20 directors with the most movies.
# plotly.express (px) comes from the notebook's top import cell; it is not
# re-imported here.
directors = db[['Title', 'Director', 'Year', 'Rating', 'Revenue', 'Votes', 'Runtime']].explode('Director')
director_summary = directors.groupby('Director').agg(
    num_movies=('Title', 'nunique'),
    avg_rating=('Rating', 'mean'),
    total_revenue=('Revenue', 'sum'),
    avg_votes=('Votes', 'mean'),
    avg_runtime=('Runtime', 'mean')
).reset_index()

# Keep the 20 rows with the highest movie count.
top_directors = director_summary.nlargest(20, 'num_movies')

fig = px.bar(
    top_directors,
    x='Director',
    y='num_movies',
    title='Top 20 Directors by Number of Movies',
    labels={'num_movies': 'Number of Movies', 'Director': 'Director'},
    color='num_movies',
    color_continuous_scale='blues'
)
fig.update_layout(
    xaxis_title="Director",
    yaxis_title="Number of Movies",
    xaxis_tickangle=-45  # slant the director names on the x axis
)
fig.show()

In [None]:
# Explode both list columns so there is one row per (title, actor, genre) combination.
actors = (
    db[['Title', 'Actors', 'Genre']]
    .explode('Actors')
    .explode('Genre')
)

# Actor x genre table of title counts; combinations that never occur are filled with 0.
actor_genre_pivot = actors.pivot_table(
    index='Actors',
    columns='Genre',
    values='Title',
    aggfunc='count',
    fill_value=0,
)
pprint(actor_genre_pivot.head(10))

In [None]:
# Movies rated 8 or higher, tagged with the decade their year falls in
# (year floor-divided by 10, times 10).
high_rating = (
    db.loc[db['Rating'].ge(8)]
    .assign(Decade=lambda frame: frame['Year'].floordiv(10).mul(10))
)

# Output column name -> (source column, aggregation).
decade_aggregations = {
    'num_movies': ('Title', 'count'),
    'avg_rating': ('Rating', 'mean'),
    'total_revenue': ('Revenue', 'sum'),
    'avg_votes': ('Votes', 'mean'),
}
decade_stats = high_rating.groupby('Decade').agg(**decade_aggregations).reset_index()

# Highest average rating first.
decade_stats_sorted = decade_stats.sort_values('avg_rating', ascending=False)
pprint(decade_stats_sorted)

In [None]:
# Reshape to long format: one row per (Title, Year, Metric) holding that metric's Value.
numeric_metrics = ["Rating", "Revenue", "Metascore"]
melted = db.melt(
    id_vars=["Title", "Year"],
    value_vars=numeric_metrics,
    var_name="Metric",
    value_name="Value",
)
pprint(melted.head(10))
print()

# Mean and number of non-null values for every metric.
metric_aggregations = {
    "mean_value": ("Value", "mean"),
    "count_value": ("Value", "count"),
}
metric_summary = melted.groupby("Metric").agg(**metric_aggregations).reset_index()
pprint(metric_summary)

In [None]:
# Average of each metric per year, drawn as one line per metric.
# plotly.express (px) comes from the notebook's top import cell; it is not
# re-imported here. `melted` is the long-format frame built in the previous cell.
yearly_metrics = melted.groupby(['Year', 'Metric']).agg(
    avg_value=('Value', 'mean')
).reset_index()

# NOTE(review): all metrics share a single y axis; if their value ranges differ
# a lot, the smaller-valued lines will look flat. Consider facets - TODO confirm.
fig = px.line(
    yearly_metrics,
    x='Year',
    y='avg_value',
    color='Metric',
    markers=True,
    title='Yearly Averages of Movie Metrics'
)
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Average Value',
    hovermode='x unified'  # one hover box listing every metric at the hovered year
)
fig.show()