In [None]:
# Preview the distinct values of every text (object-dtype) column, e.g. genres or title type
text_columns = df.select_dtypes(include='object').columns
for col in text_columns:
    first_uniques = df[col].unique()[:10]  # only the first 10 distinct values are shown
    # One print call: header line, the values, then a blank separator line
    print(f'Column: {col}', first_uniques, '', sep='\n')

In [None]:
# Show summary statistics for numeric columns.
# Relies on `df`, the DataFrame this notebook reads from 'imdb rating.csv' with pd.read_csv.
# Left as the cell's last expression so the resulting table is displayed.
df.describe()

In [None]:
# Show columns and data types.
# Relies on `df`, the DataFrame this notebook reads from 'imdb rating.csv' with pd.read_csv.
# df.info() prints its report itself, so no print() wrapper is needed.
df.info()

In [None]:
import pandas as pd

# Load the CSV file into `df`, the DataFrame the other cells in this notebook operate on.
# The path is relative, so 'imdb rating.csv' is resolved against the current working directory.
df = pd.read_csv('imdb rating.csv')
# Last expression of the cell: displays the first rows as a quick sanity check.
df.head()

# IMDb Ratings CSV Data Exploration

This notebook explores the structure and contents of the `imdb rating.csv` file. It will help you understand the columns, data types, and sample data, so you can use this CSV as the main data source for your analysis and backend logic.

# IMDb Ratings Data Exploration

This notebook explores the IMDb ratings CSV data (`imdb rating.csv`) to understand its structure and identify analysis opportunities.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# NOTE(review): 'ignore' with no category/module filter silences every warning raised after
# this point, including library deprecation notices — confirm that this is intended.
warnings.filterwarnings('ignore')

# Set up plotting style: matplotlib's 'default' style sheet plus seaborn's 'husl' palette
plt.style.use('default')
sns.set_palette('husl')

## Load and Examine Data

In [None]:
# Read the ratings export into the working DataFrame
df = pd.read_csv('imdb rating.csv')

# Report the frame's dimensions and per-column dtypes in a single print call
# (each argument lands on its own line, matching one print per item)
print(
    f"Dataset shape: {df.shape}",
    "\nColumn names and types:",
    df.dtypes,
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell: renders the first rows under the heading printed above
df.head()

In [None]:
# Check for missing values: number of null entries in each column
print("Missing values per column:")
print(df.isnull().sum())

print("\nData types:")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() appended a stray "None" line to the output. Call it directly instead.
df.info()

## Data Cleaning and Preprocessing

In [None]:
# Parse the two date columns. 'Date Rated' is parsed without errors='coerce', so a malformed
# value raises; unparseable 'Release Date' entries are coerced to NaT instead.
df['Date Rated'] = pd.to_datetime(df['Date Rated'])
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')

# Coerce every numeric field in one pass; values that cannot be parsed become NaN
numeric_columns = ['Runtime (mins)', 'Your Rating', 'IMDb Rating', 'Num Votes', 'Year']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Keep each title's genres as a list (split on ", ") for the per-genre analysis
df['Genres_List'] = df['Genres'].str.split(', ')

print(f"After cleaning, shape: {df.shape}")
df.head()

## Basic Statistics and Analysis

In [None]:
# Descriptive statistics for both rating columns, the span of release years, and runtimes.
# Each print call emits a heading followed by its value on the next line.
print("Your Rating Statistics:", df['Your Rating'].describe(), sep="\n")

print("\nIMDb Rating Statistics:", df['IMDb Rating'].describe(), sep="\n")

earliest_year = df['Year'].min()
latest_year = df['Year'].max()
print("\nYear range:", f"From {earliest_year} to {latest_year}", sep="\n")

print("\nRuntime statistics:", df['Runtime (mins)'].describe(), sep="\n")

In [None]:
# Overview figure: a 2x2 grid with both rating histograms, titles per decade, and runtimes
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
hist_style = {'alpha': 0.7, 'edgecolor': 'black'}  # shared look for the three histograms

# Top-left: personal ratings, with integer bin edges 1 through 11
ax = axes[0, 0]
ax.hist(df['Your Rating'].dropna(), bins=range(1, 12), **hist_style)
ax.set(title='Your Rating Distribution', xlabel='Rating', ylabel='Count')

# Top-right: IMDb ratings
ax = axes[0, 1]
ax.hist(df['IMDb Rating'].dropna(), bins=30, **hist_style)
ax.set(title='IMDb Rating Distribution', xlabel='Rating', ylabel='Count')

# Bottom-left: number of titles per decade (year floored to a multiple of 10)
decade_start = (df['Year'] // 10) * 10
decade_counts = df.groupby(decade_start).size()
ax = axes[1, 0]
ax.bar(decade_counts.index, decade_counts.values, width=8)
ax.set(title='Movies by Decade', xlabel='Decade', ylabel='Count')

# Bottom-right: runtimes
ax = axes[1, 1]
ax.hist(df['Runtime (mins)'].dropna(), bins=30, **hist_style)
ax.set(title='Runtime Distribution', xlabel='Runtime (minutes)', ylabel='Count')

plt.tight_layout()
plt.show()

## Genre Analysis

In [None]:
# Flatten the per-title genre lists into one list of genre labels (rows without genres skipped)
all_genres = [
    genre_name
    for genre_names in df['Genres_List'].dropna()
    for genre_name in genre_names
]

# Tally how often each genre occurs, most frequent first
genre_counts = pd.Series(all_genres).value_counts()

print(f"Total unique genres: {len(genre_counts)}")
print("\nTop 10 most common genres:", genre_counts.head(10), sep="\n")

# Bar chart of the 15 most frequent genres
plt.figure(figsize=(12, 6))
genre_counts.head(15).plot.bar()
plt.title('Most Common Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Genre rating analysis: mean 'Your Rating' for each of the 10 most frequent genres.
#
# Genres are matched exactly against the split genre list. A substring/regex test on the raw
# 'Genres' string (str.contains) would also count e.g. "Musical" titles under "Music" and
# would interpret any regex metacharacters in a genre name.
genre_lists = df['Genres'].str.split(', ')
genre_ratings = {}
for genre in genre_counts.head(10).index:
    # Rows with a missing 'Genres' value are not lists and therefore never match
    mask = genre_lists.apply(
        lambda names, wanted=genre: isinstance(names, list) and wanted in names
    )
    genre_ratings[genre] = df.loc[mask, 'Your Rating'].mean()

# Tabulate the results, best-rated genre first
genre_rating_df = pd.DataFrame(list(genre_ratings.items()), columns=['Genre', 'Avg_Rating'])
genre_rating_df = genre_rating_df.sort_values('Avg_Rating', ascending=False)

plt.figure(figsize=(10, 6))
plt.bar(genre_rating_df['Genre'], genre_rating_df['Avg_Rating'])
plt.title('Average Your Rating by Genre (Top 10 Genres)')
plt.xlabel('Genre')
plt.ylabel('Average Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("Average ratings by genre:")
print(genre_rating_df)

## Temporal Analysis

In [None]:
# Derive the calendar year and the monthly period in which each rating was made
rated_on = df['Date Rated'].dt
df['Rating_Date_Year'] = rated_on.year
df['Rating_Date_Month'] = rated_on.to_period('M')

# Number of ratings made in each month
monthly_activity = df.groupby('Rating_Date_Month').size()

plt.figure(figsize=(15, 6))
monthly_activity.plot.line(marker='o')
plt.title('Rating Activity Over Time')
plt.xlabel('Month')
plt.ylabel('Number of Ratings')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Mean rating and number of rated titles per release decade,
# keeping only decades with at least 5 rated titles
df['Decade'] = df['Year'].floordiv(10).mul(10)
decade_ratings = (
    df.groupby('Decade')['Your Rating']
    .agg(['mean', 'count'])
    .reset_index()
    .loc[lambda stats: stats['count'] >= 5]
)

# Two side-by-side bar charts sharing the decade axis: (column, title, y-label) per panel
plt.figure(figsize=(12, 6))
decade_panels = [
    ('mean', 'Average Rating by Decade', 'Average Rating'),
    ('count', 'Number of Movies by Decade', 'Count'),
]
for panel_number, (stat_column, panel_title, y_label) in enumerate(decade_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.bar(decade_ratings['Decade'], decade_ratings[stat_column])
    plt.title(panel_title)
    plt.xlabel('Decade')
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()

print("Decade analysis:")
print(decade_ratings)

## Runtime Analysis

In [None]:
# Runtime vs Rating correlation
plt.figure(figsize=(12, 5))

# Left panel: raw scatter of runtime against personal rating
plt.subplot(1, 2, 1)
plt.scatter(df['Runtime (mins)'], df['Your Rating'], alpha=0.6)
plt.xlabel('Runtime (minutes)')
plt.ylabel('Your Rating')
plt.title('Runtime vs Your Rating')

# Runtime categories
# The last bin is open-ended: with a hard upper edge of 300, pd.cut assigns NaN to any
# runtime above 300 minutes, silently dropping those titles from "Very Long (>150)".
# Bins are right-inclusive (pd.cut default), so exactly 90 minutes lands in the first bucket.
df['Runtime_Category'] = pd.cut(df['Runtime (mins)'],
                               bins=[0, 90, 120, 150, float('inf')],
                               labels=['Short (<90)', 'Medium (90-120)', 'Long (120-150)', 'Very Long (>150)'])

# observed=False keeps all four categories in the table (empty ones included), independent
# of the pandas version's default for grouping on a categorical column.
runtime_ratings = df.groupby('Runtime_Category', observed=False)['Your Rating'].agg(['mean', 'count']).reset_index()

# Right panel: mean rating per runtime bucket
plt.subplot(1, 2, 2)
plt.bar(range(len(runtime_ratings)), runtime_ratings['mean'])
plt.xticks(range(len(runtime_ratings)), runtime_ratings['Runtime_Category'], rotation=45)
plt.title('Average Rating by Runtime Category')
plt.ylabel('Average Rating')

plt.tight_layout()
plt.show()

print("Runtime analysis:")
print(runtime_ratings)

# Off-diagonal entry of the 2x2 correlation matrix = correlation between the two columns
correlation = df[['Runtime (mins)', 'Your Rating']].corr().iloc[0, 1]
print(f"\nCorrelation between runtime and your rating: {correlation:.3f}")

## Director Analysis

In [None]:
# Per-'Directors' value: mean personal rating, number of rated titles, and mean IMDb rating,
# all rounded to two decimals
director_stats = (
    df.groupby('Directors')
    .agg(
        Avg_Your_Rating=('Your Rating', 'mean'),
        Movie_Count=('Your Rating', 'count'),
        Avg_IMDb_Rating=('IMDb Rating', 'mean'),
    )
    .round(2)
    .reset_index()
)

# Keep directors with at least 2 rated titles, best average first
frequent_directors = (
    director_stats
    .query('Movie_Count >= 2')
    .sort_values('Avg_Your_Rating', ascending=False)
)

print("Directors with multiple movies (sorted by your average rating):")
print(frequent_directors.head(10))

# Horizontal bar chart of the top 10, drawn only when someone qualifies
if not frequent_directors.empty:
    plt.figure(figsize=(12, 6))
    top_directors = frequent_directors.head(10)
    bar_positions = range(len(top_directors))
    plt.barh(bar_positions, top_directors['Avg_Your_Rating'])
    plt.yticks(bar_positions, top_directors['Directors'])
    plt.xlabel('Average Rating')
    plt.title('Top Directors by Average Rating (2+ movies)')
    plt.tight_layout()
    plt.show()

## Summary Statistics

In [None]:
# Summary statistics
# Relies on results computed by earlier cells: genre_counts, genre_rating_df,
# frequent_directors, and the 'Rating_Date_Year' column of df.
print("=== IMDB RATINGS ANALYSIS SUMMARY ===")
print(f"\nTotal movies rated: {len(df)}")
print(f"Rating period: {df['Date Rated'].min().strftime('%Y-%m-%d')} to {df['Date Rated'].max().strftime('%Y-%m-%d')}")
print(f"Movie years: {int(df['Year'].min())} to {int(df['Year'].max())}")

print(f"\nYour average rating: {df['Your Rating'].mean():.2f}")
# mode() can return several values on a tie; the first one is reported
print(f"Most common rating: {df['Your Rating'].mode().iloc[0]}")
print(f"Rating standard deviation: {df['Your Rating'].std():.2f}")

# genre_counts is sorted by frequency and genre_rating_df by average rating, so row 0 is the top
print(f"\nFavorite genre: {genre_counts.index[0]}")
print(f"Highest rated genre: {genre_rating_df.iloc[0]['Genre']} ({genre_rating_df.iloc[0]['Avg_Rating']:.2f})")

print(f"\nAverage runtime preference: {df['Runtime (mins)'].mean():.0f} minutes")
print(f"Most productive rating year: {df['Rating_Date_Year'].value_counts().index[0]}")

if len(frequent_directors) > 0:
    print(f"\nFavorite director: {frequent_directors.iloc[0]['Directors']} ({frequent_directors.iloc[0]['Avg_Your_Rating']:.1f} avg rating)")

# Comparison with IMDB ratings
rating_diff = df['Your Rating'] - df['IMDb Rating']
mean_diff = rating_diff.mean()
print(f"\nRating vs IMDb: {mean_diff:.2f} difference on average")
# A bare if/else reported "lower" whenever the mean was not strictly positive, i.e. also for
# an exact tie and for NaN (no title has both ratings). Handle those cases explicitly.
if pd.isna(mean_diff):
    print("No titles have both ratings, so no comparison with IMDb is possible")
elif mean_diff > 0:
    print("You tend to rate movies higher than IMDb average")
elif mean_diff < 0:
    print("You tend to rate movies lower than IMDb average")
else:
    print("On average your ratings match the IMDb average")