# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as snb

# Load the ratings dataset and echo it.
ratings = pd.read_csv('ratings.csv')
print(ratings)

# Display the first and last few rows
print(ratings.head())
print(ratings.tail())

# Display dataset information and summary statistics
ratings.info()  # Prints its own report (data types and non-null values)
print(ratings.shape)  # Shape of the dataset as (rows, columns)
print(ratings.describe())  # Statistical summary

# Select the rows that carry the highest rating present in the data.
# The top score is read from the column itself instead of being hard-coded as
# 5.0, so the selection stays correct if the file uses a different scale.
# Because the threshold comes from the same column, the float equality below
# is exact.
top_rating = ratings['rating'].max()
highest_rated_movies = ratings[ratings['rating'] == top_rating]
print(highest_rated_movies)

# Load the movies metadata dataset and print its summary statistics.
movies_metadata = pd.read_csv('movies_metadata.csv')
metadata_summary = movies_metadata.describe()
print(metadata_summary)

# Look up the row whose 'runtime' value is the largest.
longest_runtime_label = movies_metadata['runtime'].idxmax()
highest_runtime_movie = movies_metadata.loc[longest_runtime_label]
print(highest_runtime_movie)

# Sum 'revenue' over the movies sharing each runtime, largest total first.
revenue_by_runtime = movies_metadata.groupby('runtime')['revenue'].sum()
total_revenue_per_runtime = revenue_by_runtime.sort_values(ascending=False)
print(total_revenue_per_runtime)

# For every distinct vote_count, gather the release dates of its movies into
# a list (this groups the rows; it does not sort them).
release_dates_by_votes = movies_metadata.groupby('vote_count')['release_date']
data3 = release_dates_by_votes.apply(list)
print(data3)

# Ten rows with the largest revenue, shown as title/revenue pairs.
revenue_columns = ['title', 'revenue']
top_10_revenue = movies_metadata.nlargest(10, 'revenue')
print(top_10_revenue[revenue_columns])

# Load IMDB dataset and print its summary statistics.
data3 = pd.read_csv('IMDB1000.csv')
imdb_summary = data3.describe()
print(imdb_summary)

# Copy every entry of the 'Genre' column, in row order, into a plain list.
genres_list = list(data3['Genre'])

# Load another IMDB dataset
data4 = pd.read_csv('IMDB-Movie-Data.csv')

# Find top 10 revenue-generating movies.
# The name column is 'Title' -- the same column the plots further down in this
# file read from data4 -- not 'original_title'. nlargest already returns its
# rows in descending order of the ranking column, so no extra sort is needed.
top_10_revenue_movies = data4.nlargest(10, 'Revenue (Millions)')[['Title', 'Revenue (Millions)']]
print(top_10_revenue_movies)

# Extract genres and get unique values.
# Each 'Genre' entry is a comma-separated string; split it and strip the
# surrounding whitespace from every name. Missing entries are dropped first
# because a NaN has no .split() and would raise AttributeError.
genre_set = set()
for genre_entry in data4['Genre'].dropna():
    for genre in genre_entry.split(','):
        genre_set.add(genre.strip())
print(genre_set)

# Total revenue per year (kept under this name for the bar plot that follows),
# then print the largest yearly total. max() does not depend on ordering, so
# the series is not sorted beforehand.
total_revenue_per_year = data4.groupby('Year')['Revenue (Millions)'].sum()
print(total_revenue_per_year.max())

# Bar plot of the summed revenue for each year: the series index supplies the
# x positions and the series values supply the bar heights.
years = total_revenue_per_year.index
yearly_totals = total_revenue_per_year.values

plt.figure(figsize=(12, 6))
snb.barplot(x=years, y=yearly_totals)
plt.title("Total Revenue per Year")
plt.xlabel("Year")
plt.ylabel("Total Revenue (Millions)")
plt.xticks(rotation=45)  # tilt the year labels
plt.show()

# Top 10 movies by revenue and by rating. Both are drawn the same way: the
# ranking metric on the x-axis, 'Title' on the y-axis, and 'Title' again as
# the hue so that every bar gets its own colour.
top_10 = data4.nlargest(10, 'Revenue (Millions)')
top_10_rating = data4.nlargest(10, 'Rating')
ranking_plots = (
    (top_10, 'Revenue (Millions)', "Top 10 Highest Revenue Generating Movies"),
    (top_10_rating, 'Rating', "Top 10 Rated Movies"),
)
for ranked_movies, metric, heading in ranking_plots:
    snb.barplot(x=metric, y='Title', hue='Title', data=ranked_movies)
    plt.title(heading)
    plt.show()

# Count plot for number of movies released per year
snb.countplot(data=data4, x="Year")
plt.title("Number of Movies per Year")
plt.show()

# Load Excel dataset, then print its column labels followed by its summary
# statistics.
data1 = pd.read_excel('data1.xlsx')
for report in (data1.columns, data1.describe()):
    print(report)

# Count plot for number of action movies per year.
# A row counts as an action movie when its 'Genre' text contains "Action"
# (case-insensitive); rows with a missing genre are treated as non-matching.
is_action = data4['Genre'].str.contains("Action", case=False, na=False)
action_movies = data4[is_action]

snb.countplot(data=action_movies, x='Year')
plt.xticks(rotation=45)
plt.title("Number of Action Movies per Year")
plt.show()

# Homework task: Scatter plot vs Density plot for an actor (e.g., Aamir Khan).
# Keep the rows whose 'Actors' text mentions the actor (case-insensitive);
# rows with no actor information are treated as non-matching.
features_actor = data4['Actors'].str.contains("Aamir Khan", case=False, na=False)
aamir_khan_movies = data4[features_actor]

# Scatter plot: revenue of each matching movie against its year.
snb.scatterplot(data=aamir_khan_movies, x='Year', y='Revenue (Millions)')
plt.title("Aamir Khan Movies Revenue Over Years")
plt.show()

# Density plot: distribution of the matching movies over the years.
snb.kdeplot(data=aamir_khan_movies, x='Year')
plt.title("Density Distribution of Aamir Khan Movies")
plt.show()
