In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import other required libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline

In [None]:
# Read the IMDB movies CSV from the Kaggle input directory and preview the first rows.
movies_csv = "/kaggle/input/imdb-movies-dataset/imdb_movies.csv"
df = pd.read_csv(movies_csv)
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the loaded frame.
df.describe()

In [None]:
# Column names, non-null counts and dtypes: a first look at where values are missing.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Treat the placeholder strings "?" and " " as missing: convert both to NaN in place.
for placeholder in ("?", " "):
    df.replace(placeholder, np.nan, inplace=True)

In [None]:
# Number of missing (NaN) cells in each column.
df.isnull().sum()

In [None]:
# Boolean frame shaped like df: True wherever a cell is missing.
missing_data = df.isna()
missing_data.head()

In [None]:
# For every column, print how many cells are missing (True) and present (False).
for column, is_missing in missing_data.items():
    print(column)
    print(is_missing.value_counts())
    print()

### 3. Deal with missing data

How to deal with missing data?

1. Drop data
   - a. Drop the whole row
   - b. Drop the whole column

2. Replace data
   - a. Replace it by the mean
   - b. Replace it by the most frequent value
   - c. Replace it based on other functions


In [None]:
# The "crew" column is not needed for any analysis in this notebook, so it is dropped.
# errors="ignore" keeps the cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns=["crew"], errors="ignore")


In [None]:
# "genre" has some missing values, which will be filled with its most frequent value.
# First list how often each genre value occurs.
df["genre"].value_counts()

In [None]:
# `.idxmax()` on the value counts returns the label with the highest count,
# i.e. the most common genre.
genre_frequencies = df["genre"].value_counts()
genre_frequencies.idxmax()

In [None]:
# Fill missing genres with the most frequent genre.
# The result is assigned back to the column instead of calling
# `df.genre.replace(..., inplace=True)`: that form updates an intermediate Series,
# which pandas warns about and which leaves `df` untouched once Copy-on-Write is active.
# The fill value is taken from the data rather than hard-coded, so it stays correct
# if the dataset changes.
most_common_genre = df["genre"].value_counts().idxmax()
df["genre"] = df["genre"].fillna(most_common_genre)

In [None]:
# Re-check column list, non-null counts and dtypes after the cleaning steps.
df.info()

In [None]:
# Pairwise correlation between score, revenue and budget, shown as an annotated heatmap.
numeric_features = ["score", "revenue", "budget_x"]
corr = df[numeric_features].corr()
heatmap_ax = sns.heatmap(corr, annot=True)
heatmap_ax.set_title('Correlation Matrix Heatmap')

In [None]:
# Rank movies by revenue and keep the ten highest, with revenue expressed in USD million.
data = df.loc[:, ["names", "revenue"]]
data_sorted = (
    data.sort_values(by="revenue", ascending=False)
        .assign(revenue=lambda frame: frame["revenue"] / 1000000)
        .set_index("names")
)

# Display floats as whole numbers with thousands separators (notebook-wide option).
pd.set_option("display.float_format", "{:,.0f}".format)

ranking_rev = data_sorted.head(10)
ranking_rev



**We’ll use this slice of the data frame to create our chart.**

In [None]:
# Inputs for the bar chart drawn in the next cell.
# Bar labels: the movie names, which are the index of ranking_rev.
index=ranking_rev.index
# Bar lengths: revenue, already divided by 1,000,000 when ranking_rev was built.
values=ranking_rev["revenue"]
plot_title="Top 10 movies by revenue,usd million"
title_size=18
subtitle='Source: Kaggle / IMDB Movies'
x_label="revenue,usd million"
# NOTE(review): file_name is not referenced by any cell visible here; presumably meant for saving the figure.
file_name="barh-plot"


In [None]:
# Horizontal bar chart for the ranking configured in the previous cell
# (reads `index`, `values`, `plot_title`, `title_size`, `subtitle`, `x_label`).
# Everything is drawn through `fig`/`ax` so the cell does not depend on pyplot's
# "current axes" state.
fig, ax = plt.subplots(figsize=(10, 6), facecolor=(.94, .94, .94))

# Create the bars in a single fixed colour. barh draws the first entry at the
# bottom, so the y-axis is flipped to read the ranking top-down (largest first),
# in the same order as the table.
bars = ax.barh(index, values, color='darkseagreen')
ax.invert_yaxis()

# Fit the margins to the tick labels; top and bottom are pinned again below.
fig.tight_layout()
ax.xaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

# Title: font size from the config cell, shifted left of centre
title = ax.set_title(plot_title, pad=20, fontsize=title_size)
title.set_position([.33, 1])
fig.subplots_adjust(top=0.9, bottom=0.1)

# Write each bar's value next to the end of the bar
for rect in bars:
    # Anchor point: the end of the bar, vertically centred on it
    x_value = rect.get_width()
    y_value = rect.get_y() + rect.get_height() / 2

    # Offset in points from the bar end; negative pulls the label inside the bar
    space = -30
    # Horizontal alignment for positive values
    ha = 'left'

    # Mirror the offset and the alignment for negative bars
    if x_value < 0:
        space *= -1
        ha = 'right'

    ax.annotate(
        '{:,.0f}'.format(x_value),  # Bar value with thousands separators
        (x_value, y_value),         # Anchor at the bar end
        xytext=(space, 0),          # Shift horizontally by `space`
        textcoords='offset points', # Interpret `xytext` as an offset in points
        va='center',                # Vertically centre the label on the bar
        ha=ha,                      # Alignment depends on the sign of the value
        color='black')              # Label colour

# Subtitle: x is in data units, y in axes fraction (1 = top edge of the axes)
tfrom = ax.get_xaxis_transform()
ann = ax.annotate(subtitle, xy=(5, 1), xycoords=tfrom, bbox=dict(boxstyle='square,pad=1.3', fc='#f0f0f0', ec='none'))

# Set x-label
ax.set_xlabel(x_label, color='black')

plt.show()


In [None]:
# Total revenue per country; keep the five largest and express them in USD million.
revenue_by_country = df[["country", "revenue"]].groupby("country").sum()
Top_5_countries = revenue_by_country.nlargest(5, "revenue").div(1000000)

# Display floats as whole numbers with thousands separators (notebook-wide option).
pd.set_option("display.float_format", '{:,.0f}'.format)


In [None]:
# Inputs for the bar chart drawn in the next cell.
# Bar labels: the country values, which are the index of Top_5_countries.
index=Top_5_countries.index
# Bar lengths: summed revenue per country, already divided by 1,000,000.
values=Top_5_countries["revenue"]
plot_title= "Top 5 countries by revenue,usd million"
title_size=18
subtitle = 'Source: Kaggle / IMDB Movies'
x_label = 'Revenue, USD million'


In [None]:
# Horizontal bar chart for the ranking configured in the previous cell
# (reads `index`, `values`, `plot_title`, `title_size`, `subtitle`, `x_label`).
# Everything is drawn through `fig`/`ax` so the cell does not depend on pyplot's
# "current axes" state.
fig, ax = plt.subplots(figsize=(10, 6), facecolor=(.94, .94, .94))

# Create the bars in a single fixed colour. barh draws the first entry at the
# bottom, so the y-axis is flipped to read the ranking top-down (largest first).
bars = ax.barh(index, values, color='darkseagreen')
ax.invert_yaxis()

# Fit the margins to the tick labels; top and bottom are pinned again below.
fig.tight_layout()
ax.xaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

# Title: font size from the config cell, shifted left of centre
title = ax.set_title(plot_title, pad=20, fontsize=title_size)
title.set_position([.33, 1])
fig.subplots_adjust(top=0.9, bottom=0.1)

# Write each bar's value next to the end of the bar
for rect in bars:
    # Anchor point: the end of the bar, vertically centred on it
    x_value = rect.get_width()
    y_value = rect.get_y() + rect.get_height() / 2

    # Offset in points from the bar end; negative pulls the label inside the bar
    space = -30
    # Horizontal alignment for positive values
    ha = 'left'

    # Mirror the offset and the alignment for negative bars
    if x_value < 0:
        space *= -1
        ha = 'right'

    ax.annotate(
        '{:,.0f}'.format(x_value),  # Bar value with thousands separators
        (x_value, y_value),         # Anchor at the bar end
        xytext=(space, 0),          # Shift horizontally by `space`
        textcoords='offset points', # Interpret `xytext` as an offset in points
        va='center',                # Vertically centre the label on the bar
        ha=ha,                      # Alignment depends on the sign of the value
        color='black')              # Label colour

# Subtitle: x is in data units, y in axes fraction (1 = top edge of the axes)
tfrom = ax.get_xaxis_transform()
ann = ax.annotate(subtitle, xy=(5, 1), xycoords=tfrom, bbox=dict(boxstyle='square,pad=1.3', fc='#f0f0f0', ec='none'))

# Set x-label
ax.set_xlabel(x_label, color='black')

plt.show()

In [None]:
# Compare budget against revenue for the ten highest-revenue movies.
# Start from the ten rows with the largest revenue, keyed by movie name.
top_10_movies = (
    df.sort_values('revenue', ascending=False)
      .head(10)
      .set_index('names')
)
top_10_movies

In [None]:
# Keep only the budget and revenue columns, then drop rows whose
# (budget_x, revenue) pair repeats an earlier row.
top_10_movies = top_10_movies.loc[:, ['budget_x', 'revenue']].drop_duplicates()
top_10_movies

In [None]:
# Grouped horizontal bar chart: budget vs revenue for each of the top movies.
# The x-axis is labelled in millions, so the dollar amounts are divided by
# 1,000,000 before plotting; previously the raw values were drawn under that label.
ax = (top_10_movies / 1000000).plot.barh(width=0.8, color=['darkseagreen', 'deepskyblue'])

# Columns are plotted in frame order; give them readable legend names.
ax.legend(['Budget', 'Revenue'])
ax.set_xlabel("Amount in Millions")
ax.set_ylabel("Top 10 Movies")
plt.show()

In [None]:
# Top 5 original languages ranked by the total revenue of their movies.
rev_by_lang = df[['orig_lang', 'revenue']].groupby('orig_lang').sum()
top_5_lang = rev_by_lang.nlargest(5, 'revenue')

# Horizontal bar chart with revenue converted to billions
fig, ax = plt.subplots(figsize=(15, 6))
ax.barh(top_5_lang.index, top_5_lang['revenue'].div(1000000000), color='darkseagreen')
ax.set_title('Top 5 Languages of the Movies Generating Maximum Revenue', fontsize=20)

# Axis labels
ax.set_ylabel('Language', fontsize=15)
ax.set_xlabel('Revenue (in billions)', fontsize=15)

plt.show()

In [None]:

# Most common genre values: value_counts sorts by frequency, so the first ten
# entries are the ten most frequent.
genre_count = df['genre'].value_counts().iloc[:10]
genre_count

In [None]:
# Pie chart of the ten most common genre values.
# Percentages are shares of these ten counts, not of all movies.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(genre_count, labels=genre_count.index, autopct="%0.01f%%")
ax.set_title('Top 10 Genre of Movies Produced')
plt.show()

In [None]:
# Ten most frequent original languages, with their movie counts.
orig_lang_count = df['orig_lang'].value_counts().iloc[:10]
orig_lang_count

In [None]:
# Pie chart of the ten most frequent original languages.
# Percentages are shares of these ten counts, not of all movies.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(orig_lang_count, labels=orig_lang_count.index, autopct="%0.01f%%")
ax.set_title('Top 10 Languages in which the movies are Produced')
plt.show()


In [None]:
# Number of movies released per year.

# Parse the "date_x" column into datetimes (overwrites the column).
df["date_x"] = pd.to_datetime(df["date_x"])

# Count movies per release year, ordered chronologically.
release_years = df["date_x"].dt.year
year_counts = release_years.value_counts().sort_index()

In [None]:
# Line plot of the number of movies released in each year.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(year_counts.index, year_counts)

# Put an x tick on every year that is a multiple of 5, with rotated labels
five_year_marks = list(year_counts.index[year_counts.index % 5 == 0])
plt.xticks(five_year_marks, rotation=45)

# Title and axis labels
ax.set_title("Number of Movies Released per Year")
ax.set_xlabel("Years")
ax.set_ylabel("Counts")

plt.show()

In [None]:
# Number of movies released in each calendar month (all years combined).
months = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]

# Reindex on 1..12 so every count is paired with its month label by value rather
# than by position: a month with no releases then shows 0 instead of shifting the
# other bars or making the bar call fail on mismatched lengths.
month_count = (
    df["date_x"].dt.month.value_counts()
    .reindex(range(1, 13), fill_value=0)
)

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(months, month_count.to_numpy(), color="darkseagreen")
ax.set_title("Total Number of Movies Released per Month from 1900 to 2023")
ax.set_xlabel("Month")
ax.set_ylabel("Number of movies")

# Add the count on top of each bar (bars sit at x positions 0..11)
for position, count in enumerate(month_count):
    ax.text(position, count, str(count), ha='center', va='bottom')

plt.show()
    