In [None]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
# IMDb title-search URL; the query string asks for tv_series titles, languages=en,
# a num_votes bound of "1000," and sort=user_rating,desc.
# It deliberately ends with "start=" (no digit) so that appending a result offset
# produces a well-formed value such as "start=50" instead of "start=050".
base_url = "https://www.imdb.com/search/title/?title_type=tv_series&num_votes=1000,&languages=en&sort=user_rating,desc&start="


In [None]:
def _node_text(node):
    """Return the stripped text of a BeautifulSoup node, or None when the node is absent."""
    return node.text.strip() if node is not None else None


def _parse_show(show_div):
    """Convert one ``lister-item-content`` div into a flat dict describing a show.

    Keys produced: Rank, Title, Year, Start Year, End Year, Rating, Duration,
    Votes, Genre, Certificate, Primary Genre. Optional page elements that are
    missing yield ``None`` for their key instead of raising.
    """
    show = {}

    # Rank is rendered with a trailing dot; commas are stripped as well so a
    # value such as "1,000." still parses as an integer.
    rank = show_div.find("span", class_="lister-item-index unbold text-primary").text.strip()
    show["Rank"] = int(rank.replace('.', '').replace(',', ''))
    show["Title"] = show_div.find("h3").a.text.strip()

    # Drop the parentheses and the roman-numeral disambiguators ("I", "II").
    year = show_div.find("span", class_="lister-item-year").text.strip()
    year = year.replace('(', '').replace(')', '').replace('II', '').replace('I', '')
    show["Year"] = year

    # "2015–2019" -> ("2015", "2019"); "2015–" -> ("2015", ""); "2015" -> ("2015", None)
    start_year, dash, end_year = year.strip().partition("–")
    show["Start Year"] = start_year.strip()
    show["End Year"] = end_year.strip() if dash else None

    show["Rating"] = _node_text(show_div.find("strong"))

    runtime = _node_text(show_div.find("span", class_="runtime"))
    show["Duration"] = runtime.replace('min', '') if runtime is not None else None

    # The first span with name="nv" is read as the vote count.
    vote_spans = show_div.find_all("span", attrs={"name": "nv"})
    show["Votes"] = vote_spans[0].text.replace(',', '') if vote_spans else None

    # Genres are parsed once and reused for both "Genre" and "Primary Genre".
    genre_text = _node_text(show_div.find("span", class_="genre"))
    genres = [g.strip() for g in genre_text.split(",")] if genre_text else []
    show["Genre"] = ", ".join(genres).strip('"') if genres else None

    certificate = _node_text(show_div.find("span", class_="certificate"))
    show["Certificate"] = (
        certificate.replace('[', '').replace(']', '') if certificate is not None else None
    )

    show["Primary Genre"] = genres[0] if genres else None
    return show


shows = []
shows_per_page = 50
total_shows = 1000

# Normalise the URL so it ends in "start=" regardless of whether base_url already
# carries a trailing digit; otherwise the offset would be glued onto that digit.
page_url_prefix = re.sub(r"start=\d*$", "start=", base_url)

# `start` is treated here as the 1-based rank of the first result on the page
# (1, 51, 101, ...). NOTE(review): confirm against the live site.
for page_start in range(1, total_shows + 1, shows_per_page):
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an error page from being parsed as an empty result.
    response = requests.get(f"{page_url_prefix}{page_start}", timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    for show_div in soup.find_all("div", class_="lister-item-content"):
        shows.append(_parse_show(show_div))

In [None]:
# One row per scraped show. "Year" is dropped because the scrape already split
# it into "Start Year" and "End Year".
df = pd.DataFrame(shows).drop(columns="Year")

# Persist the raw scrape (including incomplete rows) before any filtering.
df.to_csv('TVSHOWS_1000.csv')

# Report how many rows are about to be discarded; a bare expression in the
# middle of a cell is evaluated but never shown.
missing_rows = df[df.isna().any(axis=1)]
print(f"Dropping {len(missing_rows)} of {len(df)} shows with at least one missing value")

df = df.dropna()
df

In [None]:
# The scraped values are all strings; convert the numeric columns.
df['Votes'] = df['Votes'].astype('int64')
df['Start Year'] = df['Start Year'].astype('int64')

# An empty End Year string is stored as the placeholder 0.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column, which is not guaranteed to update `df` under pandas
# copy-on-write.
df["End Year"] = df["End Year"].replace('', 0).astype(int)

df['Duration'] = df['Duration'].astype(str).astype('int64')
df['Rating'] = df['Rating'].astype(str).astype('float')

In [None]:
# Preview the first five rows of the cleaned frame
df.head(5)

Compare the ratings of shows that started in 2019 and 2020. Use a t-test to determine if there is a statistically significant difference in the mean ratings of these two groups.

In [None]:
# Boolean masks selecting the shows that started in each of the two years
started_in_2019 = df['Start Year'].eq(2019)
started_in_2020 = df['Start Year'].eq(2020)

tv_shows_2019 = df.loc[started_in_2019]
tv_shows_2020 = df.loc[started_in_2020]

# Stack both cohorts (2019 rows first) for the density plot below
tv_shows_2019_2020 = pd.concat([tv_shows_2019, tv_shows_2020])

# Independent two-sample t-test on the ratings of the two cohorts
ratings_2019 = tv_shows_2019['Rating']
ratings_2020 = tv_shows_2020['Rating']
t, p = stats.ttest_ind(ratings_2019, ratings_2020)

# Report the test statistic and its p-value
print(f"t-statistic: {t}, p-value: {p}")

# One KDE curve per start year, each normalised on its own (common_norm=False)
sns.displot(
    data=tv_shows_2019_2020,
    x='Rating',
    hue='Start Year',
    kind='kde',
    common_norm=False,
    palette="Set2",
)
plt.show()


Which genres are the most popular and in the highest demand among viewers?

In [None]:

# Number of shows per primary genre, most frequent first
genre_counts = df["Primary Genre"].value_counts()

# Bar chart of the genre distribution on an explicitly sized axes
fig, ax = plt.subplots(figsize=(25, 10))
plt.xticks(rotation='vertical', fontsize=18)
ax.bar(genre_counts.index, genre_counts.values)
ax.set_xlabel("Genre")
ax.set_ylabel("Number of TV Shows")
ax.set_title("Genre Distribution of Top 1000 TV Shows")
plt.xticks(rotation=90)
plt.show()


Popularity of TV shows over time: the total number of votes received by shows, grouped by the year in which they started (shows that started after 1990 only).

In [None]:
# Keep only the shows that started after 1990
started_after_1990 = df['Start Year'] > 1990
tv_shows_filtered = df[started_after_1990]

# Total votes received by all shows sharing a start year
votes_by_year = tv_shows_filtered.groupby("Start Year")["Votes"].sum()

# Line chart of total votes against start year
fig, ax = plt.subplots(figsize=(25, 10))
plt.xticks(rotation='vertical', fontsize=18)
ax.plot(votes_by_year.index, votes_by_year.values)
ax.set_xlabel("Year")
ax.set_ylabel("Number of Votes")
ax.set_title("Popularity of TV Shows Over Time")
plt.show()


Relationship between the duration of a TV show and its rating. This could help determine whether longer shows are generally rated higher or not.

In [None]:

import seaborn as sns

# Scatter of rating against duration with a fitted regression line
fig, ax = plt.subplots(figsize=(25, 10))
sns.regplot(data=df, x="Duration", y="Rating", ax=ax)

ax.set_xlabel("Duration (in minutes)")
ax.set_ylabel("Rating")
ax.set_title("Relationship Between Duration and Rating of TV Shows")
plt.show()


Start and end years of TV shows, to understand trends in the television industry and when certain genres were popular.

In [None]:
import seaborn as sns

# Rows whose start year and end year are both later than 2000
starts_after_2000 = df["Start Year"] > 2000
ends_after_2000 = df["End Year"] > 2000
df_filtered = df[starts_after_2000 & ends_after_2000]

# Long format: one row per (show, "Start Year" | "End Year") with the year in "value"
df_melted = df_filtered.melt(value_vars=["Start Year", "End Year"])

# Side-by-side counts of shows starting and ending in each year
year_colors = {"Start Year": "tab:blue", "End Year": "tab:orange"}
sns.catplot(
    data=df_melted,
    kind="count",
    x="value",
    hue="variable",
    palette=year_colors,
    height=6,
    aspect=2,
)

# Label the axes that catplot just drew on
facet_ax = plt.gca()
facet_ax.set_xlabel("Year")
facet_ax.set_ylabel("Number of TV Shows")
facet_ax.set_title("Distribution of Start and End Years of TV Shows (After 2000)")

plt.show()



Relationship between the rating of a TV show and the number of votes it received: is there a correlation between the two, and do highly rated shows tend to receive more votes?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Scatter of votes against rating with a fitted regression line
fig, ax = plt.subplots(figsize=(25, 10))
plt.xticks(rotation='vertical', fontsize=18)
sns.regplot(data=df, x="Rating", y="Votes", ax=ax)
ax.set_xlabel("Rating")
ax.set_ylabel("Number of Votes")
ax.set_title("Relationship Between Rating and Votes of TV Shows")
plt.show()


To understand how many shows have managed to stay on air for a long time and if this trend has changed over time. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# A show counts as "long-running" when it aired in at least this many calendar years
threshold = 10

# Work on a filtered copy so the shared `df` is left untouched by this cell
aired = df.dropna(subset=["End Year"])
start_years = aired["Start Year"]
end_years = aired["End Year"]

# End Year == 0 is the placeholder written by the type-conversion cell for
# shows without an end year. Left as 0, such shows can never satisfy
# `End Year >= year`, so still-running series are silently excluded. Treat them
# as running through the most recent year present in the data instead.
latest_year = max(start_years.max(), end_years.max())
end_years = end_years.mask(end_years == 0, latest_year)

# True for every show whose run spans at least `threshold` years (inclusive)
is_long_running = (end_years - start_years + 1) >= threshold

# Every year from 2000 onwards in which some show started or ended
years = sorted(
    year for year in set(start_years.tolist() + end_years.tolist()) if year >= 2000
)

# For each year, count the long-running shows that were on air during it
long_running_shows = [
    int((is_long_running & (start_years <= year) & (end_years >= year)).sum())
    for year in years
]

# Plot the number of long-running shows over time
plt.plot(years, long_running_shows)
plt.xlabel("Year")
plt.ylabel("Number of Long-Running Shows")
plt.title("Number of Long-Running Shows Over Time")
plt.show()



Number of high-rated TV shows over time to understand if the quality of television shows has improved over time.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(25, 10))
plt.xticks(rotation='vertical', fontsize=18)

# Make sure ratings are floats before comparing against the threshold
df["Rating"] = df["Rating"].astype(float)

# Minimum rating for a show to count as "high-rated"
threshold = 8.0

# For every start year in the data, count the shows at or above the threshold
years = sorted(set(df["Start Year"].tolist()))
is_high_rated = df["Rating"] >= threshold
high_rated_shows = [
    int((is_high_rated & (df["Start Year"] == year)).sum())
    for year in years
]

# Line chart of the yearly counts
ax.plot(years, high_rated_shows)
ax.set_xlabel("Year")
ax.set_ylabel("Number of High-Rated Shows", fontsize=18)
ax.set_title("Number of High-Rated Shows Over Time", fontsize=18)
plt.show()


Popularity of different genres over time

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Number of shows for every (primary genre, start year) combination
genre_year_group = (
    df.groupby(["Primary Genre", "Start Year"])
    .size()
    .reset_index(name="Count")
)

# Wide table: one row per start year, one column per primary genre
pivot = (
    genre_year_group
    .set_index(["Start Year", "Primary Genre"])["Count"]
    .unstack("Primary Genre")
)

# Stacked bars: each bar is a start year, each segment a genre
ax = pivot.plot(kind="bar", stacked=True, figsize=(12, 8))
ax.set_xlabel("Year")
ax.set_ylabel("Number of Shows")
ax.set_title("Popularity of Different Genres Over Time")
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()





The average rating of TV shows over time to understand the trend of popularity.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Mean rating per start year. The "Rating" column is selected before
# aggregating: calling .mean() on the whole grouped frame also tries to average
# the text columns (Title, Genre, ...), which raises a TypeError on pandas >= 2.0.
grouped_data = df.groupby("Start Year")["Rating"].mean()

# Plot the line chart
plt.plot(grouped_data.index, grouped_data.values)

# Add labels and title to the chart
plt.xlabel("Start Year")
plt.ylabel("Average Rating")
plt.title("Trend analysis of TV show popularity over time")

# Show the chart
plt.show()


Compare the distribution of different genres among the top 1000 TV shows.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Number of titles recorded under each primary genre
grouped_data = df.groupby("Primary Genre")["Title"].count()

# Draw the bars on the current axes and label them
ax = plt.gca()
ax.bar(grouped_data.index, grouped_data.values)
ax.set_xlabel("Genre")
ax.set_ylabel("Number of TV Shows")
ax.set_title("Genre distribution of top 1000 TV shows")

# Render the chart
plt.show()


Which shows are the most popular, based on the number of votes?

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


# The ten shows with the most votes. The result is stored under its own name:
# assigning it back to `df` would leave every later cell working on only ten rows.
top_voted = df.sort_values(by='Votes', ascending=False).head(10)

# Plot the data as a bar chart
plt.bar(top_voted['Title'], top_voted['Votes'])

# Add labels to the x and y axis; titles are long, so rotate them to avoid overlap
plt.xlabel('Title')
plt.ylabel('Votes')
plt.xticks(rotation=90)

# Show the plot
plt.show()



To visualize the average ratings of TV shows over time and how it has changed. And see if there's a correlation between start year and ratings.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Average of every numeric column per start year. numeric_only=True is required
# because the frame also holds text columns (Title, Genre, ...); averaging those
# raises a TypeError on pandas >= 2.0.
grouped = df.groupby("Start Year").mean(numeric_only=True)

# Plot the average rating by start year
plt.plot(grouped.index, grouped["Rating"])
plt.xlabel("Start Year")
plt.ylabel("Average Rating")
plt.title("Average Rating of TV Shows by Start Year")

# Show the plot
plt.show()


Average rating by decade (not very informative).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Create a new column for the decade (e.g. 2014 -> 2010)
df['Decade'] = (df['Start Year']//10)*10

# Average of every numeric column per decade. numeric_only=True is required
# because the frame also holds text columns (Title, Genre, ...); averaging those
# raises a TypeError on pandas >= 2.0.
grouped = df.groupby("Decade").mean(numeric_only=True)

# Plot the average rating by decade
plt.plot(grouped.index, grouped["Rating"])
plt.xlabel("Decade")
plt.ylabel("Average Rating")
plt.title("Average Rating of TV Shows by Decade")

# Show the plot
plt.show()


Genre popularity analysis: bar plots can be used to visualize the number of TV shows in each genre and how it has changed over time. The average ratings of the genres can also be calculated and compared using a t-test. Data used: title, start year, end year, rating, votes, and genre.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

# How many shows fall under each primary genre
genre_counts = df["Primary Genre"].value_counts()

# Bar chart of the per-genre show counts
ax = plt.gca()
ax.bar(genre_counts.index, genre_counts.values)
ax.set_xlabel("Genre")
ax.set_ylabel("Number of TV Shows")
ax.set_title("Number of TV Shows by Genre")
plt.xticks(rotation=90)
plt.show()

# Mean rating of the shows in each primary genre
genre_mean_ratings = df.groupby("Primary Genre")["Rating"].mean()

# Ratings of the two genres being compared
genre1 = df.loc[df["Primary Genre"] == "Drama", "Rating"]
genre2 = df.loc[df["Primary Genre"] == "Comedy", "Rating"]

# Independent two-sample t-test on the two sets of ratings
t, p = stats.ttest_ind(genre1, genre2)

print("T-statistic:", t)
print("P-value:", p)


Which shows are the most popular, based on rating?

In [None]:


# Restrict the plot to shows that started after 2000
tv_shows_filtered = df[df['Start Year'] > 2000]

# Create the scatter plot from the filtered rows; plotting `df` here would
# ignore the filter computed above
sns.scatterplot(data=tv_shows_filtered, x='Start Year', y='Rating', hue='Primary Genre')

# Add labels and titles
plt.xlabel('Start Year')
plt.ylabel('Rating')
plt.title('Rating vs Start Year, Colored by Genre')

# Show the plot
plt.show()

# Top TV shows that are popular in a particular genre

In [None]:
# Shows without a certificate are labelled 'Not Rated' ...
df['Certificate'] = df['Certificate'].fillna('Not Rated')

# ... and then left out of the chart, so only real certificates are counted
certificates = df[df['Certificate'] != 'Not Rated']['Certificate']

fig, axs = plt.subplots(figsize=(20, 5))
# Pass the Series as `x` explicitly: since seaborn 0.12 the first positional
# argument is `data`, so a positional Series is no longer read as the x variable.
sns.countplot(x=certificates, ax=axs)
plt.title("Count of Certificates provided", weight = "bold")
plt.show()

In [None]:
from collections import Counter

# Flatten the comma-separated "Genre" strings into one lower-cased list, so a
# show tagged with several genres contributes to each of them
genre = [
    name.strip().lower()
    for genre_list in df['Genre']
    for name in genre_list.split(',')
]

# The ten most frequent genres as (name, number_of_shows) pairs
count = Counter(genre).most_common(10)
genre_names = [name for name, _ in count]
genre_totals = [total for _, total in count]

fig, axs = plt.subplots(figsize=(20, 5))
# Horizontal bars: counts on x, genre names on y. Keyword arguments are
# required because seaborn >= 0.12 no longer accepts x and y positionally.
sns.barplot(x=genre_totals, y=genre_names, ax=axs)
plt.xlabel("Number of TV Shows")
plt.ylabel("Genres")
plt.title("Top Ten Genres")
plt.show()

In [None]:
fig, axs = plt.subplots(figsize=(20, 5))

# The ten start years with the most shows (value_counts sorts by frequency)
top_start_years = df['Start Year'].value_counts()[:10]
sns.scatterplot(x=top_start_years.index, y=top_start_years)

axs.set_title("Maximum TV shows released in-")
axs.set_xlabel("year")
axs.set_ylabel("Number of TV shows")
plt.show()

In [None]:
# import seaborn as sns


# # Calculate standard deviation of ratings for each genre
# std_by_genre = df.groupby('Genre')['Rating'].std().reset_index()

# # Create a scatter plot
# sns.scatterplot(x='Genre', y='Rating', data=std_by_genre, s=100)

# # Set labels and title
# sns.set(rc={'figure.figsize':(15,8)})
# sns.set_style('whitegrid')
# # sns.set(font_scale=1.5)
# sns.set_palette('bright')
# plt.title('Standard Deviation of Ratings by Genre')
# plt.xlabel('Genre')
# plt.ylabel('Standard Deviation')
# plt.xticks(rotation=90)


# # Display the plot
# plt.show()


In [None]:

# One point per show: start year on x, rating on y
ax = plt.gca()
ax.scatter(df['Start Year'], df['Rating'])

# Axis labels and title
ax.set_xlabel('Start Year')
ax.set_ylabel('Rating')
ax.set_title('Start Year vs. Rating')

# Render the figure
plt.show()

In [None]:

import pandas as pd
import seaborn as sns


# Number of highest-rated shows to include in the plot
top_n = 100

# Sort the dataset by rating in descending order
tv_shows_sorted = df.sort_values(by='Rating', ascending=False)

# Select the `top_n` highest-rated TV shows (fewer if the frame is smaller)
top_rated = tv_shows_sorted.head(top_n)

# Scatter of votes against rating with a fitted regression line. The title is
# built from the number of rows actually plotted, so it cannot disagree with
# the data the way a hard-coded "Top 10" did while 100 rows were selected.
sns.regplot(data=top_rated, x='Rating', y='Votes')
plt.title(f'Top {len(top_rated)} Highest-Rated TV Shows')
plt.show()

