In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,8776.0,JP
1,Shadows in Paradise,movie,"Comedy, Drama, Music",1986.0,tt0092149,7.5,7538.0,JP
2,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2320505.0,JP
3,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,517631.0,JP
4,My Life Without Me,movie,"Drama, Romance",2003.0,tt0314412,7.4,26044.0,JP
...,...,...,...,...,...,...,...,...
9840,Shin Tennis no Ouji-sama: Hyoutei vs Rikkai - ...,tv,"Animation, Sport",2021.0,tt12451810,6.6,13.0,JP
9841,Gangnam B-Side,tv,"Crime, Drama, Mystery",2024.0,tt31390557,,,US
9842,Concordia,tv,"Drama, Thriller",2023.0,tt11324982,4.7,316.0,JP
9843,The Disappearance of Kimmy Diore,tv,Drama,2024.0,tt27207590,6.2,109.0,US


In [None]:
# cleaning of the data set 

In [3]:
# Check for missing values
print(df.isnull().sum())

# Fill missing values or drop rows with NaNs
df['imdbAverageRating'] = df['imdbAverageRating'].fillna(0)
df['imdbNumVotes'] = df['imdbNumVotes'].fillna(0)

# Convert data types if necessary
df['releaseYear'] = df['releaseYear'].astype(int, errors='ignore')


title                  576
type                     0
genres                 265
releaseYear             24
imdbId                 906
imdbAverageRating     1174
imdbNumVotes          1174
availableCountries       0
dtype: int64


In [5]:
# Top 10 movies by rating
top_rated = df[df['type'] == 'movie'].sort_values(by='imdbAverageRating', ascending=False).head(10)
print(top_rated[['title', 'imdbAverageRating', 'imdbNumVotes']])




                                              title  imdbAverageRating  \
5790                                       Kataomoi                9.5   
5373                          Aikatsu Planet! Movie                9.4   
55                         The Shawshank Redemption                9.3   
47                                    The Godfather                9.2   
5721                                    As It Flows                9.0   
48                            The Godfather Part II                9.0   
25    The Lord of the Rings: The Return of the King                9.0   
3762                                American Muscle                9.0   
31                                  The Dark Knight                9.0   
5910                             Patrice: The Movie                8.9   

      imdbNumVotes  
5790           6.0  
5373          40.0  
55       2965080.0  
47       2067715.0  
5721          12.0  
48       1396513.0  
25       2029972.0  
3762           7.

In [6]:
# Movies with the highest number of votes (indicating popularity)
most_voted = df[df['type'] == 'movie'].sort_values(by='imdbNumVotes', ascending=False).head(10)
print(most_voted[['title', 'imdbAverageRating', 'imdbNumVotes']])

                                                  title  imdbAverageRating  \
55                             The Shawshank Redemption                9.3   
31                                      The Dark Knight                9.0   
706                                           Inception                8.8   
78                                           Fight Club                8.8   
2                                          Forrest Gump                8.8   
110                                        Pulp Fiction                8.9   
1827                                       Interstellar                8.7   
87                                           The Matrix                8.7   
47                                        The Godfather                9.2   
23    The Lord of the Rings: The Fellowship of the Ring                8.9   

      imdbNumVotes  
55       2965080.0  
31       2946547.0  
706      2614737.0  
78       2396518.0  
2        2320505.0  
110      227672

In [7]:
# Split genres into multiple rows for detailed analysis
df_genres = df.assign(genres=df['genres'].str.split(', ')).explode('genres')

# Group by genres and calculate the average rating and votes
genre_insights = df_genres.groupby('genres').agg({
    'imdbAverageRating': 'mean',
    'imdbNumVotes': 'sum'
}).sort_values(by='imdbAverageRating', ascending=False)
print(genre_insights)


                    imdbAverageRating  imdbNumVotes
genres                                             
Biography                    7.002069    18592990.0
History                      6.856693     7108477.0
Talk-Show                    6.675000       46611.0
Adventure                    6.551444    91497333.0
War                          6.550000     3801324.0
Sport                        6.538411     2329152.0
Reality-TV                   6.472000      626325.0
Western                      6.419048      960453.0
Action                       6.363616   130797431.0
Romance                      6.328986    31218154.0
Animation                    6.325878    18185017.0
Musical                      6.305263     1062734.0
Crime                        6.264624    69556583.0
Fantasy                      6.228139    25819757.0
Game-Show                    6.222951      223878.0
Documentary                  6.205164     1081683.0
Comedy                       6.093546    72768331.0
Drama       

In [9]:
# Average ratings over the years
yearly_trends = df.groupby('releaseYear').agg({
    'imdbAverageRating': 'mean',
    'imdbNumVotes': 'sum'
}).sort_index()
print(yearly_trends)




             imdbAverageRating  imdbNumVotes
releaseYear                                 
1929.0                7.000000          95.0
1930.0                0.000000           0.0
1932.0                7.900000        6363.0
1933.0                6.800000          63.0
1936.0                7.000000          80.0
...                        ...           ...
2021.0                5.175150     4131752.0
2022.0                5.183787     6422522.0
2023.0                5.400147     2648240.0
2024.0                5.113527      890372.0
2025.0                0.000000           0.0

[91 rows x 2 columns]


In [10]:
# Define thresholds for good/bad movies
good_movies = df[(df['imdbAverageRating'] >= 7.5) & (df['imdbNumVotes'] >= 1000)]
bad_movies = df[(df['imdbAverageRating'] <= 5) & (df['imdbNumVotes'] >= 100)]

print("Good Movies:")
print(good_movies[['title', 'imdbAverageRating', 'imdbNumVotes']])

print("Bad Movies:")
print(bad_movies[['title', 'imdbAverageRating', 'imdbNumVotes']])


Good Movies:
                                                  title  imdbAverageRating  \
1                                   Shadows in Paradise                7.5   
2                                          Forrest Gump                8.8   
3                                     The Fifth Element                7.6   
5     Pirates of the Caribbean: The Curse of the Bla...                8.1   
6                                            Unforgiven                8.2   
...                                                 ...                ...   
9773                                 When They Cry: Kai                8.0   
9787                                             Rivals                8.0   
9792           Mobile Suit Gundam 0083: Stardust Memory                7.7   
9824                                   Samurai Champloo                8.5   
9838                                       Black Clover                8.2   

      imdbNumVotes  
1           7538.0  
2       

In [11]:
# Group by available countries
country_insights = df.groupby('availableCountries').agg({
    'imdbAverageRating': 'mean',
    'imdbNumVotes': 'sum'
}).sort_values(by='imdbAverageRating', ascending=False)
print(country_insights)


                    imdbAverageRating  imdbNumVotes
availableCountries                                 
JP, US                       6.870196    18456236.0
US                           6.196498    73384660.0
JP                           5.581172   227659454.0


In [12]:
# Popular genres with high ratings
popular_genres = genre_insights[genre_insights['imdbNumVotes'] > 10000].sort_values(by='imdbAverageRating', ascending=False)
print(popular_genres)

# Filter data to understand audience preferences
popular_movies = df[(df['imdbAverageRating'] > 8) & (df['imdbNumVotes'] > 10000)]
print(popular_movies[['title', 'genres', 'imdbAverageRating']])


             imdbAverageRating  imdbNumVotes
genres                                      
Biography             7.002069    18592990.0
History               6.856693     7108477.0
Talk-Show             6.675000       46611.0
Adventure             6.551444    91497333.0
War                   6.550000     3801324.0
Sport                 6.538411     2329152.0
Reality-TV            6.472000      626325.0
Western               6.419048      960453.0
Action                6.363616   130797431.0
Romance               6.328986    31218154.0
Animation             6.325878    18185017.0
Musical               6.305263     1062734.0
Crime                 6.264624    69556583.0
Fantasy               6.228139    25819757.0
Game-Show             6.222951      223878.0
Documentary           6.205164     1081683.0
Comedy                6.093546    72768331.0
Drama                 6.060468   165362595.0
Sci-Fi                6.048246    49905532.0
Music                 6.019368     5425995.0
Thriller  