In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df = pd.read_csv('16k_Movies.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres
0,0,Dekalog (1988),"Mar 22, 1996",This masterwork by Krzysztof Kieślowski is one...,7.4,118,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz",9 h 32 m,Drama
1,1,Three Colors: Red,"Nov 23, 1994",Krzysztof Kieslowski closes his Three Colors t...,8.3,241,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz, Ag...",1 h 39 m,"Drama,Mystery,Romance"
2,2,The Conformist,"Oct 22, 1970","Set in Rome in the 1930s, this re-release of B...",7.3,106,Bernardo Bertolucci,"Alberto Moravia, Bernardo Bertolucci",1 h 47 m,Drama
3,3,Tokyo Story,"Mar 13, 1972",Yasujiro Ozu’s Tokyo Story follows an aging co...,8.1,147,Yasujirô Ozu,"Kôgo Noda, Yasujirô Ozu",2 h 16 m,Drama
4,4,The Leopard (re-release),"Aug 13, 2004","Set in Sicily in 1860, Luchino Visconti's spec...",7.8,85,Luchino Visconti,"Giuseppe Tomasi di Lampedusa, Suso Cecchi D'Am...",3 h 7 m,"Drama,History"


In [3]:
##. What percentage of missing or null values exist in key columns such as Description, Directed by, and Written by?
missing_values_per = (df.isnull().sum()/ len(df)) * 100
total_missing = df.isnull().sum()
missing_value_outcome = pd.concat([total_missing,missing_values_per], axis =1)
missing_value_outcome

Unnamed: 0,0,1
Unnamed: 0,0,0.0
Title,0,0.0
Release Date,0,0.0
Description,0,0.0
Rating,3444,21.141805
No of Persons Voted,3461,21.246163
Directed by,7,0.042971
Written by,963,5.911602
Duration,13,0.079804
Genres,5,0.030694


In [5]:
## How would you handle these missing values?
df = df.copy()
df['Rating'].fillna(df['Rating'].mean(),inplace = True)
df.tail()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(df['Rating'].mean(),inplace = True)


Unnamed: 0.1,Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres
16285,16285,Boys to Men,"Apr 27, 2001",Compilation of 4 short films from different di...,6.617632,,,,\n https://www.amazon.com/Boys-Men-Phil...,Drama
16286,16286,Saint Misbehavin': The Wavy Gravy Movie,"Dec 10, 2010","Beginning with Woodstock ‘99, director Michell...",6.617632,,Michelle Esrick,,Not Rated,Documentary
16287,16287,Collectors,"Oct 1, 2000",America is drawn to the macabre handiwork of t...,6.617632,,Julian P. Hobbs,,1 h 20 m,Documentary
16288,16288,Bonhoeffer,"Jun 20, 2003",Dramatic documentary about the young German pa...,6.617632,,Martin Doblmeier,Martin Doblmeier,1 h 33 m,"Documentary,Biography,History,War"
16289,16289,7th Street,"Jan 17, 2003",This documentary explores change in one of the...,6.617632,,Josh Pais,Josh Pais,\n http://www.7thstreetmovie.com/\n ...,Documentary


In [17]:
df = df.copy()
df = df.fillna(method='ffill')
df.tail()

  df = df.fillna(method='ffill')


Unnamed: 0.1,Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres
16285,16285,Boys to Men,"Apr 27, 2001",Compilation of 4 short films from different di...,6.617632,10,Mark Christensen,"Mark Christensen, Brian Hamill",\n https://www.amazon.com/Boys-Men-Phil...,Drama
16286,16286,Saint Misbehavin': The Wavy Gravy Movie,"Dec 10, 2010","Beginning with Woodstock ‘99, director Michell...",6.617632,10,Michelle Esrick,"Mark Christensen, Brian Hamill",Not Rated,Documentary
16287,16287,Collectors,"Oct 1, 2000",America is drawn to the macabre handiwork of t...,6.617632,10,Julian P. Hobbs,"Mark Christensen, Brian Hamill",1 h 20 m,Documentary
16288,16288,Bonhoeffer,"Jun 20, 2003",Dramatic documentary about the young German pa...,6.617632,10,Martin Doblmeier,Martin Doblmeier,1 h 33 m,"Documentary,Biography,History,War"
16289,16289,7th Street,"Jan 17, 2003",This documentary explores change in one of the...,6.617632,10,Josh Pais,Josh Pais,\n http://www.7thstreetmovie.com/\n ...,Documentary


In [18]:
## 2. Perform a summary of basic statistics (mean, median, standard deviation, min, and max) for the Rating column.
df['Rating'].describe()

count    16290.000000
mean         6.617632
std          1.256782
min          0.300000
25%          6.100000
50%          6.617632
75%          7.400000
max         10.000000
Name: Rating, dtype: float64

In [19]:
## Identify any outliers in movie ratings using statistical methods such as Z-scores or IQR.
from scipy import stats
z_scores = stats.zscore(df['Rating'])
outliers = df[(z_scores > 3) | (z_scores < -3)]
print(outliers)

       Unnamed: 0                          Title  Release Date  \
156           156  Never Rarely Sometimes Always  Mar 13, 2020   
262           262                    Wojnarowicz  Mar 19, 2021   
507           507            Maria Full of Grace  Jul 16, 2004   
518           518                 The Wicker Man   Aug 7, 1974   
519           519                 The Wicker Man   Aug 7, 1974   
...           ...                            ...           ...   
16273       16273                      Strippers   Dec 8, 2000   
16274       16274                         Vulgar  Apr 26, 2002   
16276       16276             The Singing Forest  Nov 14, 2003   
16277       16277    The Garbage Pail Kids Movie  Aug 22, 1987   
16279       16279                United Passions   Jun 5, 2015   

                                             Description  Rating  \
156    Faced with an unintended pregnancy and a lack ...     0.9   
262    Wojnarowicz: F**k You F*ggot F**ker is a fiery...     2.5   
507

In [20]:
Q1 = df['Rating'].quantile(0.25)
Q3 = df['Rating'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_r = df[(df['Rating'] < lower_bound)|(df['Rating'] > upper_bound)]
print(outliers_r)

       Unnamed: 0                                         Title  Release Date  \
156           156                 Never Rarely Sometimes Always  Mar 13, 2020   
262           262                                   Wojnarowicz  Mar 19, 2021   
439           439               Restless Creature: Wendy Whelan  May 24, 2017   
453           453  The Body Remembers When the World Broke Open  Nov 22, 2019   
507           507                           Maria Full of Grace  Jul 16, 2004   
...           ...                                           ...           ...   
16273       16273                                     Strippers   Dec 8, 2000   
16274       16274                                        Vulgar  Apr 26, 2002   
16276       16276                            The Singing Forest  Nov 14, 2003   
16277       16277                   The Garbage Pail Kids Movie  Aug 22, 1987   
16279       16279                               United Passions   Jun 5, 2015   

                           

In [21]:
## 3. How many unique genres exist in the dataset? What are the most common genres over time?
col = df[['Genres']]
for x in col:
    print(df[x].unique())

['Drama' 'Drama,Mystery,Romance' 'Drama,History' ...
 'Action,Adventure,Horror' 'Drama,Crime,Action'
 'Comedy,Crime,Family,Sci-Fi']


In [22]:
## What are the most common genres over time?
common_genres = df['Genres'].value_counts()
print(common_genres)

Genres
Drama                                                                    1521
Documentary                                                              1018
Comedy,Drama                                                              789
Comedy,Drama,Romance                                                      646
Drama,Romance                                                             575
                                                                         ... 
Documentary,Biography,Comedy,Drama,Family                                   1
Animation,Action,Adventure,Comedy,Crime,Family,Fantasy,Mystery,Sci-Fi       1
Documentary,Biography,Drama,History,Mystery                                 1
Comedy,Crime,Drama,Horror,Thriller                                          1
Comedy,Crime,Family,Sci-Fi                                                  1
Name: count, Length: 1663, dtype: int64


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16290 entries, 0 to 16289
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           16290 non-null  int64  
 1   Title                16290 non-null  object 
 2   Release Date         16290 non-null  object 
 3   Description          16290 non-null  object 
 4   Rating               16290 non-null  float64
 5   No of Persons Voted  16290 non-null  object 
 6   Directed by          16290 non-null  object 
 7   Written by           16290 non-null  object 
 8   Duration             16290 non-null  object 
 9   Genres               16290 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.2+ MB


In [24]:
## 4. Convert the Release Date column into components like year and month.
df['Release Date'] = pd.to_datetime(df['Release Date'])
df['Year of Release'] = pd.DatetimeIndex(df['Release Date']).year
df['Month of Release'] = pd.DatetimeIndex(df['Release Date']).month
df['Day of Release'] = pd.DatetimeIndex(df['Release Date']).day
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres,Year of Release,Month of Release,Day of Release
0,0,Dekalog (1988),1996-03-22,This masterwork by Krzysztof Kieślowski is one...,7.4,118,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz",9 h 32 m,Drama,1996,3,22
1,1,Three Colors: Red,1994-11-23,Krzysztof Kieslowski closes his Three Colors t...,8.3,241,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz, Ag...",1 h 39 m,"Drama,Mystery,Romance",1994,11,23
2,2,The Conformist,1970-10-22,"Set in Rome in the 1930s, this re-release of B...",7.3,106,Bernardo Bertolucci,"Alberto Moravia, Bernardo Bertolucci",1 h 47 m,Drama,1970,10,22
3,3,Tokyo Story,1972-03-13,Yasujiro Ozu’s Tokyo Story follows an aging co...,8.1,147,Yasujirô Ozu,"Kôgo Noda, Yasujirô Ozu",2 h 16 m,Drama,1972,3,13
4,4,The Leopard (re-release),2004-08-13,"Set in Sicily in 1860, Luchino Visconti's spec...",7.8,85,Luchino Visconti,"Giuseppe Tomasi di Lampedusa, Suso Cecchi D'Am...",3 h 7 m,"Drama,History",2004,8,13


In [27]:
## What are the most frequent movie release years and months?
df['Year of Release'].value_counts().sort_values(ascending=False)

Year of Release
2014    707
2015    687
2018    644
2013    643
2017    640
2016    634
2019    596
2012    586
2022    567
2020    566
2011    566
2021    561
2006    551
2005    529
2023    508
2004    500
2007    485
2010    454
2002    449
2003    437
2008    431
2009    411
2001    400
2000    384
2024    318
1999    277
1998    179
1997    163
1996    156
1993    147
1995    142
1990    137
1986    132
1992    127
1994    124
1987    124
1989    118
1991    111
1988    107
1985    106
1984    101
1982     86
1981     81
1983     79
1980     76
1979     71
1978     58
1976     50
1973     47
1971     45
1974     45
1977     42
1975     41
1972     33
1970     31
Name: count, dtype: int64

In [28]:
df['Month of Release'].value_counts().sort_values(ascending=False)

Month of Release
10    1613
4     1470
3     1457
9     1456
8     1451
11    1385
6     1370
12    1350
5     1345
7     1307
2     1148
1      938
Name: count, dtype: int64

In [29]:
## 5. How many duplicate movies exist based on the Title column? What steps would you take to remove duplicates, if any?
df['Title'].duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
16285    False
16286    False
16287    False
16288    False
16289    False
Name: Title, Length: 16290, dtype: bool

In [30]:
## What steps would you take to remove duplicates, if any?
df['Title'].unique()

array(['Dekalog (1988)', 'Three Colors: Red', 'The Conformist', ...,
       'Collectors', 'Bonhoeffer', '7th Street'], dtype=object)

In [31]:
## 6. Perform frequency counts of categorical columns such as Directed by and Written by. Which directors and screenwriters have the most movies in the dataset?
directed_by_counts = df['Directed by'].value_counts()
written_by_counts = df['Written by'].value_counts()
print(directed_by_counts)
print(written_by_counts)

Directed by
Woody Allen                                             50
Ron Howard                                              40
Steven Spielberg                                        36
Ridley Scott                                            35
Clint Eastwood                                          35
                                                        ..
Joseph Gordon-Levitt                                     1
Vadim Jendreyko                                          1
John Francis Daley, \n    \n      Jonathan Goldstein     1
Natalia Almada                                           1
Josh Pais                                                1
Name: count, Length: 7379, dtype: int64
Written by
Woody Allen                                    46
Tyler Perry                                    21
Hong Sang-soo                                  19
David Mamet                                    17
John Hughes                                    16
                                      

In [None]:
## 7. Create a new column that classifies movies based on their Duration (e.g., short, medium, long). How many movies fall into each category, and how does the rating vary by category?
df['Duration'].max()

In [32]:
## 8. What are the average and median Number of Persons Voted per movie? What is the relationship between the number of votes and the movie's Rating?
persons_voted = df.groupby('No of Persons Voted').size()
mean_persons_voted = persons_voted.mean()
median_persons_voted = persons_voted.median()
print(f'The mean number of persons voted per movie is {mean_persons_voted}')
print(f'The median number of persons voted per movie is {median_persons_voted}')

The mean number of persons voted per movie is 16.673490276356194
The median number of persons voted per movie is 3.0


In [33]:
## What is the relationship between the number of votes and the movie's Rating?
fig = px.histogram(x = df['No of Persons Voted'], color=df['Rating'], text_auto=True, barmode='group')
fig.show()

In [42]:
## calculate the total number of movies directed by each director
movie_count = df.groupby('Directed by').size()
mean_movie = movie_count.mean()
median_movie = movie_count.median()
max_movie 

In [37]:
## 14. Create a new binary flag to indicate high-rated movies (with a Rating greater than 75). How many movies fall into this category?
df['Title'] = ((df['Rating'] > 7.5)).astype(int)
df['Title'].sum()

0

In [41]:
df['High Rated Movies'] = ((df['Rating'] > 7.5) & (df['Title'])).astype(int)
df['High Rated Movies'].sum()

0

In [51]:
## 15. Extract the year from the Release Date column and calculate the number of movies released per year. What trends in movie releases can be observed over time?
df.new_movies_each_year()
year = list(range(1970-2025))
count = [0 for _ in range(len(year))]
for i in range(len(df)):
    count = (int(df.loc[i, "Release Date"]).split(',')[1]) - 1970 = 1
plt.figure(figsize = (14, 8))
plt.title('New Movies Released Per Year')
addvalue(plt.bar(year, count, width = 0.50, color = 'purple'), fixed_precision = 0)
plt.xlabel('Year')
plt.ylabel('New Movie Released')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout
plt.show()

SyntaxError: cannot assign to expression (3959783786.py, line 6)

In [2]:
df['Year of Release'].value_counts()
df

NameError: name 'df' is not defined