# Movies Analytics

## 1. Import Libraries

In [None]:
# Mount Google Drive at /content/gdrive so the dataset files stored there can be read
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

## 2. Reading and Exploring Data

### Movies Data

In [None]:
# Column names for movies.dat; the file is read with header=None, so names are supplied here
mnames = ['movieid', 'title', 'genres']
# '::' is a multi-character separator, which is handled by the python parsing engine
# (the default C engine does not support it).
movies_data = pd.read_table(
    '/content/gdrive/MyDrive/Python TIA/ml-1m/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
    encoding='latin-1',
)
print('movie shape : ', movies_data.shape)
movies_data.info()

movie shape :  (3883, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieid  3883 non-null   int64 
 1   title    3883 non-null   object
 2   genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [None]:
# Preview the first 10 rows of the movies table
movies_data.head(10)

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# Count the distinct movie ids: collect the unique values into a list, then take its length
unique_movies = movies_data['movieid'].unique().tolist()
len(unique_movies)

3883

### Ratings

In [None]:
# Column names for ratings.dat; the file is read with header=None, so names are supplied here
rnames = ['userid', 'movieid', 'rating', 'timestamp']
# '::' is a multi-character separator, which is handled by the python parsing engine
# (the default C engine does not support it).
ratings_data = pd.read_table(
    '/content/gdrive/MyDrive/Python TIA/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
    encoding='latin-1',
)
print('rating shape : ', ratings_data.shape)
ratings_data.info()

rating shape :  (1000209, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userid     1000209 non-null  int64
 1   movieid    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [None]:
# Display the ratings table as the cell output (a frame this large is rendered truncated,
# showing only its first and last rows)
ratings_data

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
# (number of rows, number of columns) of the ratings table
ratings_data.shape

(1000209, 4)

In [None]:
# check unique users: collect the distinct userid values that appear in the ratings
# table, then count them (note: despite its name, unique_ratings holds user ids)
unique_ratings = ratings_data['userid'].unique().tolist()
len(unique_ratings)

6040

In [None]:
# Summary statistics of every numeric column. userid and movieid are identifiers,
# so only the 'rating' column's mean/std/quartiles are analytically meaningful.
ratings_data.describe()

Unnamed: 0,userid,movieid,rating,timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [None]:
# Lowest and highest rating value present in the ratings table
print('minimum = ', ratings_data.rating.min())
print('maximum = ', ratings_data.rating.max())

minimum =  1
maximum =  5


## 3. Data Cleaning

In [None]:
# Per column: True if the movies table has at least one missing value in it
movies_data.isna().any()

movieid    False
title      False
genres     False
dtype: bool

In [None]:
# Per column: True if the ratings table has at least one missing value in it
ratings_data.isna().any()

userid       False
movieid      False
rating       False
timestamp    False
dtype: bool

## 4. Data Analytics

In [None]:
# Build a boolean mask (one True/False per row of movies_data) marking the movies whose
# pipe-separated genres string contains 'Drama'. Note that drama_movies is the mask
# itself, not a filtered DataFrame; it is meant to be used to index movies_data.
drama_movies = movies_data['genres'].str.contains('Drama')
# Summarise the mask (length, dtype, memory usage)
drama_movies.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3883 entries, 0 to 3882
Series name: genres
Non-Null Count  Dtype
--------------  -----
3883 non-null   bool 
dtypes: bool(1)
memory usage: 3.9 KB


In [None]:
# Select the rows of movies_data flagged True in the drama mask
movies_data.loc[drama_movies]

Unnamed: 0,movieid,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama
10,11,"American President, The (1995)",Comedy|Drama|Romance
13,14,Nixon (1995),Drama
15,16,Casino (1995),Drama|Thriller
16,17,Sense and Sensibility (1995),Drama|Romance
...,...,...,...
3876,3946,Get Carter (2000),Action|Drama|Thriller
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [None]:
# Attach each rating to its movie's title and genres. The inner join on 'movieid'
# keeps only movies that have at least one rating (and ratings whose movie is known).
movie_ratings_data = pd.merge(movies_data, ratings_data, how='inner', on='movieid')
movie_ratings_data.head()

Unnamed: 0,movieid,title,genres,userid,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


In [None]:
# Number of ratings each title received, ordered from most-rated to least-rated
most_rated = (
    movie_ratings_data
    .groupby('title')
    .size()
    .sort_values(ascending=False)
)
most_rated.head(25)

title
American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
Men in Black (1997)                                      2538
Raiders of the Lost Ark (1981)                           2514
Fargo (1996)                                             2513
Sixth Sense, The (1999)                                  2459
Braveheart (1995)                                        2443
Shakespeare in Love (1998)                               2369
Pr

In [None]:
# Shorter route to the same per-title rating counts: value_counts() tallies each
# title and already returns the counts in descending order.
most_rated = movie_ratings_data['title'].value_counts()
most_rated.head(25)

American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
Men in Black (1997)                                      2538
Raiders of the Lost Ark (1981)                           2514
Fargo (1996)                                             2513
Sixth Sense, The (1999)                                  2459
Braveheart (1995)                                        2443
Shakespeare in Love (1998)                               2369
Princess

In [None]:
# Per-title rating statistics: how many ratings each title received ('size') and
# their average ('mean'). The result has two-level columns, e.g. ('rating', 'mean').
# The aggregations are given by name because passing NumPy callables such as
# np.size / np.mean to agg() is deprecated in recent pandas and emits a FutureWarning;
# the string names produce the same values and the same column labels.
highly_rated = movie_ratings_data.groupby('title').agg({'rating': ['size', 'mean']})
highly_rated

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",37,3.027027
'Night Mother (1986),70,3.371429
'Til There Was You (1997),52,2.692308
"'burbs, The (1989)",303,2.910891
...And Justice for All (1979),199,3.713568
...,...,...
"Zed & Two Noughts, A (1985)",29,3.413793
Zero Effect (1998),301,3.750831
Zero Kelvin (Kjærlighetens kjøtere) (1995),2,3.500000
Zeus and Roxanne (1997),23,2.521739


In [None]:
# Order the per-title statistics by number of ratings (most-rated first);
# note this sorts on the rating count, not on the average rating.
highly_rated_sorted = highly_rated.sort_values(
    by=('rating', 'size'),
    ascending=False,
)
highly_rated_sorted

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
American Beauty (1999),3428,4.317386
Star Wars: Episode IV - A New Hope (1977),2991,4.453694
Star Wars: Episode V - The Empire Strikes Back (1980),2990,4.292977
Star Wars: Episode VI - Return of the Jedi (1983),2883,4.022893
Jurassic Park (1993),2672,3.763847
...,...,...
Target (1995),1,4.000000
I Don't Want to Talk About It (De eso no se habla) (1993),1,4.000000
An Unforgettable Summer (1994),1,3.000000
Never Met Picasso (1996),1,2.000000


In [None]:
# Keep only the titles whose average rating is exactly 5, i.e. every rating they
# received was the maximum score.
highly_rated_sorted_5 = highly_rated_sorted[highly_rated_sorted[('rating', 'mean')].eq(5)]
highly_rated_sorted_5

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Gate of Heavenly Peace, The (1995)",3,5.0
Smashing Time (1967),2,5.0
Lured (1947),1,5.0
Song of Freedom (1936),1,5.0
Follow the Bitch (1998),1,5.0
One Little Indian (1973),1,5.0
"Baby, The (1973)",1,5.0
Schlafes Bruder (Brother of Sleep) (1995),1,5.0
Bittersweet Motel (2000),1,5.0
Ulysses (Ulisse) (1954),1,5.0
