# Opschonen

In [1]:
import numpy as np
import pandas as pd

def missing(df):
    return df[df.isna().any(axis='columns')]

def movie_url(imdb_id):
    return f'https://www.imdb.com/title/tt{imdb_id:07}/'

In [2]:
movies = pd.read_parquet('data/movies.parquet')
print(f'n_movies = {movies.shape[0]}')
movies.head()

n_movies = 35613


Unnamed: 0,imdb_id,title,year,genres
0,8,Edison Kinetoscopic Record of a Sneeze,1894,"[documentary, short]"
1,10,La sortie des usines Lumière,1895,"[documentary, short]"
2,12,The Arrival of a Train,1896,"[documentary, short]"
3,25,The Oxford and Cambridge University Boat Race,1895,
4,91,Le manoir du diable,1896,"[short, horror]"


In [3]:
ratings = pd.read_parquet('data/ratings.parquet')
print(f'n_ratings = {ratings.shape[0]}')
ratings.head()

n_ratings = 867696


Unnamed: 0,user_id,imdb_id,rating,date_time
0,1,114508,8,2013-10-05 21:00:50
1,2,208092,5,2020-04-09 21:01:12
2,2,358273,9,2020-01-15 03:10:27
3,2,10039344,5,2020-01-09 20:50:53
4,2,6199572,3,2020-05-14 18:54:43


In [4]:
users = pd.read_parquet('data/users.parquet')
print(f'n_users = {users.shape[0]}')
users.head()

n_users = 67630


Unnamed: 0,user_id,twitter_id
0,1,139564917
1,2,522540374
2,3,475571186
3,4,215022153
4,5,349681331


## Movies

### Ontbrekend

In [5]:
mis_movies = missing(movies)
mis_movies

Unnamed: 0,imdb_id,title,year,genres
3,25,The Oxford and Cambridge University Boat Race,1895,
8,443,"Hiawatha, the Messiah of the Ojibway",1903,
32,5530,L'héroïsme de Paddy,1915,
51,9340,The Man Who Woke Up,1918,
80,12844,White and Unmarried,1921,
...,...,...,...,...
35540,11316824,Happy Old Year,2019,
35556,11433098,Rob Delaney: Jackie,2020,
35562,11515458,Al-Zaeem,1993,
35569,11566166,Mr. Zoo: The Missing VIP,2020,


In [6]:
mis_movies['genres'].unique()

array([None], dtype=object)

In [7]:
mis_movies['imdb_id'].apply(movie_url)

3         https://www.imdb.com/title/tt0000025/
8         https://www.imdb.com/title/tt0000443/
32        https://www.imdb.com/title/tt0005530/
51        https://www.imdb.com/title/tt0009340/
80        https://www.imdb.com/title/tt0012844/
                          ...                  
35540    https://www.imdb.com/title/tt11316824/
35556    https://www.imdb.com/title/tt11433098/
35562    https://www.imdb.com/title/tt11515458/
35569    https://www.imdb.com/title/tt11566166/
35610    https://www.imdb.com/title/tt12194082/
Name: imdb_id, Length: 251, dtype: object

### Foutief

In [8]:
duplicates = movies[movies['imdb_id'].duplicated()]['imdb_id'].values
movies[movies['imdb_id'].isin(duplicates)]

Unnamed: 0,imdb_id,title,year,genres
8137,106519,Carlito's Way,1993,
8138,106519,Carlito's Way,1993,"[crime, drama, thriller]"
21707,1979376,Toy Story 4,2019,"[animation, adventure, comedy, family, fantasy]"
21708,1979376,Toy Story 4,2019,"[animation, adventure, comedy, family, fantasy]"


In [9]:
movies = movies.drop_duplicates('imdb_id')
print(f'n_movies = {movies.shape[0]}')
movies[movies['imdb_id'].isin(duplicates)]

n_movies = 35611


Unnamed: 0,imdb_id,title,year,genres
8137,106519,Carlito's Way,1993,
21707,1979376,Toy Story 4,2019,"[animation, adventure, comedy, family, fantasy]"


In [10]:
movies['year'].describe()

count    35611.000000
mean      2000.186543
std         21.058439
min       1878.000000
25%       1992.000000
50%       2009.000000
75%       2014.000000
max       2021.000000
Name: year, dtype: float64

In [11]:
set(np.concatenate(movies['genres'].dropna().values))

{'action',
 'adult',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'game_show',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'news',
 'reality_tv',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'talk_show',
 'thriller',
 'war',
 'western'}

## Ratings

### Ontbrekend

In [12]:
missing(ratings)

Unnamed: 0,user_id,imdb_id,rating,date_time


### Foutief

In [13]:
ratings[ratings.duplicated(['user_id', 'imdb_id'])]

Unnamed: 0,user_id,imdb_id,rating,date_time


In [14]:
ratings['rating'].describe()

count    867696.000000
mean          7.315751
std           1.853804
min           0.000000
25%           6.000000
50%           8.000000
75%           9.000000
max          10.000000
Name: rating, dtype: float64

## Users

### Ontbrekend

In [15]:
missing(users)

Unnamed: 0,user_id,twitter_id


### Foutief

In [16]:
users[users['user_id'].duplicated()]

Unnamed: 0,user_id,twitter_id


In [17]:
duplicates = users[users['twitter_id'].duplicated()]['twitter_id'].values
dup_users = users[users['twitter_id'].isin(duplicates)]
dup_users.sort_values(by=['twitter_id', 'user_id'])

Unnamed: 0,user_id,twitter_id
49484,49485,620953
67522,67523,620953
20252,20253,718993
32327,32328,718993
10494,10495,743053
...,...,...
55972,55973,1225774491122442244
3584,3585,1225774932124209154
12908,12909,1225774932124209154
24273,24274,1226401983650623493


In [18]:
dup_users.groupby('twitter_id').count().describe()

Unnamed: 0,user_id
count,4832.0
mean,2.0
std,0.0
min,2.0
25%,2.0
50%,2.0
75%,2.0
max,2.0


In [19]:
users['user_id'].is_monotonic

True

In [20]:
users = users.drop_duplicates('twitter_id')
dup_groups = [sorted(group['user_id'].values) for _twitter_id, group in dup_users.groupby('twitter_id')]
dup_reps = {dup_id: unq_id for [unq_id, *dup_ids] in dup_groups for dup_id in dup_ids}
ratings = (
    ratings
        .replace({'user_id': dup_reps})
        .drop_duplicates(['user_id', 'imdb_id'])
)

# Bewaren

In [21]:
movies.to_parquet('data/movies_clean.parquet')
ratings.to_parquet('data/ratings_clean.parquet')
users.to_parquet('data/users_clean.parquet')

# Conclusies

## Movies
Ontbrekend: genres 251 films (niet opgelost). Foutief: 2 dubbele films (opgelost), nog te verschijnen films (de dataset bestaat uit beoordeelde films, niet opgelost).

## Users
Foutief: 4832 dubbele gebruikers (opgelost).