# Converteer de MovieTweetings `.dat`-bestanden naar Parquet

In [1]:
import numpy as np
import pandas as pd

def read_movies(file_name):
    """Read a ``::``-delimited movies file into a DataFrame.

    Every input line is expected to hold three fields,
    ``imdb_id::Title (YYYY)::Genre1|Genre2``.

    Parameters
    ----------
    file_name : str or path-like
        Location of the movies file; it is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        Columns ``imdb_id``, ``title`` (the text in front of the last
        `` (YYYY)`` group), ``year`` (integer) and ``genres`` (list of
        lower-cased genre names with ``-`` replaced by ``_``; missing when
        the genre field is empty).

    Notes
    -----
    A title without a `` (YYYY)`` group yields a missing year, and the
    integer cast below then fails instead of silently producing bad data.
    """
    # A multi-character separator is only supported by the python engine.
    movies = pd.read_table(
        file_name, encoding='utf_8', engine='python', sep='::', names=['imdb_id', 'title', 'genres']
    )
    titles = (
        movies['title']
            # Named groups become the column names of the extracted frame.
            .str.extract(r'(?P<title>.+) \((?P<year>\d{4})\)')
            # ``np.int`` was merely an alias of the builtin ``int`` and has
            # been removed from NumPy; the builtin gives the same dtype.
            .astype({'year': int})
    )
    genres = (
        movies['genres']
            .str.lower()
            # Literal replacement; no regular expression is intended here.
            .str.replace('-', '_', regex=False)
            .str.split('|')
    )
    return (
        movies
            .drop(columns=['title', 'genres'])
            .join(titles)
            .join(genres)
    )

def read_ratings(file_name):
    """Read a ``::``-delimited ratings file into a DataFrame.

    Every input line is expected to hold four fields,
    ``user_id::imdb_id::rating::unix_timestamp``.

    Parameters
    ----------
    file_name : str or path-like
        Location of the ratings file.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``imdb_id``, ``rating`` and ``date_time``, where
        ``date_time`` holds timestamps converted from seconds since the Unix
        epoch.
    """
    # A multi-character separator is only supported by the python engine.
    ratings = pd.read_table(
        file_name, engine='python', sep='::', names=['user_id', 'imdb_id', 'rating', 'date_time']
    )
    # Convert after loading rather than through ``date_parser``: that argument
    # is deprecated in recent pandas, and it may hand the parser strings, for
    # which ``to_datetime(..., unit='s')`` is deprecated as well. Here the
    # column has already been parsed as numbers.
    return ratings.assign(
        date_time=pd.to_datetime(ratings['date_time'], origin='unix', unit='s')
    )

def read_users(file_name):
    """Load a ``::``-delimited users file as a two-column DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Location of the users file.

    Returns
    -------
    pandas.DataFrame
        One row per input line with the columns ``user_id`` and
        ``twitter_id``.
    """
    column_names = ['user_id', 'twitter_id']
    # A multi-character separator is only supported by the python engine.
    users = pd.read_table(file_name, sep='::', names=column_names, engine='python')
    return users

In [2]:
# Parse movies.dat with read_movies and show the resulting frame as the
# cell output.
movies = read_movies('MovieTweetings/latest/movies.dat')
movies

Unnamed: 0,imdb_id,title,year,genres
0,8,Edison Kinetoscopic Record of a Sneeze,1894,"[documentary, short]"
1,10,La sortie des usines Lumière,1895,"[documentary, short]"
2,12,The Arrival of a Train,1896,"[documentary, short]"
3,25,The Oxford and Cambridge University Boat Race,1895,
4,91,Le manoir du diable,1896,"[short, horror]"
...,...,...,...,...
35608,12133722,Have a Good Trip: Adventures in Psychedelics,2020,[documentary]
35609,12192654,Planet of the Humans,2019,[documentary]
35610,12194082,No Hunger in Paradise,2017,
35611,12221748,Becoming,2020,[documentary]


In [3]:
# Parse ratings.dat with read_ratings and show the resulting frame as the
# cell output.
ratings = read_ratings('MovieTweetings/latest/ratings.dat')
ratings

Unnamed: 0,user_id,imdb_id,rating,date_time
0,1,114508,8,2013-10-05 21:00:50
1,2,208092,5,2020-04-09 21:01:12
2,2,358273,9,2020-01-15 03:10:27
3,2,10039344,5,2020-01-09 20:50:53
4,2,6199572,3,2020-05-14 18:54:43
...,...,...,...,...
867691,67628,9071322,5,2020-02-01 00:37:28
867692,67628,9898858,3,2020-04-04 00:00:52
867693,67629,172495,10,2020-04-17 07:03:35
867694,67629,414387,10,2020-04-17 07:17:32


In [4]:
# Parse users.dat with read_users and show the resulting frame as the
# cell output.
users = read_users('MovieTweetings/latest/users.dat')
users

Unnamed: 0,user_id,twitter_id
0,1,139564917
1,2,522540374
2,3,475571186
3,4,215022153
4,5,349681331
...,...,...
67625,67626,441446292
67626,67627,36878476
67627,67628,330301436
67628,67629,1244805465323835397


In [5]:
# Write each of the three frames to its own Parquet file under data/.
# NOTE(review): this presumably requires the data/ directory to exist
# already - confirm before running on a fresh checkout.
movies.to_parquet('data/movies.parquet')
ratings.to_parquet('data/ratings.parquet')
users.to_parquet('data/users.parquet')