In [1]:
import pandas as pd
import numpy as np
import re, os
from pathlib import Path
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path

### Looking at the first few lines, we notice a peculiar structure. There are two types of lines: one with just the movie_id followed by a colon, and the other with the user_id, rating and date for that movie_id.

In [2]:
# Location of the combined ratings text file, relative to the notebook's working directory.
# NOTE(review): the name `rating_data` is rebound to a list of tuples by the parsing cell
# further down, so re-running the file-reading cell after that point will fail.
rating_data = Path('../data/training_set/combined.txt')
# files = list(rating_data.glob('*.txt'))

In [3]:
# Pull every line of the ratings file into memory (newlines kept);
# bytes that cannot be decoded as UTF-8 are silently dropped.
with open(rating_data, mode='r', encoding='utf-8', errors='ignore') as fh:
    data = list(fh)

In [6]:
# Peek at the first ten raw lines.
data[:10]

['15739:\n',
 '1604980,4,2005-07-08\n',
 '2217939,3,2005-07-18\n',
 '2580478,4,2005-07-25\n',
 '1634362,3,2005-08-01\n',
 '314837,4,2005-08-09\n',
 '1850680,4,2005-08-14\n',
 '2238856,4,2005-08-31\n',
 '1332360,5,2005-02-06\n',
 '1277134,3,2005-04-08\n']

### Loading a text file, followed by stripping the newline character

In [4]:
# Remove leading/trailing whitespace (including the trailing newline) from every line.
data = [line.strip() for line in data]

In [6]:
# Total number of lines read (movie header lines plus rating lines).
len(data)

100498277

In [5]:
# Peek at the first eleven stripped lines.
data[:11]

['1:',
 '1488844,3,2005-09-06',
 '822109,5,2005-05-13',
 '885013,4,2005-10-19',
 '30878,4,2005-12-26',
 '823519,3,2004-05-03',
 '893988,3,2005-11-17',
 '124105,4,2004-08-05',
 '1248029,3,2004-04-22',
 '1842128,4,2004-05-09',
 '2238063,3,2005-05-11']

### First we make a list of all movie_ids and compare them with the movie_ids from the first part.

In [8]:
# Collect the movie ids from the header lines, i.e. lines of the form "<digits>:".
# The pattern is a raw string (a plain '\d' is an invalid escape sequence) compiled
# once outside the loop, and fullmatch() anchors it to the whole line so that only
# lines consisting solely of digits plus a colon are treated as headers. That is
# also what the d[:-1] slice below assumes (the colon is the last character).
movie_header = re.compile(r'\d+:')
movie_list = []
for d in tqdm(data, total=len(data)):
    if movie_header.fullmatch(d):
        movie_list.append(d[:-1])  # drop the trailing colon

100%|██████████| 5390/5390 [00:00<00:00, 499134.49it/s]


In [9]:
# Show the first few collected movie ids (still strings at this point).
movie_list[:5]

['15739']

### Loading movie_ids from first part

In [11]:
# Load the cleaned movie titles table, using the movie_id column as the index.
# NOTE(review): this path starts with './data' while the ratings path starts with
# '../data' — confirm both resolve from the same working directory.
movie_data = pd.read_csv('./data/movie_titles_clean.csv',index_col='movie_id')

In [12]:
# Preview the first rows of the movie table.
movie_data.head()

Unnamed: 0_level_0,year,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003,Dinosaur Planet
2,2004,Isle of Man TT 2004 Review
3,1997,Character
4,1994,Paula Abdul's Get Up & Dance
5,2004,The Rise and Fall of ECW


In [13]:
# Inspect the movie_id index (its dtype and length).
movie_data.index

Int64Index([    1,     2,     3,     4,     5,     6,     7,     8,     9,
               10,
            ...
            17761, 17762, 17763, 17764, 17765, 17766, 17767, 17768, 17769,
            17770],
           dtype='int64', name='movie_id', length=17763)

In [13]:
# Convert the string movie ids to integers so they can be compared with the integer index of movie_data.
movie_list_int = list(map(int, movie_list))

### Here we take the difference of the two lists of movie_ids. While many movies don't have a rating (first command), there is only one movie in the rating list that doesn't have movie information (second command).

In [None]:
# Movie ids present in the titles table but absent from the ratings file.
set(movie_data.index).difference(movie_list_int)

In [None]:
# Movie ids present in the ratings file but absent from the titles table.
set(movie_list_int).difference(movie_data.index)

### In the following we read the dataset. First we search for the movie_id line. When we find such a line (`f is not None`), we store the id in the 'movie' variable. Otherwise it's a rating line: we split it on commas and combine it with the movie_id.

In [7]:
# Build one (movie_id, user_id, rating, date) tuple per rating line, then load the
# tuples into a DataFrame. All fields are kept as strings at this stage.
#
# A header line ("<digits>:") sets the current movie id; every following line is a
# "user_id,rating,date" record belonging to that movie.
# - The header pattern is a raw string compiled once, matched against the whole line.
# - Blank lines are skipped instead of producing malformed one-field tuples.
# - A rating line that appears before any header raises a clear ValueError instead
#   of a NameError on the unbound `movie` variable.
movie_header = re.compile(r'\d+:')
movie = None
rating_data = []
for d in tqdm(data, total=len(data)):
    if not d:
        continue
    if movie_header.fullmatch(d):
        movie = d[:-1]  # drop the trailing colon
        continue
    if movie is None:
        raise ValueError(f'rating line {d!r} appears before any movie header')
    rating_data.append((movie, *d.split(',')))
rating_df = pd.DataFrame(rating_data, columns=['movie_id', 'user_id', 'rating', 'date'])

 43%|████▎     | 42768085/100498277 [10:02<13:32, 71020.60it/s]  


KeyboardInterrupt: 

### Now we have list of tuples with the format (movie_id, user_id, rating, date). We make a pandas dataframe from it. 

In [14]:
# Build the ratings DataFrame from the parsed tuples.
# NOTE(review): this repeats the construction done on the last line of the parsing
# cell; it is only needed when that cell was interrupted before reaching it.
rating_df = pd.DataFrame(rating_data,columns=['movie_id','user_id','rating','date'])

In [15]:
# Preview the first parsed ratings.
rating_df.head()

Unnamed: 0,movie_id,user_id,rating,date
0,15739,1604980,4,2005-07-08
1,15739,2217939,3,2005-07-18
2,15739,2580478,4,2005-07-25
3,15739,1634362,3,2005-08-01
4,15739,314837,4,2005-08-09


In [17]:
# Number of distinct users in the parsed ratings.
rating_df.user_id.nunique()

470758

In [18]:
# Number of distinct movies in the parsed ratings.
rating_df.movie_id.nunique()

4499

### Dropping the movie for which info is not available

In [19]:
# Drop the ratings of movie 4388, for which no movie information is available.
# movie_id still holds strings at this point (the numeric conversion happens in a
# later cell), so comparing against the integer 4388 would never match and nothing
# would be removed; compare against the string form instead.
rating_df = rating_df[rating_df['movie_id'] != '4388']

### Here we ensure that there is only one rating for each movie_id and user_id by first grouping the data by movie_id, user_id and counting the number of ratings.

In [20]:
# Count how many ratings exist for each (movie_id, user_id) pair.
uniq_rating_df = (
    rating_df
    .groupby(by=['movie_id', 'user_id'])['rating']
    .count()
)

### Checking if we have only 1 rating for each combination

In [21]:
# Show every (movie_id, user_id) pair whose rating count is not exactly 1;
# an empty result means there are no duplicate ratings.
uniq_rating_df[uniq_rating_df!=1]

Series([], Name: rating, dtype: int64)

In [22]:
# Give the row index an explicit name.
rating_df.index.name = 'index'

In [33]:
# rating_df.to_csv('rating.csv')
# Discard the date column, then convert each remaining column to a numeric dtype.
rating_df = rating_df.drop(columns='date').apply(pd.to_numeric)

### Doing the above for other three files

In [27]:
# Read the three remaining rating files into a single list of lines, printing the
# running line count after each file.
# NOTE(review): file2, file3 and file4 are not defined in any visible cell — they
# must be assigned before this cell can run.
data2 = []
for extra_file in (file2, file3, file4):
    with open(extra_file, 'r', encoding='utf-8', errors='ignore') as fh:
        data2.extend(fh.readlines())
    print(len(data2))

26982302
49588088
76440014


In [28]:
# Remove leading/trailing whitespace (including the trailing newline) from every line.
data2 = [line.strip() for line in data2]

In [29]:
# Build one (movie_id, user_id, rating, date) tuple per rating line of the remaining
# files. All fields are kept as strings at this stage.
#
# A header line ("<digits>:") sets the current movie id; every following line is a
# "user_id,rating,date" record belonging to that movie.
# - The header pattern is a raw string compiled once, matched against the whole line.
# - Blank lines are skipped instead of producing malformed one-field tuples.
# - A rating line that appears before any header raises a clear ValueError instead
#   of silently reusing (or failing on) a `movie` value left over from another cell.
movie_header = re.compile(r'\d+:')
movie = None
rating_data2 = []
for d in tqdm(data2, total=len(data2)):
    if not d:
        continue
    if movie_header.fullmatch(d):
        movie = d[:-1]  # drop the trailing colon
        continue
    if movie is None:
        raise ValueError(f'rating line {d!r} appears before any movie header')
    rating_data2.append((movie, *d.split(',')))

100%|██████████| 76440014/76440014 [05:05<00:00, 250115.44it/s]


In [38]:
# Load the tuples parsed from the remaining files into a DataFrame with the same four columns.
rating2_df = pd.DataFrame(rating_data2,columns=['movie_id','user_id','rating','date'])

In [40]:
# Discard the date column, then convert each remaining column to a numeric dtype.
rating2_df = rating2_df.drop(columns='date').apply(pd.to_numeric)

### Concatenating Datasets

In [41]:
# Stack both rating frames into one. ignore_index=True renumbers the rows 0..n-1;
# without it the row labels of the two inputs are kept as-is, so the combined frame
# would carry duplicate index values into the exported files.
all_rating_df = pd.concat([rating_df, rating2_df], ignore_index=True)

In [42]:
# Give the row index an explicit name.
all_rating_df.index.name = 'index'

In [43]:
# Preview the combined ratings.
all_rating_df.head()

Unnamed: 0_level_0,movie_id,user_id,rating
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1488844,3
1,1,822109,5
2,1,885013,4
3,1,30878,4
4,1,823519,3


In [47]:
# Convert the combined DataFrame to a pyarrow Table.
table = pa.Table.from_pandas(all_rating_df)

In [48]:
# Write the table to a Parquet file in the current working directory.
pq.write_table(table, 'all_ratings.parquet')

In [None]:
# Also export the combined ratings as CSV in the current working directory.
all_rating_df.to_csv('all_rating.csv')