In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Leitura dos Dados

## Movie Lens 100K

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.
 
This data set consists of:

	* 100,000 ratings (1-5) from 943 users on 1682 movies.

	* Each user has rated at least 20 movies. 

    * Simple demographic info for the users (age, gender, occupation, zip)


In [3]:
# Load Ratings (u.data)
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('Dados/100K/u.data', sep='\t', names=r_cols, encoding='latin-1')

# Load genres
i_cols = ['movie_id', 'title' ,'release date','video release date', 'IMDb URL', 'Unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('Dados/100K/u.item', sep='|', names=i_cols,
 encoding='latin-1')

# Load users
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('Dados/100K/u.user', sep='|', names=u_cols, encoding='latin-1')


In [4]:
items

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,Unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [6]:
users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [7]:
# Merge users and ratings on user_id
movies_users_ratings = pd.merge(users, ratings, on='user_id')
# Merge movies_users_ratings and items on movie_id
items_merged = pd.merge(movies_users_ratings, items, on='movie_id')
items_merged

Unnamed: 0,user_id,age,gender,occupation,zip_code,movie_id,rating,unix_timestamp,title,release date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,M,technician,85711,61,4,878542420,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
1,13,47,M,educator,29206,61,4,882140552,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
2,18,35,F,other,37212,61,4,880130803,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
3,58,27,M,programmer,52246,61,5,884305271,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
4,59,49,M,educator,08403,61,4,888204597,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,17,M,student,60089,1679,3,889289491,B. Monkey (1998),06-Feb-1998,...,0,0,0,0,0,1,0,1,0,0
99996,863,17,M,student,60089,1678,1,889289570,Mat' i syn (1997),06-Feb-1998,...,0,0,0,0,0,0,0,0,0,0
99997,863,17,M,student,60089,1680,2,889289570,Sliding Doors (1998),01-Jan-1998,...,0,0,0,0,0,1,0,0,0,0
99998,896,28,M,writer,91505,1681,3,887160722,You So Crazy (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0


In [8]:
items_merged.columns

Index(['user_id', 'age', 'gender', 'occupation', 'zip_code', 'movie_id',
       'rating', 'unix_timestamp', 'title', 'release date',
       'video release date', 'IMDb URL', 'Unknown', 'Action', 'Adventure',
       'Animation', 'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [9]:
items_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             100000 non-null  int64  
 1   age                 100000 non-null  int64  
 2   gender              100000 non-null  object 
 3   occupation          100000 non-null  object 
 4   zip_code            100000 non-null  object 
 5   movie_id            100000 non-null  int64  
 6   rating              100000 non-null  int64  
 7   unix_timestamp      100000 non-null  int64  
 8   title               100000 non-null  object 
 9   release date        99991 non-null   object 
 10  video release date  0 non-null       float64
 11  IMDb URL            99987 non-null   object 
 12  Unknown             100000 non-null  int64  
 13  Action              100000 non-null  int64  
 14  Adventure           100000 non-null  int64  
 15  Animation           100000 non-null

In [10]:
items_merged.isnull().sum()

user_id                    0
age                        0
gender                     0
occupation                 0
zip_code                   0
movie_id                   0
rating                     0
unix_timestamp             0
title                      0
release date               9
video release date    100000
IMDb URL                  13
Unknown                    0
Action                     0
Adventure                  0
Animation                  0
Children's                 0
Comedy                     0
Crime                      0
Documentary                0
Drama                      0
Fantasy                    0
Film-Noir                  0
Horror                     0
Musical                    0
Mystery                    0
Romance                    0
Sci-Fi                     0
Thriller                   0
War                        0
Western                    0
dtype: int64

In [11]:
items_merged = items_merged.drop(['video release date', 'IMDb URL'], axis=1)
items_merged.columns

Index(['user_id', 'age', 'gender', 'occupation', 'zip_code', 'movie_id',
       'rating', 'unix_timestamp', 'title', 'release date', 'Unknown',
       'Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [12]:
items_merged['occupation'].value_counts()

occupation
student          21957
other            10663
educator          9442
engineer          8175
programmer        7801
administrator     7479
writer            5536
librarian         5273
technician        3506
executive         3403
healthcare        2804
artist            2308
entertainment     2095
scientist         2058
marketing         1950
retired           1609
lawyer            1345
none               901
salesman           856
doctor             540
homemaker          299
Name: count, dtype: int64

In [13]:

# Inicializar o OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop=None)

# Ajustar e transformar a coluna 'occupation'
encoded_occupation = encoder.fit_transform(items_merged[['occupation']])

# Criar um DataFrame com as novas colunas
encoded_occupation_df = pd.DataFrame(encoded_occupation, columns=encoder.get_feature_names_out(['occupation']))

# Concatenar o novo DataFrame ao original, removendo a coluna 'occupation' original
items_merged = pd.concat([items_merged.drop('occupation', axis=1), encoded_occupation_df], axis=1)

# Exibir as primeiras linhas para conferir o resultado
items_merged.head()

Unnamed: 0,user_id,age,gender,zip_code,movie_id,rating,unix_timestamp,title,release date,Unknown,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,1,24,M,85711,61,4,878542420,Three Colors: White (1994),01-Jan-1994,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,13,47,M,29206,61,4,882140552,Three Colors: White (1994),01-Jan-1994,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18,35,F,37212,61,4,880130803,Three Colors: White (1994),01-Jan-1994,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,58,27,M,52246,61,5,884305271,Three Colors: White (1994),01-Jan-1994,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,59,49,M,8403,61,4,888204597,Three Colors: White (1994),01-Jan-1994,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Exibir as primeiras linhas para conferir o resultado
items_merged.columns

Index(['user_id', 'age', 'gender', 'zip_code', 'movie_id', 'rating',
       'unix_timestamp', 'title', 'release date', 'Unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'occupation_administrator', 'occupation_artist', 'occupation_doctor',
       'occupation_educator', 'occupation_engineer',
       'occupation_entertainment', 'occupation_executive',
       'occupation_healthcare', 'occupation_homemaker', 'occupation_lawyer',
       'occupation_librarian', 'occupation_marketing', 'occupation_none',
       'occupation_other', 'occupation_programmer', 'occupation_retired',
       'occupation_salesman', 'occupation_scientist', 'occupation_student',
       'occupation_technician', 'occupation_writer'],
      dtype='object')

In [15]:
items_merged['release date'].value_counts()

release date
01-Jan-1995    9932
01-Jan-1994    8890
01-Jan-1997    7078
01-Jan-1993    6657
01-Jan-1989    2360
               ... 
11-Feb-1998       1
17-May-1996       1
26-Oct-1996       1
02-Apr-1996       1
30-Apr-1997       1
Name: count, Length: 240, dtype: int64

In [16]:
# Converter a coluna 'release date' para datetime
items_merged['release date'] = pd.to_datetime(items_merged['release date'], format='%d-%b-%Y', errors='coerce')

# Extrair o ano da 'release date'
items_merged['release_year'] = items_merged['release date'].dt.year

# Calcular a mediana da coluna 'release_year'
median_year = items_merged['release_year'].median()

# Substituir valores nulos pela mediana
items_merged['release_year'].fillna(median_year, inplace=True)

# Verificar as primeiras linhas para conferir o resultado
items_merged[['release date', 'release_year']].head()

Unnamed: 0,release date,release_year
0,1994-01-01,1994.0
1,1994-01-01,1994.0
2,1994-01-01,1994.0
3,1994-01-01,1994.0
4,1994-01-01,1994.0


In [17]:
items_merged = items_merged.drop(['release date'], axis=1)
items_merged

Unnamed: 0,user_id,age,gender,zip_code,movie_id,rating,unix_timestamp,title,Unknown,Action,...,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,release_year
0,1,24,M,85711,61,4,878542420,Three Colors: White (1994),0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1994.0
1,13,47,M,29206,61,4,882140552,Three Colors: White (1994),0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0
2,18,35,F,37212,61,4,880130803,Three Colors: White (1994),0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0
3,58,27,M,52246,61,5,884305271,Three Colors: White (1994),0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0
4,59,49,M,08403,61,4,888204597,Three Colors: White (1994),0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,17,M,60089,1679,3,889289491,B. Monkey (1998),0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1998.0
99996,863,17,M,60089,1678,1,889289570,Mat' i syn (1997),0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1998.0
99997,863,17,M,60089,1680,2,889289570,Sliding Doors (1998),0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1998.0
99998,896,28,M,91505,1681,3,887160722,You So Crazy (1994),0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1994.0


In [18]:
# Usar map para converter 'M' em 0 e 'F' em 1
items_merged['gender'] = items_merged['gender'].map({'M': 0, 'F': 1})

# Verificar o resultado
items_merged[['gender']].head()

Unnamed: 0,gender
0,0
1,0
2,1
3,0
4,0


In [19]:
items_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 49 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   100000 non-null  int64  
 1   age                       100000 non-null  int64  
 2   gender                    100000 non-null  int64  
 3   zip_code                  100000 non-null  object 
 4   movie_id                  100000 non-null  int64  
 5   rating                    100000 non-null  int64  
 6   unix_timestamp            100000 non-null  int64  
 7   title                     100000 non-null  object 
 8   Unknown                   100000 non-null  int64  
 9   Action                    100000 non-null  int64  
 10  Adventure                 100000 non-null  int64  
 11  Animation                 100000 non-null  int64  
 12  Children's                100000 non-null  int64  
 13  Comedy                    100000 non-null  in

In [20]:
items_merged['zip_code'].value_counts()

zip_code
55414    1103
20009     878
10019     850
22902     832
61820     817
         ... 
21206      20
01945      20
42141      20
55013      20
23112      20
Name: count, Length: 795, dtype: int64

In [21]:
items_merged = items_merged.drop(['zip_code'], axis=1)
items_merged.columns

Index(['user_id', 'age', 'gender', 'movie_id', 'rating', 'unix_timestamp',
       'title', 'Unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western', 'occupation_administrator', 'occupation_artist',
       'occupation_doctor', 'occupation_educator', 'occupation_engineer',
       'occupation_entertainment', 'occupation_executive',
       'occupation_healthcare', 'occupation_homemaker', 'occupation_lawyer',
       'occupation_librarian', 'occupation_marketing', 'occupation_none',
       'occupation_other', 'occupation_programmer', 'occupation_retired',
       'occupation_salesman', 'occupation_scientist', 'occupation_student',
       'occupation_technician', 'occupation_writer', 'release_year'],
      dtype='object')

In [22]:
items_merged['title'].value_counts()

title
Star Wars (1977)                                583
Contact (1997)                                  509
Fargo (1996)                                    508
Return of the Jedi (1983)                       507
Liar Liar (1997)                                485
                                               ... 
Promise, The (Versprechen, Das) (1994)            1
Yankee Zulu (1994)                                1
Eye of Vichy, The (Oeil de Vichy, L') (1993)      1
Lashou shentan (1992)                             1
Scream of Stone (Schrei aus Stein) (1991)         1
Name: count, Length: 1664, dtype: int64

In [23]:
# Função para remover o ano em parênteses do título
def remove_year(title):
    return re.sub(r'\s\(\d{4}\)$', '', title)

# Aplicar a função à coluna 'movie_title'
items_merged['movie_title_cleaned'] = items_merged['title'].apply(remove_year)

# Verificar os primeiros títulos para confirmar
items_merged[['title', 'movie_title_cleaned']].value_counts()

title                               movie_title_cleaned        
Star Wars (1977)                    Star Wars                      583
Contact (1997)                      Contact                        509
Fargo (1996)                        Fargo                          508
Return of the Jedi (1983)           Return of the Jedi             507
Liar Liar (1997)                    Liar Liar                      485
                                                                  ... 
Great Day in Harlem, A (1994)       Great Day in Harlem, A           1
Other Voices, Other Rooms (1997)    Other Voices, Other Rooms        1
Good Morning (1971)                 Good Morning                     1
Girls Town (1996)                   Girls Town                       1
Á köldum klaka (Cold Fever) (1994)  Á köldum klaka (Cold Fever)      1
Name: count, Length: 1664, dtype: int64

In [24]:
items_merged

Unnamed: 0,user_id,age,gender,movie_id,rating,unix_timestamp,title,Unknown,Action,Adventure,...,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,release_year,movie_title_cleaned
0,1,24,0,61,4,878542420,Three Colors: White (1994),0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1994.0,Three Colors: White
1,13,47,0,61,4,882140552,Three Colors: White (1994),0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0,Three Colors: White
2,18,35,1,61,4,880130803,Three Colors: White (1994),0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0,Three Colors: White
3,58,27,0,61,5,884305271,Three Colors: White (1994),0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0,Three Colors: White
4,59,49,0,61,4,888204597,Three Colors: White (1994),0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0,Three Colors: White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,17,0,1679,3,889289491,B. Monkey (1998),0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1998.0,B. Monkey
99996,863,17,0,1678,1,889289570,Mat' i syn (1997),0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1998.0,Mat' i syn
99997,863,17,0,1680,2,889289570,Sliding Doors (1998),0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1998.0,Sliding Doors
99998,896,28,0,1681,3,887160722,You So Crazy (1994),0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1994.0,You So Crazy


In [25]:
items_merged.columns

Index(['user_id', 'age', 'gender', 'movie_id', 'rating', 'unix_timestamp',
       'title', 'Unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western', 'occupation_administrator', 'occupation_artist',
       'occupation_doctor', 'occupation_educator', 'occupation_engineer',
       'occupation_entertainment', 'occupation_executive',
       'occupation_healthcare', 'occupation_homemaker', 'occupation_lawyer',
       'occupation_librarian', 'occupation_marketing', 'occupation_none',
       'occupation_other', 'occupation_programmer', 'occupation_retired',
       'occupation_salesman', 'occupation_scientist', 'occupation_student',
       'occupation_technician', 'occupation_writer', 'release_year',
       'movie_title_cleaned'],
      dtype='object')

In [41]:
print(items_merged['Drama'].value_counts())

Drama
0    60105
1    39895
Name: count, dtype: int64


In [42]:
# Verificar quantos valores Unknown == 1 existem
unknown_mask = items_merged['Unknown'] == 1

# Transferir os valores de Unknown para Drama
items_merged.loc[unknown_mask, 'Drama'] = 1

# Remover a coluna Unknown, pois não é mais necessária
items_merged = items_merged.drop(columns=['Unknown'])

# Conferir se a transferência foi feita corretamente
items_merged['Drama'].value_counts()

Drama
0    60095
1    39905
Name: count, dtype: int64

In [27]:
def clean_100k():
    items_merged_100k = load_100K()

    # Inicializar o OneHotEncoder
    encoder = OneHotEncoder(sparse_output=False, drop=None)
    # Ajustar e transformar a coluna 'occupation'
    encoded_occupation = encoder.fit_transform(items_merged_100k[['occupation']])
    # Criar um DataFrame com as novas colunas
    encoded_occupation_df = pd.DataFrame(encoded_occupation, columns=encoder.get_feature_names_out(['occupation']))
    # Concatenar o novo DataFrame ao original, removendo a coluna 'occupation' original
    items_merged_100k = pd.concat([items_merged_100k, encoded_occupation_df], axis=1)

    # Converter a coluna 'release date' para datetime
    items_merged_100k['release date'] = pd.to_datetime(items_merged_100k['release date'], format='%d-%b-%Y', errors='coerce')
    # Extrair o ano da 'release date'
    items_merged_100k['release_year'] = items_merged_100k['release date'].dt.year
    # Calcular a mediana da coluna 'release_year'
    median_year = items_merged_100k['release_year'].median()
    # Substituir valores nulos pela mediana
    items_merged_100k['release_year'].fillna(median_year, inplace=True)

    # Usar map para converter 'M' em 0 e 'F' em 1
    items_merged['gender'] = items_merged['gender'].map({'M': 0, 'F': 1})

    # Dropar as colunas que não vamos usar
    clean_merged_100k = items_merged_100k.drop(['occupation', 'release date', 'video release date', 'IMDb URL', 'zip_code'], axis=1)

    return clean_merged_100k

## Movie Lens 1M

MovieLens 1M movie ratings. Stable benchmark dataset. 1 million ratings from 6000 users on 4000 movies. Released 2/2003.

In [28]:
# Carregando os dados de ratings
rnames1 = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings1 = pd.read_table('Dados/1M/ratings.dat', sep='::', header=None, names=rnames1, engine="python")

# Carregando os dados de movies
mnames1 = ['movie_id', 'title', 'genres']
items1 = pd.read_table("Dados/1M/movies.dat", sep="::", header=None, names=mnames1, engine="python", encoding='latin1')

# Carregando os dados de users
unames1 = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
users1 = pd.read_table('Dados/1M/users.dat', sep='::', header=None, names=unames1, engine="python")

In [29]:
items1

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [30]:
ratings1

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [31]:
users1

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [32]:
# Merge users and ratings on user_id
movies_users_ratings1 = pd.merge(users1, ratings1, on='user_id')
# Merge movies_users_ratings and items on movie_id
items_merged1 = pd.merge(movies_users_ratings1, items1, on='movie_id')
items_merged1.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code,movie_id,rating,unix_timestamp,title,genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [33]:
items_merged1.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code,movie_id,rating,unix_timestamp,title,genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [34]:
# Occupation
# Inicializar o OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop=None)

# Dicionário para mapear os códigos de ocupação com os nomes descritivos
occupation_map = {
    0: "other",
    1: "academic",
    2: "artist",
    3: "clerical",
    4: "college",
    5: "customer_service",
    6: "doctor",
    7: "executive",
    8: "farmer",
    9: "homemaker",
    10: "K-12_student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales",
    15: "scientist",
    16: "self-employed",
    17: "technician",
    18: "tradesman",
    19: "unemployed",
    20: "writer"
}

# Mapeamento dos valores numéricos para os nomes das ocupações
items_merged1['occupation_name'] = items_merged1['occupation'].map(occupation_map)
# Ajustar e transformar a coluna 'occupation_name' com OneHotEncoder
encoded_occupation = encoder.fit_transform(items_merged1[['occupation_name']])
# Obter os nomes das colunas com a substituição de 'occupation_name_' por 'occupation_'
columns_occupation = [col.replace('occupation_name_', 'occupation_') for col in encoder.get_feature_names_out(['occupation_name'])]
# Criar um DataFrame com as novas colunas
encoded_occupation_df = pd.DataFrame(encoded_occupation, columns=columns_occupation)
# Concatenar as novas colunas ao DataFrame original
items_merged1 = pd.concat([items_merged1, encoded_occupation_df], axis=1)

#Gender
# Usar map para converter 'M' em 0 e 'F' em 1
items_merged1['gender'] = items_merged1['gender'].map({'M': 0, 'F': 1})

#Title
# Função para remover o ano e extraí-lo do título
def extract_year(title):
    match = re.search(r'\(\d{4}\)', title)
    if match:
        year = match.group(0).strip('()')
        title_without_year = re.sub(r'\s\(\d{4}\)$', '', title)
        return title_without_year, year
    return title, None

    # Aplicar a função e criar duas novas colunas: título limpo e ano extraído
items_merged1[['title', 'release_year']] = items_merged1['title'].apply(
    lambda x: pd.Series(extract_year(x)))

#Genres
# Separar os gêneros e aplicar One-Hot Encoding
genres_dummies = items_merged1['genres'].str.get_dummies(sep='|')
# Concatenar as novas colunas ao DataFrame original
items_merged1 = pd.concat([items_merged1, genres_dummies], axis=1)

#Drop
# Remover a coluna 'occupation_name' e a coluna numérica 'occupation' original, se não forem mais necessárias
items_merged1 = items_merged1.drop(['occupation', 'occupation_name', 'zip_code', 'genres'], axis=1)
items_merged1


KeyboardInterrupt: 

In [33]:
items_merged1.columns

Index(['user_id', 'gender', 'age', 'movie_id', 'rating', 'unix_timestamp',
       'title', 'occupation_K-12_student', 'occupation_academic',
       'occupation_artist', 'occupation_clerical', 'occupation_college',
       'occupation_customer_service', 'occupation_doctor',
       'occupation_executive', 'occupation_farmer', 'occupation_homemaker',
       'occupation_lawyer', 'occupation_other', 'occupation_programmer',
       'occupation_retired', 'occupation_sales', 'occupation_scientist',
       'occupation_self-employed', 'occupation_technician',
       'occupation_tradesman', 'occupation_unemployed', 'occupation_writer',
       'release_year', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [34]:
items_merged1['age'].value_counts()

age
25    395556
35    199003
18    183536
45     83633
50     72490
56     38780
1      27211
Name: count, dtype: int64

In [35]:
# Definir uma seed para garantir que os valores gerados sejam sempre os mesmos
np.random.seed(42)
# Mapeamento das faixas etárias para gerar valores aleatórios dentro das faixas
age_ranges = {
    1: (7, 17),     # Under 18
    18: (18, 24),   # 18-24
    25: (25, 34),   # 25-34
    35: (35, 44),   # 35-44
    45: (45, 49),   # 45-49
    50: (50, 55),   # 50-55
    56: (56, 78)    # 56+
}

# Função para gerar uma idade aleatória dentro da faixa
def generate_random_age(age_group):
    low, high = age_ranges[age_group]
    return np.random.randint(low, high+1)

# Aplicando a função para substituir os valores de idade
items_merged1['age'] = items_merged1['age'].apply(generate_random_age)
items_merged1.head(5)

Unnamed: 0,user_id,gender,age,movie_id,rating,unix_timestamp,title,occupation_K-12_student,occupation_academic,occupation_artist,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,13,1193,5,978300760,One Flew Over the Cuckoo's Nest,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,75,1193,5,978298413,One Flew Over the Cuckoo's Nest,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,12,0,32,1193,4,978220179,One Flew Over the Cuckoo's Nest,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,15,0,29,1193,4,978199279,One Flew Over the Cuckoo's Nest,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,17,0,51,1193,5,978158471,One Flew Over the Cuckoo's Nest,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
items_merged1['age'].value_counts()

age
27    39720
33    39668
29    39645
30    39636
26    39621
      ...  
76     1653
62     1649
77     1636
57     1630
67     1577
Name: count, Length: 72, dtype: int64

In [37]:
items_merged1.columns

Index(['user_id', 'gender', 'age', 'movie_id', 'rating', 'unix_timestamp',
       'title', 'occupation_K-12_student', 'occupation_academic',
       'occupation_artist', 'occupation_clerical', 'occupation_college',
       'occupation_customer_service', 'occupation_doctor',
       'occupation_executive', 'occupation_farmer', 'occupation_homemaker',
       'occupation_lawyer', 'occupation_other', 'occupation_programmer',
       'occupation_retired', 'occupation_sales', 'occupation_scientist',
       'occupation_self-employed', 'occupation_technician',
       'occupation_tradesman', 'occupation_unemployed', 'occupation_writer',
       'release_year', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [38]:
def clean_1M():
    items_merged_1M = load_1M()

    # Occupation
    # Inicializar o OneHotEncoder
    encoder = OneHotEncoder(sparse_output=False, drop=None)
    # Dicionário para mapear os códigos de ocupação com os nomes descritivos
    occupation_map = {
        0: "other",
        1: "academic",
        2: "artist",
        3: "clerical",
        4: "college",
        5: "customer_service",
        6: "doctor",
        7: "executive",
        8: "farmer",
        9: "homemaker",
        10: "K-12_student",
        11: "lawyer",
        12: "programmer",
        13: "retired",
        14: "sales",
        15: "scientist",
        16: "self-employed",
        17: "technician",
        18: "tradesman",
        19: "unemployed",
        20: "writer"
    }

    # Mapeamento dos valores numéricos para os nomes das ocupações
    items_merged_1M['occupation_name'] = items_merged_1M['occupation'].map(occupation_map)
    # Ajustar e transformar a coluna 'occupation_name' com OneHotEncoder
    encoded_occupation = encoder.fit_transform(items_merged_1M[['occupation_name']])
    # Obter os nomes das colunas com a substituição de 'occupation_name_' por 'occupation_'
    columns_occupation = [col.replace('occupation_name_', 'occupation_') for col in encoder.get_feature_names_out(['occupation_name'])]
    # Criar um DataFrame com as novas colunas
    encoded_occupation_df = pd.DataFrame(encoded_occupation, columns=columns_occupation)
    # Concatenar as novas colunas ao DataFrame original
    items_merged_1M = pd.concat([items_merged_1M, encoded_occupation_df], axis=1)

    # Age
    # Dicionário para mapear as idades aos grupos etários
    age_map = {
        1: "Under_18",
        18: "18-24",
        25: "25-34",
        35: "35-44",
        45: "45-49",
        50: "50-55",
        56: "56+"
    }

    # Mapear as idades para categorias
    items_merged1['age_group'] = items_merged1['age'].map(age_map)
    # Ajustar e transformar a coluna 'age_group'
    encoded_age = encoder.fit_transform(items_merged1[['age_group']])
    # Criar um DataFrame com as novas colunas
    encoded_age_df = pd.DataFrame(encoded_age, columns=encoder.get_feature_names_out(['age_group']))
    # Concatenar o DataFrame original com as novas colunas
    items_merged1 = pd.concat([items_merged1, encoded_age_df], axis=1)
    # Dropar a coluna original de idade, se necessário
    items_merged1 = items_merged1.drop(['age', 'age_group'], axis=1)

    # Gender
    # Usar map para converter 'M' em 0 e 'F' em 1
    items_merged_1M['gender'] = items_merged_1M['gender'].map({'M': 0, 'F': 1})

    # Title
    # Função para remover o ano e extraí-lo do título
    def extract_year(title):
        match = re.search(r'\(\d{4}\)', title)
        if match:
            year = match.group(0).strip('()')
            title_without_year = re.sub(r'\s\(\d{4}\)$', '', title)
            return title_without_year, year
        return title, None

        # Aplicar a função e criar duas novas colunas: título limpo e ano extraído
    items_merged_1M[['title', 'release_year']] = items_merged_1M['title'].apply(
        lambda x: pd.Series(extract_year(x)))
    
    # Genres
    # Separar os gêneros e aplicar One-Hot Encoding
    genres_dummies = items_merged_1M['genres'].str.get_dummies(sep='|')
    # Concatenar as novas colunas ao DataFrame original
    items_merged_1M = pd.concat([items_merged_1M, genres_dummies], axis=1)

    #Drop    
    # Dropar as colunas que não vamos usar
    clean_merged_1M = items_merged_1M.drop(['occupation', 'occupation_name', 'age', 'age_group', 'genres', 'zip_code'], axis=1)

    return clean_merged_1M

# Pré Processamento

In [39]:
def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    
    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)
    
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    M = df['user_id'].nunique()
    N = df['movie_id'].nunique()

    user_mapper = dict(zip(np.unique(df["user_id"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movie_id"]), list(range(N))))
    
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["user_id"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movie_id"])))
    
    user_index = [user_mapper[i] for i in df['user_id']]
    item_index = [movie_mapper[i] for i in df['movie_id']]

    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

## 100K

In [40]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [41]:
X.shape

(943, 1682)

In [42]:
n_total = X.shape[0]*X.shape[1]
n_ratings = X.nnz
sparsity = n_ratings/n_total
print(f"Esparsidade da Matriz: {round(sparsity*100,2)}%")

Esparsidade da Matriz: 6.3%


In [43]:
n_ratings_per_user = X.getnnz(axis=1)
len(n_ratings_per_user)
print(f"O usuário mais ativo avaliou {n_ratings_per_user.max()} filmes.")
print(f"O usuário menos ativo avaliou {n_ratings_per_user.min()} filmes.")
n_ratings_per_movie = X.getnnz(axis=0)
len(n_ratings_per_movie)
print(f"O filme mais avaliado recebeu {n_ratings_per_movie.max()} avaliações.")
print(f"O filme menos avaliado recebeu {n_ratings_per_movie.min()} avaliações.")

O usuário mais ativo avaliou 737 filmes.
O usuário menos ativo avaliou 20 filmes.
O filme mais avaliado recebeu 583 avaliações.
O filme menos avaliado recebeu 1 avaliações.


## 1M

In [44]:
X1, user_mapper1, movie_mapper1, user_inv_mapper1, movie_inv_mapper1 = create_X(ratings1)

In [45]:
X1.shape

(6040, 3706)

In [46]:
n_total1 = X1.shape[0]*X1.shape[1]
n_ratings1 = X1.nnz
sparsity1 = n_ratings1/n_total1
print(f"Esparsidade da Matriz: {round(sparsity1*100,2)}%")

Esparsidade da Matriz: 4.47%


In [47]:
n_ratings_per_user1 = X1.getnnz(axis=1)
len(n_ratings_per_user1)
print(f"O usuário mais ativo avaliou {n_ratings_per_user1.max()} filmes.")
print(f"O usuário menos ativo avaliou {n_ratings_per_user1.min()} filmes.")
n_ratings_per_movie1 = X1.getnnz(axis=0)
len(n_ratings_per_movie1)
print(f"O filme mais avaliado recebeu {n_ratings_per_movie1.max()} avaliações.")
print(f"O filme menos avaliado recebeu {n_ratings_per_movie1.min()} avaliações.")

O usuário mais ativo avaliou 2314 filmes.
O usuário menos ativo avaliou 20 filmes.
O filme mais avaliado recebeu 3428 avaliações.
O filme menos avaliado recebeu 1 avaliações.


# Item-item Recommendations

Vamos encontrar os k filmes que têm os vetores de engajamento do usuário mais semelhantes para o filme i.

In [48]:
def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k, metric='cosine'):
    """
    Finds k-nearest neighbours for a given movie id.
    
    Args:
        movie_id: id of the movie of interest
        X: user-item utility matrix
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations
    
    Output: returns list of k similar movie ID's
    """
    X = X.T
    neighbour_ids = []
    
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    # use k+1 since kNN output includes the movieId of interest
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [49]:
movie_titles = dict(zip(items['movie_id'], items['title']))

movie_id = 1

similar_movies = find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, metric='cosine', k=10)
movie_title = movie_titles[movie_id]

print(f"Por que você assistiu {movie_title}:")
for i in similar_movies:
    print(movie_titles[i])

Por que você assistiu Toy Story (1995):
Star Wars (1977)
Return of the Jedi (1983)
Independence Day (ID4) (1996)
Rock, The (1996)
Mission: Impossible (1996)
Willy Wonka and the Chocolate Factory (1971)
Star Trek: First Contact (1996)
Fargo (1996)
Jerry Maguire (1996)


In [50]:
movie_titles1 = dict(zip(items1['movie_id'], items1['title']))

movie_id1 = 1

similar_movies1 = find_similar_movies(movie_id1, X1, movie_mapper1, movie_inv_mapper1, metric='cosine', k=10)
movie_title1 = movie_titles1[movie_id1]

print(f"Por que você assistiu {movie_title1}:")
for i in similar_movies1:
    print(movie_titles1[i])

Por que você assistiu Toy Story (1995):
Toy Story 2 (1999)
Groundhog Day (1993)
Aladdin (1992)
Bug's Life, A (1998)
Back to the Future (1985)
Babe (1995)
Star Wars: Episode V - The Empire Strikes Back (1980)
Men in Black (1997)
Forrest Gump (1994)


Os resultados acima mostram os 10 filmes mais semelhantes a Toy Story. A maioria dos filmes nesta lista são filmes de família dos anos 1990, o que parece bastante razoável. Observe que essas recomendações são baseadas apenas em classificações de itens do usuário. Recursos de filmes, como gêneros, não são usados ​​nesta abordagem.

Também podemos testar com as métricas de distância kNN e ver quais resultados obteriamos se usassemos "manhattan" (cityblock) ou "euclidean" em vez de "cosine".

# Base e Test

In [51]:
# Load Ratings (u.data)
r_cols2 = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings2 = pd.read_csv('Dados/100K/ub.base', sep='\t', names=r_cols2 ,encoding='latin-1')
ratings2

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1067,2,875501756
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


In [52]:
# Load Ratings (u.data)
r_cols3 = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings3 = pd.read_csv('Dados/100K/ub.test', sep='\t', names=r_cols2 ,encoding='latin-1')
ratings3

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,17,3,875073198
1,1,47,4,875072125
2,1,64,5,875072404
3,1,90,4,878542300
4,1,92,3,876892425
...,...,...,...,...
9425,943,471,5,875502042
9426,943,549,1,888639772
9427,943,595,2,875502597
9428,943,685,4,875502042


In [53]:
def split_train_test(data, basename, start, stop, maxtest):
    # Inicializar contadores
    ratingcnt = {}
    testcnt = 0

    # Abrir arquivos para treino e teste
    basefile = open(f"{basename}.base.csv", "w")
    testfile = open(f"{basename}.test.csv", "w")

    # Escrever os cabeçalhos nos arquivos (opcional, se necessário)
    columns = data.columns.astype(str)  # Converter todas as colunas para strings
    basefile.write(",".join(columns) + "\n")
    testfile.write(",".join(columns) + "\n")

    # Iterar sobre as linhas do dataframe
    for _, row in data.iterrows():
        user = row['user_id']  # Assumindo que a coluna de usuário seja 'user_id'

        if user not in ratingcnt:
            ratingcnt[user] = 0

        ratingcnt[user] += 1

        # Se ainda há espaço no teste e o rating do usuário está no intervalo definido
        if (testcnt < maxtest or maxtest <= 0) and start <= ratingcnt[user] <= stop:
            testfile.write(",".join(map(str, row.values)) + "\n")  # Converte os valores para string
            testcnt += 1
        else:
            basefile.write(",".join(map(str, row.values)) + "\n")  # Converte os valores para string

    # Fechar os arquivos
    basefile.close()
    testfile.close()

# Exemplo de uso com dataset de 1M
# Carregando os dados de ratings
data_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
data = pd.read_table('Dados/1M/ratings.dat', sep='::', names= data_cols, header=None, engine="python")

# Definir os parâmetros
basename = 'ua'  # O nome base para os arquivos
start = 1  # Avaliação inicial para o usuário
stop = 10  # Avaliação final (10 primeiras para o teste)
maxtest = 200000  # Número máximo de avaliações no conjunto de teste

# Executar a divisão
split_train_test(data, basename, start, stop, maxtest)

print("Divisão concluída com sucesso.")

Divisão concluída com sucesso.


In [54]:
data1 = pd.read_csv('./ua.base.csv')
data1

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,595,5,978824268
1,1,938,4,978301752
2,1,2398,4,978302281
3,1,2918,4,978302124
4,1,1035,5,978301753
...,...,...,...,...
939804,6040,1091,1,956716541
939805,6040,1094,5,956704887
939806,6040,562,5,956704746
939807,6040,1096,4,956715648


In [55]:
data2 = pd.read_csv('./ua.test.csv')
data2

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
60395,6040,593,5,956703954
60396,6040,3016,2,956716157
60397,6040,3017,1,956716519
60398,6040,2070,4,956715676
