In [1]:
import pandas as pd

In [2]:
# Raw ratings: tab-separated, no header row, one (user, item, rating, time) record per line.
data = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Rescale `timestamp` from seconds to 365-day years (3600 s/h * 24 h/day * 365 days).
# The values are treated as seconds counted from 1970-01-01, so zero is already the
# reference point and no offset has to be subtracted. The same 365-day year is used
# for the movie release feature so the two can be subtracted from each other later.
data['timestamp'] = data['timestamp'] / (3600 * 24 * 365)

# Genre lookup table: pipe-separated, no header row, parsed into a name column
# and an id column.
genre = pd.read_csv(
    'ml-100k/u.genre',
    names=['genre_name', 'genre_id'],
    header=None,
    sep='|',
)
              
# Column layout of u.item, in file order.
columns = [
    # Descriptive fields.
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    # Genre indicator columns.
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Load the item table and reduce it to `movie_id`, the genre indicators and a single
# numeric release feature, `years_1970`: the release date expressed as 365-day years
# elapsed since 1970-01-01. Dates that cannot be parsed are coerced to NaT and end up
# as NaN in `years_1970`.
movie = (
    pd.read_csv(
        'ml-100k/u.item',
        names=columns,
        header=None,
        sep='|',
        encoding='latin-1',
    )
    .drop(columns=['video_release_date', 'IMDb_URL'])
    .assign(
        years_1970=lambda frame: (
            pd.to_datetime(frame["release_date"], errors='coerce')
            - pd.Timestamp("1970-01-01")
        ).dt.days / 365
    )
    .drop(columns=['release_date', "movie_title"])
)


# User table: pipe-separated, no header row. The column names come from splitting the
# field description on " | ", giving: user id, age, gender, occupation, zip code
# (note the spaces inside "user id" and "zip code").
users = pd.read_csv(
    'ml-100k/u.user',
    names="user id | age | gender | occupation | zip code".split(" | "),
    header=None,
    sep='|',
    encoding='latin-1',
)

In [3]:
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,27.944284
1,186,302,3,28.276184
2,22,377,1,27.869328
3,244,51,2,27.923862
4,166,346,1,28.107483


In [4]:
genre.head()

Unnamed: 0,genre_name,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [5]:
movie.head()

Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,years_1970
0,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,25.016438
1,2,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,25.016438
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,25.016438
3,4,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,25.016438
4,5,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,25.016438


In [None]:
# Encode gender as 1 for "M" and 0 for every other value.
# The dtype guard keeps a re-run of this cell from comparing the already-encoded
# integers against "M", which would silently turn the whole column into zeros.
if not pd.api.types.is_integer_dtype(users["gender"]):
    users["gender"] = users["gender"].eq("M").astype("int64")

# Store occupation as a categorical column.
users["occupation"] = pd.Categorical(users["occupation"])

In [None]:
# One-hot encode `occupation` into `occupation_<value>` indicator columns.
# The encoding consumes the `occupation` column, so the guard lets this cell be
# re-run without raising a KeyError once the column is already gone.
if "occupation" in users.columns:
    users = pd.get_dummies(users, columns=["occupation"])
users.head()

Unnamed: 0,user id,age,gender,zip code,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,1,24,1,85711,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,2,53,0,94043,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,3,23,1,32067,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,4,24,1,43537,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,5,33,0,15213,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [9]:
# Attach each rated movie's features to its rating row, then replace the two absolute
# time columns with their difference.
ratings = (
    pd.merge(
        data,
        movie,
        how='left',               # keep every rating, even if the movie has no match
        left_on='item_id',
        right_on='movie_id',
        validate='many_to_one',   # fail loudly if a movie id is duplicated instead of
                                  # silently multiplying rating rows
    )
    .drop(columns=["item_id", "movie_id"])
    # `timestamp` and `years_1970` are both in 365-day years since 1970, so the
    # difference is the time elapsed between the movie's release and the rating.
    # It is NaN where the release date is missing.
    .assign(years_since_review=lambda frame: frame["timestamp"] - frame["years_1970"])
    .drop(columns=["timestamp", "years_1970"])
)
ratings.head()

Unnamed: 0,user_id,rating,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,years_since_review
0,196,3,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.862092
1,186,3,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1.257006
2,22,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,3.852889
3,244,2,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,3.907424
4,166,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1.088305
