In [3]:
from pathlib import Path
import os, json, pandas as pd, numpy as np

# Project root. Defaults to the original checkout location, but can be overridden
# with the RECSYS_BASE_DIR environment variable so the notebook is not tied to a
# single machine's absolute path.
defaultBaseDir = "/mnt/c/Users/Helin/OneDrive/Dokumente/BachelorThesis/code/srcCode/recsys-negative-feedback"
baseDir = Path(os.environ.get("RECSYS_BASE_DIR", defaultBaseDir))

# Raw inputs are read from rawDir; cleaned outputs are written to procDir.
rawDir = baseDir / "data" / "raw" / "movielens"
procDir = baseDir / "data" / "processed" / "movielens"
# Create the output folder up front (no error if it already exists).
procDir.mkdir(parents=True, exist_ok=True)

print("baseDir:", baseDir)
print("rawDir:", rawDir)
print("procDir:", procDir)


baseDir: /mnt/c/Users/Helin/OneDrive/Dokumente/BachelorThesis/code/srcCode/recsys-negative-feedback
rawDir: /mnt/c/Users/Helin/OneDrive/Dokumente/BachelorThesis/code/srcCode/recsys-negative-feedback/data/raw/movielens
procDir: /mnt/c/Users/Helin/OneDrive/Dokumente/BachelorThesis/code/srcCode/recsys-negative-feedback/data/processed/movielens


#1
We ensure reproducibility by fixing a stable project root (baseDir) and storing cleaned data to a canonical processed/ folder. This guarantees that model training always uses the same inputs regardless of editor working directory (e.g. VS Code / Jupyter).

In [4]:
# Ratings file is tab-separated and has no header row, so column names are supplied here.
ratingColumns = ["origUserId", "origItemId", "rating", "timestamp"]
ratingsPath = rawDir / "u.data"
df = pd.read_csv(ratingsPath, sep="\t", header=None, names=ratingColumns)
df.head()



Unnamed: 0,origUserId,origItemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


#2
origUserId: the original MovieLens user ID (like “user #196” in their system)

origItemId: the original MovieLens movie ID (like “movie #242”)

rating: what they thought (1–5 stars)
Ratings 4–5 = “I like this, please recommend similar.”

Ratings 1–2 = “I dislike this, please don’t recommend similar.”

timestamp: when it happened (UNIX seconds)

#3
Event = a user did something with an item at a point in time, e.g. user 22 watched movie 377 and gave it a rating of 1 at timestamp 878887116. Every row is one event.

Keep only users with at least 5 ratings and items with at least 5 ratings.

Fewer than 5 events is too weak a signal to learn a preference from.


In [5]:
# thresholds: minimum number of events a user / an item must have to be kept
minUserEvents, minItemEvents = 5, 5

# Filter until stable. Dropping rare items can push a user below the user
# threshold (and vice versa), so a single pass with counts taken on the raw
# frame does not guarantee that BOTH thresholds hold on the final frame.
while True:
    # count each user's ratings and each movie's ratings on the current frame
    userCounts = df["origUserId"].value_counts()
    itemCounts = df["origItemId"].value_counts()

    # a row survives only if both its user and its item are frequent enough
    keepMask = (
        df["origUserId"].map(userCounts).ge(minUserEvents)
        & df["origItemId"].map(itemCounts).ge(minItemEvents)
    )
    if keepMask.all():
        # nothing left to remove: userCounts / itemCounts now describe the final frame
        break
    df = df[keepMask]

print("After filtering:")
print("  users  :", df["origUserId"].nunique())
print("  items  :", df["origItemId"].nunique())
print("  events :", len(df))
# first 5 rows
df.head()


After filtering:
  users  : 943
  items  : 1349
  events : 99287


Unnamed: 0,origUserId,origItemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596



Data cleaning / filtering beforehand
Thought process:
A user who has only ever rated one movie: you can't learn their taste, because there is no pattern.

A movie that only one person ever rated. You can't generalize it to others.

If you include extremely cold users/items, negative feedback looks random noise (“I hated this one obscure movie I watched once”). After filtering, negative signals become meaningful (“I systematically don’t like action movies”).

 filtering = denoising.


In [5]:
# Dense, zero-based ids: the n-th smallest original id is mapped to n.
sortedUserIds = sorted(df["origUserId"].unique())
sortedItemIds = sorted(df["origItemId"].unique())
userMap = dict(zip(sortedUserIds, range(len(sortedUserIds))))
itemMap = dict(zip(sortedItemIds, range(len(sortedItemIds))))

df = (
    df.assign(
        userId=df["origUserId"].map(userMap),
        itemId=df["origItemId"].map(itemMap),
    )
    # keep only the columns needed downstream
    .loc[:, ["userId", "itemId", "rating", "timestamp"]]
    # chronological order within each user
    .sort_values(["userId", "timestamp"])
    .reset_index(drop=True)
)

print("After remapping:")
print("  df shape:", df.shape)
print("  unique users:", df["userId"].nunique())
print("  unique items:", df["itemId"].nunique())
df.head()


After remapping:
  df shape: (99287, 4)
  unique users: 943
  unique items: 1349


Unnamed: 0,userId,itemId,rating,timestamp
0,0,166,5,874965478
1,0,170,5,874965478
2,0,163,5,874965518
3,0,154,4,874965556
4,0,194,5,874965677


Recommender models don’t like scattered IDs like 196, 244, 927.
They want dense row indices: 0,1,2,3,…
Same for items.
This is how we’ll later build a user-item matrix or embeddings.
We also sort each user's events by timestamp.
This enables a “train on the past, predict the future” split:
given what you’ve watched so far, would the model have guessed your next choice?

In [6]:
def userWiseLastSplit(frame):
    """Hold out each user's last row as test data and keep the rest as train.

    "Last" is positional: for every distinct ``userId`` the row that appears
    last in ``frame`` goes to the test set. The function does not sort, so the
    caller must order each user's rows chronologically beforehand.

    Rows are selected by position rather than by index label, so the split is
    also correct when ``frame`` has duplicate index labels. Rows whose
    ``userId`` is missing are never held out and stay in train.

    Parameters
    ----------
    frame : pandas.DataFrame
        Interaction table with a ``userId`` column.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both keep the original row order and index labels; every row of
        ``frame`` ends up in exactly one of them.
    """
    userIds = frame["userId"]
    # True for the final occurrence of each user id (missing ids excluded)
    isLast = (~userIds.duplicated(keep="last") & userIds.notna()).to_numpy()
    test = frame[isLast]
    train = frame[~isLast]
    return train, test

trainDf, testDf = userWiseLastSplit(df)

print("train/test shapes:")
for shapeLabel, splitFrame in (("  train:", trainDf), ("  test :", testDf)):
    print(shapeLabel, splitFrame.shape)

# Sanity check: every user should show up in train and contribute exactly one test row.
for userLabel, splitFrame in (("users in train:", trainDf), ("users in test :", testDf)):
    print(userLabel, splitFrame["userId"].nunique())


train/test shapes:
  train: (98344, 4)
  test : (943, 4)
users in train: 943
users in test : 943


Evaluate recommenders by hiding each user's most recent interaction as test and training on their history, and asking the model to recover that heldout item. This simulates predicting the user's next choice based only on what the system would have known at that time.

In [7]:
# Make sure the output folder exists even if this cell is run on its own.
procDir.mkdir(parents=True, exist_ok=True)

trainPath, testPath = procDir / "train.parquet", procDir / "test.parquet"

# Write each split without its pandas index.
for splitFrame, splitPath in ((trainDf, trainPath), (testDf, testPath)):
    splitFrame.to_parquet(splitPath, index=False)

print("Saved:")
for splitPath in (trainPath, testPath):
    print(" ", splitPath)


Saved:
  /mnt/c/Users/Helin/OneDrive/Dokumente/BachelorThesis/code/srcCode/recsys-negative-feedback/data/processed/movielens/train.parquet
  /mnt/c/Users/Helin/OneDrive/Dokumente/BachelorThesis/code/srcCode/recsys-negative-feedback/data/processed/movielens/test.parquet


Parquet is faster to load and cleaner than CSV, so we can reload these files and train models without redoing all of the preprocessing.