# **Imports**


In [1]:
import numpy as np
import pandas as pd

# Load files

In [2]:
# Read the raw review dump from disk into a NumPy array.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so this should only ever be pointed at a trusted file.
data = np.load('../data/Original_Dataset.npy', allow_pickle=True)

# Report the container type and its shape (the "dimensions" of the array)
print(f"Type: {type(data)}", f"Shape: {data.shape}", sep="\n")

# Peek at the very first record
print("First entry:", data[0], sep="\n")

Type: <class 'numpy.ndarray'>
Shape: (4669820,)
First entry:
ur4592644,tt0120884,10,16 January 2005


In [3]:
# The array is one-dimensional, so its length is the number of records
total_records = len(data)
print(f"We have {total_records:,} movie review records.")

# Each element is a single comma-separated string, so there are no columns
# to index into yet; preview the first three whole entries to see the pattern.
print("First 3 entries:", data[:3], sep="\n")

We have 4,669,820 movie review records.
First 3 entries:
['ur4592644,tt0120884,10,16 January 2005'
 'ur3174947,tt0118688,3,16 January 2005'
 'ur3780035,tt0387887,8,16 January 2005']


# Convert data to a DataFrame

In [4]:
# 1. Reuse the raw strings already loaded into `data` by the load cell above
#    instead of reading the same .npy file from disk a second time.
raw_data = data

# 2. Split each "user,movie,rating,date" string into a list of fields.
#    The full set is processed here; if RAM is struggling, slice the input
#    instead, e.g. raw_data[:1_000_000].
split_data = [line.split(',') for line in raw_data]

# 3. Create the DataFrame
df = pd.DataFrame(split_data, columns=['User_ID', 'Movie_ID', 'Rating', 'Date'])

# 4. Fix Data Types (Crucial for Math!)
#    errors='coerce' turns unparseable values into NaN / NaT rather than raising.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
#    Dates look like "16 January 2005"; giving the format explicitly keeps
#    parsing deterministic and avoids slow per-element format guessing.
df['Date'] = pd.to_datetime(df['Date'], format='%d %B %Y', errors='coerce')

# 5. Make coerced (lost) values visible instead of letting them pass silently.
n_bad_ratings = int(df['Rating'].isna().sum())
n_bad_dates = int(df['Date'].isna().sum())
if n_bad_ratings or n_bad_dates:
    print(f"⚠️ Unparseable values set to missing: {n_bad_ratings:,} ratings, {n_bad_dates:,} dates")

print("✅ Data Transformed into DataFrame")
print(df.head())

✅ Data Transformed into DataFrame
     User_ID   Movie_ID  Rating       Date
0  ur4592644  tt0120884      10 2005-01-16
1  ur3174947  tt0118688       3 2005-01-16
2  ur3780035  tt0387887       8 2005-01-16
3  ur4592628  tt0346491       1 2005-01-16
4  ur3174947  tt0094721       8 2005-01-16
