In [None]:
import json
import pandas as pd
import matplotlib as plt

# Step 1: Load the JSONL file containing movie data (one JSON object per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and blank lines (such as a trailing newline at the end of the file)
# are skipped because json.loads() rejects empty input.
with open('data/content.jsonl', 'r', encoding='utf-8') as f:
    movies_data = [json.loads(line) for line in f if line.strip()]

# Step 2: Load the CSV file containing user ratings
df = pd.read_csv('submissoes/lightfm_versao_2_puro.csv')
print(df.head())

# Step 3: Build ItemId -> value lookup tables from the movie records.
def _parse_box_office(raw):
    """Return the box-office figure held in ``raw`` as an int, or 0 if unavailable.

    Strings have every non-digit character (such as "$" and commas) removed
    before conversion. ``None``, the 'N/A' placeholder and any string that
    contains no digit at all yield 0 instead of raising. Values that are
    already numeric are passed through ``int()``.
    """
    if raw is None or raw == 'N/A':
        return 0
    if not isinstance(raw, str):
        return int(raw)
    digits = ''.join(c for c in raw if c.isdigit())
    return int(digits) if digits else 0


imdbRatings = {}
metascore = {}
imdbVotes = {}
boxOffice = {}

# A single pass fills all four tables. Optional fields are read with .get()
# so a record lacking one of them maps to None rather than aborting the cell
# with a KeyError; the later pd.to_numeric(..., errors='coerce') calls turn
# such entries into NaN.
for movie in movies_data:
    item_id = movie['ItemId']
    imdbRatings[item_id] = movie.get('imdbRating')
    metascore[item_id] = movie.get('Metascore')
    imdbVotes[item_id] = movie.get('imdbVotes')
    boxOffice[item_id] = _parse_box_office(movie.get('BoxOffice', 'N/A'))


: 

In [None]:
# Preview a few entries instead of echoing the whole ItemId -> imdbRating
# mapping into the notebook output.
dict(list(imdbRatings.items())[:10])

In [None]:
# Floor the ratings at -20, then min-max scale them onto [0, 10].
rating_floored = df["Rating"].clip(lower=-20)
rating_min = rating_floored.min()
rating_range = rating_floored.max() - rating_min
if rating_range > 0:
    df["Rating"] = (rating_floored - rating_min) / rating_range * 10
else:
    # All ratings are identical (or the column is empty / all-NaN): dividing
    # by the zero range would turn every value into NaN, so map the constant
    # column to 0 instead (NaN entries stay NaN).
    df["Rating"] = rating_floored * 0.0
df["Rating"].hist()

In [None]:
# Look up each row's imdbRating by ItemId and coerce it to a number; anything
# that cannot be parsed (or has no lookup entry) falls back to the row's Rating.
df["imdbRating"] = pd.to_numeric(
    df["ItemId"].map(imdbRatings),
    errors="coerce",
).fillna(df["Rating"])
df["imdbRating"].hist()

In [59]:
# Attach the box-office figure for each ItemId and min-max scale it onto [0, 10].
box_office_mapped = df['ItemId'].map(boxOffice)
# With regex=True a bare "$" is the end-of-string anchor, so the dollar sign
# has to be escaped to strip a literal "$" from any string values.
box_office_mapped = box_office_mapped.replace({r"\$": "", ",": ""}, regex=True)
# ItemIds without a lookup entry (and unparseable values) count as 0, the same
# value the lookup table uses for 'N/A'; leaving them as NaN would make the
# final Score NaN for those rows.
df['BoxOffice'] = pd.to_numeric(box_office_mapped, errors='coerce').fillna(0)
min_box_office = df['BoxOffice'].min()
max_box_office = df['BoxOffice'].max()
box_office_range = max_box_office - min_box_office
if box_office_range > 0:
    df['BoxOffice'] = (df['BoxOffice'] - min_box_office) / box_office_range * 10
else:
    # Every value is identical (or the frame is empty): avoid dividing by zero.
    df['BoxOffice'] = 0.0

In [60]:
# Look up Metascore by ItemId, coerce to numeric, divide by 10, and use the
# row's imdbRating wherever no numeric Metascore is available.
df["Metascore"] = (
    pd.to_numeric(df["ItemId"].map(metascore), errors="coerce")
    .div(10)
    .fillna(df["imdbRating"])
)

In [61]:
# Look up the vote count by ItemId, coerce it to numeric and divide by 100.
# NOTE(review): errors="coerce" turns any value that is not a plain number into
# NaN; if the source stores counts with thousands separators (e.g. "1,234")
# those rows become NaN here - confirm against the data.
df["imdbVotes"] = pd.to_numeric(
    df["ItemId"].map(imdbVotes),
    errors="coerce",
).div(100)

In [None]:
# Histogram of the scaled imdbVotes column.
df.loc[:, "imdbVotes"].hist()

In [63]:
# Weighted blend of the five signals. Rows where the blend is NaN are filled
# from a second formula that leaves out imdbVotes and gives Metascore and
# Rating a weight of 0.25 each.
df['Score'] = (
    df["imdbRating"].mul(0.3)
    .add(df["BoxOffice"].mul(2.2))
    .add(df["Metascore"].mul(0.2))
    .add(df["Rating"].mul(0.2))
    .add(df["imdbVotes"].mul(0.1))
).fillna(
    df["imdbRating"].mul(0.3)
    .add(df["BoxOffice"].mul(2.2))
    .add(df["Metascore"].mul(0.25))
    .add(df["Rating"].mul(0.25))
)

In [None]:
# Histogram of the blended Score column.
df.loc[:, 'Score'].hist()

In [65]:
# Order rows by UserId ascending and, within each user, by Score descending.
df = df.sort_values(
    by=['UserId', 'Score'],
    ascending=[True, False],
)

In [None]:
# Display the current state of the frame for inspection.
df

In [67]:
# Remove the rating and feature columns that were only needed to build the
# ordering, then write the remaining columns to the output CSV without the index.
df = df.drop(
    labels=['Rating', 'imdbRating', 'Score', 'BoxOffice', 'Metascore', 'imdbVotes'],
    axis="columns",
)
df.to_csv(path_or_buf='Lightfm_average3.csv', index=False)