In [None]:
import os
import pandas as pd 

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [97]:
df_ratings = pd.read_csv(os.path.join('data', 'prepared_ratings.csv'))
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062183 entries, 0 to 18062182
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   userId  int64  
 2   rating  float64
 3   date    object 
dtypes: float64(1), int64(2), object(1)
memory usage: 551.2+ MB


In [None]:
df_ratings['date'] = pd.to_datetime(df_ratings['date'])

user_analysis = df_ratings.groupby('userId').agg(
    unique_movies=('id', 'nunique'),  # Count of unique movie IDs
    mean_rating=('rating', 'mean'),  # Mean rating
    max_rating=('rating', 'max'),    # Maximum rating
    min_rating=('rating', 'min'),    # Minimum rating
    max_date=('date', 'max'),   # Most recent rating date
    min_date=('date', 'min')    # Earliest rating date
).reset_index()

user_analysis.head()

Unnamed: 0,userId,unique_movies,mean_rating,max_rating,min_rating,max_date,min_date
0,1,22,4.159091,5.0,0.5,2015-03-09,2015-03-09
1,2,18,3.222222,5.0,1.0,1997-06-23,1997-06-23
2,3,4,3.0,4.0,2.0,2003-03-19,2003-03-19
3,4,53,3.433962,5.0,1.0,2003-01-15,2003-01-15
4,5,8,3.875,5.0,2.0,2000-02-01,2000-02-01


In [None]:
# Sort user_analysis by unique_movies in descending order
user_analysis = user_analysis.sort_values(by='unique_movies', ascending=False)
user_analysis.head(5)

Unnamed: 0,userId,unique_movies,mean_rating,max_rating,min_rating,max_date,min_date
45420,45811,6003,3.079421,5.0,0.5,2017-07-31,2015-12-15
267732,270123,5319,2.533841,5.0,0.5,2017-08-04,2012-12-13
241301,243443,4950,1.524747,5.0,0.5,2017-08-01,2000-04-07
192979,194690,3811,2.436893,5.0,0.5,2017-07-15,2006-01-09
97903,98787,3788,2.184266,5.0,0.5,2017-03-30,2016-06-25


In [100]:
user_analysis_top10 = user_analysis.head(20)
print("Shape of top 10 users analysis:", user_analysis_top10.shape)
print("Shape of original user analysis:", user_analysis.shape)
df_movies_top10 = df_ratings[df_ratings['userId'].isin(user_analysis_top10['userId'])]
print("Shape of movies rated by top 10 users:", df_movies_top10.shape)
print("Shape of original movies dataframe:", df_ratings.shape)

Shape of top 10 users analysis: (20, 7)
Shape of original user analysis: (268495, 7)
Shape of movies rated by top 10 users: (71990, 4)
Shape of original movies dataframe: (18062183, 4)


In [101]:
# Filter df_movies_top10 for ratings >= 4
df_movies_top10_high_ratings = df_movies_top10[df_movies_top10['rating'] >= 4]
print("Shape of filtered DataFrame:", df_movies_top10_high_ratings.shape)

Shape of filtered DataFrame: (12446, 4)


In [None]:
df_movies_top10_high_ratings['date'] = pd.to_datetime(df_movies_top10_high_ratings['date'])

user_analysis_top10_high_ratings = df_movies_top10_high_ratings.groupby('userId').agg(
    unique_movies=('id', 'nunique'),  # Count of unique movie IDs
    mean_rating=('rating', 'mean'),  # Mean rating
    max_rating=('rating', 'max'),    # Maximum rating
    min_rating=('rating', 'min'),    # Minimum rating
    max_date=('date', 'max'),   # Most recent rating date
    min_date=('date', 'min')    # Earliest rating date
).reset_index()
user_analysis_top10_high_ratings.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_top10_high_ratings['date'] = pd.to_datetime(df_movies_top10_high_ratings['date'])


Unnamed: 0,userId,unique_movies,mean_rating,max_rating,min_rating,max_date,min_date
0,8659,1013,4.025173,5.0,4.0,2015-12-24,2001-08-05
1,37222,1081,4.135523,5.0,4.0,2017-02-02,2017-02-02
2,39742,911,4.161361,5.0,4.0,2017-07-30,2015-06-30
3,45811,1027,4.214703,5.0,4.0,2017-07-25,2015-12-15
4,70648,263,4.043726,5.0,4.0,2011-08-29,2006-11-13
5,98415,385,4.123377,5.0,4.0,2017-08-03,2002-07-05
6,98787,647,4.478362,5.0,4.0,2017-03-30,2016-06-25
7,141589,479,4.442589,5.0,4.0,2017-01-18,2008-07-15
8,165352,163,4.208589,5.0,4.0,2017-03-24,2016-01-07
9,172224,1802,4.248613,5.0,4.0,2016-12-13,2000-08-03
