# 1.0-initial-data-exporation

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

## Review and processing of source data

Firstly, load our dataset and look on it.

In [2]:
# Define the column names for your data
data_columns = ['user_id', 'item_id', 'rating', 'timestamp']
item_columns = ['movie_id', 'movie_title', 'release_date',
                'video_release_date', 'IMDB_URL', 'unknown',
                'Action', 'Adventure', 'Animation', 'Children',
                'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                'Sci-Fi', 'Thriller', 'War', 'Western']
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Load the training and test sets
train_data = pd.read_csv('ml-100k/u1.base', sep='\t', names=data_columns)
test_data = pd.read_csv('ml-100k/u1.test', sep='\t', names=data_columns)

# Load the item and user information
item = pd.read_csv('ml-100k/u.item', sep='|', names=item_columns, encoding='latin-1')
user = pd.read_csv('ml-100k/u.user', sep='|', names=user_columns, encoding='latin-1')

# Merge the dataframes
train_data = pd.merge(train_data, item, how='left', left_on='item_id', right_on='movie_id')
train_data = pd.merge(train_data, user, how='left', on='user_id')

test_data = pd.merge(test_data, item, how='left', left_on='item_id', right_on='movie_id')
test_data = pd.merge(test_data, user, how='left', on='user_id')

# Save the merged data as CSV files
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [3]:
item

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
user

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [5]:
data = pd.concat([train_data, test_data])
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,movie_title,release_date,video_release_date,IMDB_URL,unknown,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation,zip_code
0,1,1,5,874965758,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,...,0,0,0,0,0,0,24,M,technician,85711
1,1,2,3,876893171,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,...,0,0,0,1,0,0,24,M,technician,85711
2,1,3,4,878542960,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,...,0,0,0,1,0,0,24,M,technician,85711
3,1,4,3,876893119,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,...,0,0,0,0,0,0,24,M,technician,85711
4,1,5,3,889751712,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,...,0,0,0,1,0,0,24,M,technician,85711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,458,648,4,886395899,648,"Quiet Man, The (1952)",01-Jan-1952,,"http://us.imdb.com/M/title-exact?Quiet%20Man,%...",0,...,0,1,0,0,0,0,47,M,technician,Y1A6B
99996,458,1101,4,886397931,1101,Six Degrees of Separation (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Six%20Degrees...,0,...,1,0,0,0,0,0,47,M,technician,Y1A6B
99997,459,934,3,879563639,934,"Preacher's Wife, The (1996)",13-Dec-1996,,http://us.imdb.com/M/title-exact?Preacher's%20...,0,...,0,0,0,0,0,0,22,M,student,29201
99998,460,10,3,882912371,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,...,0,0,0,0,1,0,44,F,other,60630


In [6]:
data.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'movie_id', 'movie_title',
       'release_date', 'video_release_date', 'IMDB_URL', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'age', 'gender',
       'occupation', 'zip_code'],
      dtype='object')

In [7]:
data[["user_id", "item_id", "rating", "timestamp", "movie_id", "movie_title", "release_date", "video_release_date", "IMDB_URL"]]

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,movie_title,release_date,video_release_date,IMDB_URL
0,1,1,5,874965758,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,1,2,3,876893171,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,1,3,4,878542960,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,1,4,3,876893119,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,1,5,3,889751712,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)
...,...,...,...,...,...,...,...,...,...
99995,458,648,4,886395899,648,"Quiet Man, The (1952)",01-Jan-1952,,"http://us.imdb.com/M/title-exact?Quiet%20Man,%..."
99996,458,1101,4,886397931,1101,Six Degrees of Separation (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Six%20Degrees...
99997,459,934,3,879563639,934,"Preacher's Wife, The (1996)",13-Dec-1996,,http://us.imdb.com/M/title-exact?Preacher's%20...
99998,460,10,3,882912371,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...


In [8]:
num_unique_values = test_data.nunique()
num_unique_values

user_id                 459
item_id                1410
rating                    5
timestamp             13983
movie_id               1410
movie_title            1401
release_date            223
video_release_date        0
IMDB_URL               1400
unknown                   2
Action                    2
Adventure                 2
Animation                 2
Children                  2
Comedy                    2
Crime                     2
Documentary               2
Drama                     2
Fantasy                   2
Film-Noir                 2
Horror                    2
Musical                   2
Mystery                   2
Romance                   2
Sci-Fi                    2
Thriller                  2
War                       2
Western                   2
age                      56
gender                    2
occupation               21
zip_code                416
dtype: int64

Drop item_id, timestamp, movie_title, video_release_date, IMDB_URL, zip_code columns because its useless for our analysis.

In [9]:
# Drop useless columns
data.drop(data.columns[[1, 3, 5, 7, 8, 31]], axis=1, inplace=True)
data

Unnamed: 0,user_id,rating,movie_id,release_date,unknown,Action,Adventure,Animation,Children,Comedy,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation
0,1,5,1,01-Jan-1995,0,0,0,1,1,1,...,0,0,0,0,0,0,0,24,M,technician
1,1,3,2,01-Jan-1995,0,1,1,0,0,0,...,0,0,0,0,1,0,0,24,M,technician
2,1,4,3,01-Jan-1995,0,0,0,0,0,0,...,0,0,0,0,1,0,0,24,M,technician
3,1,3,4,01-Jan-1995,0,1,0,0,0,1,...,0,0,0,0,0,0,0,24,M,technician
4,1,3,5,01-Jan-1995,0,0,0,0,0,0,...,0,0,0,0,1,0,0,24,M,technician
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,458,4,648,01-Jan-1952,0,0,0,0,0,1,...,0,0,1,0,0,0,0,47,M,technician
99996,458,4,1101,01-Jan-1993,0,0,0,0,0,0,...,0,1,0,0,0,0,0,47,M,technician
99997,459,3,934,13-Dec-1996,0,0,0,0,0,0,...,0,0,0,0,0,0,0,22,M,student
99998,460,3,10,22-Jan-1996,0,0,0,0,0,0,...,0,0,0,0,0,1,0,44,F,other


Now we need to encode textual values. Make ordinal encoding for occupation column, one-hot encoding for gender column and replace release_year on release_date column.

In [10]:
# Use One-Hot encoder to encode categorial values with a low level of uniqueness
ohe_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
feature = ['gender']
ohe_encoder.fit(data[feature])

new_feat = ohe_encoder.transform(data[feature])
new_cols = pd.DataFrame(new_feat, dtype=int, columns=ohe_encoder.get_feature_names(feature))
data = pd.concat([data, new_cols], axis=1)    
data.drop(feature, axis=1, inplace=True)

# Fit and transform the 'occupation' column
encoder = OrdinalEncoder()
data['occupation'] = encoder.fit_transform(data[['occupation']])

# Convert the 'release_date' column to datetime
data['release_date'] = pd.to_datetime(data['release_date'])

# Extract the year and create a new 'release_year' column, drop ald column
data['release_date'].fillna('01-Jan-1980', inplace=True)
data['release_year'] = data['release_date'].dt.year.astype(int)
data.drop(data.columns[[3]], axis=1, inplace=True)
data



Unnamed: 0,user_id,rating,movie_id,unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Romance,Sci-Fi,Thriller,War,Western,age,occupation,gender_F,gender_M,release_year
0,1,5,1,0,0,0,1,1,1,0,...,0,0,0,0,0,24,19.0,0,1,1995
1,1,3,2,0,1,1,0,0,0,0,...,0,0,1,0,0,24,19.0,0,1,1995
2,1,4,3,0,0,0,0,0,0,0,...,0,0,1,0,0,24,19.0,0,1,1995
3,1,3,4,0,1,0,0,0,1,0,...,0,0,0,0,0,24,19.0,0,1,1995
4,1,3,5,0,0,0,0,0,0,1,...,0,0,1,0,0,24,19.0,0,1,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,458,4,648,0,0,0,0,0,1,0,...,1,0,0,0,0,47,19.0,0,1,1952
99996,458,4,1101,0,0,0,0,0,0,0,...,0,0,0,0,0,47,19.0,0,1,1993
99997,459,3,934,0,0,0,0,0,0,0,...,0,0,0,0,0,22,18.0,0,1,1996
99998,460,3,10,0,0,0,0,0,0,0,...,0,0,0,1,0,44,13.0,1,0,1996


In [11]:
data.isnull().sum()

user_id         0
rating          0
movie_id        0
unknown         0
Action          0
Adventure       0
Animation       0
Children        0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
age             0
occupation      0
gender_F        0
gender_M        0
release_year    0
dtype: int64

In [12]:
train_data = data[:80000]
test_data = data[80000:]
test_data.reset_index(drop=True, inplace=True)

We see that toxic sentences located randomly, so we will make a test(little)/final(big) datasets with all toxic sentences in reference column.

In [13]:
test_data

Unnamed: 0,user_id,rating,movie_id,unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Romance,Sci-Fi,Thriller,War,Western,age,occupation,gender_F,gender_M,release_year
0,1,5,6,0,0,0,0,0,0,0,...,0,0,0,0,0,24,19.0,0,1,1995
1,1,3,10,0,0,0,0,0,0,0,...,0,0,0,1,0,24,19.0,0,1,1996
2,1,5,12,0,0,0,0,0,0,1,...,0,0,1,0,0,24,19.0,0,1,1995
3,1,5,14,0,0,0,0,0,0,0,...,1,0,0,0,0,24,19.0,0,1,1994
4,1,3,17,0,1,0,0,0,1,1,...,0,0,1,0,0,24,19.0,0,1,1996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,458,4,648,0,0,0,0,0,1,0,...,1,0,0,0,0,47,19.0,0,1,1952
19996,458,4,1101,0,0,0,0,0,0,0,...,0,0,0,0,0,47,19.0,0,1,1993
19997,459,3,934,0,0,0,0,0,0,0,...,0,0,0,0,0,22,18.0,0,1,1996
19998,460,3,10,0,0,0,0,0,0,0,...,0,0,0,1,0,44,13.0,1,0,1996


In [14]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Preprocess your data
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data)
test_scaled = scaler.transform(test_data)

# Fit the KNN model
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(train_scaled)

# Make recommendations for a user in the test set
test_user = test_scaled[0]
distances, indices = knn.kneighbors(np.array([test_user]))

# Print out the recommended movie ids
recommended_movie_ids = train_data.iloc[indices[0]]['movie_id']
print(recommended_movie_ids)

7      9
13    19
10    15
36    57
38    59
Name: movie_id, dtype: int64


In [15]:
recommended_movie_ids

7      9
13    19
10    15
36    57
38    59
Name: movie_id, dtype: int64

In [16]:
# Assuming 'knn' is your trained model and 'test_scaled' is your preprocessed test set
recommendations = {}
for i in range(test_scaled.shape[0]):
    distances, indices = knn.kneighbors(np.array([test_scaled[i]]))
    recommended_movie_ids = train_data.iloc[indices[0]]['movie_id'].tolist()
    user_id = test_data.iloc[i]['user_id']
    recommendations[user_id] = recommended_movie_ids

KeyboardInterrupt: 

In [19]:
item1 = item.copy()

# Calculate average ratings for 'gender_F'
female_avg_ratings = data[data['gender_F'] == 1].groupby('movie_id')['rating'].mean()
female_avg_ratings.name = 'Female_avg_rating'
item1 = item1.merge(female_avg_ratings, left_on='movie_id', right_index=True, how='left')

# Calculate average ratings for 'gender_M'
male_avg_ratings = data[data['gender_M'] == 1].groupby('movie_id')['rating'].mean()
male_avg_ratings.name = 'Male_avg_rating'
item1 = item1.merge(male_avg_ratings, left_on='movie_id', right_index=True, how='left')

bins = [7, 10, 14, 18, 25, 35, 45, 55, 65, np.inf]
labels = ['7-10', '11-14', '15-18', '19-25', '26-35', '36-45', '46-55', '56-65', '65+']
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels)

# Calculate average ratings for each age group and merge with 'item1'
for label in labels:
    age_avg_ratings = data[data['age_group'] == label].groupby('movie_id')['rating'].mean()
    age_avg_ratings.name = f'{label}_avg_rating'
    item1 = item1.merge(age_avg_ratings, left_on='movie_id', right_index=True, how='left')
    
# Get the unique occupation types
occupation_types = data['occupation'].unique()

# Calculate average ratings for each occupation type and merge with 'item1'
for occupation in occupation_types:
    occupation_avg_ratings = data[data['occupation'] == occupation].groupby('movie_id')['rating'].mean()
    occupation_avg_ratings.name = f'{occupation}_avg_rating'
    item1 = item1.merge(occupation_avg_ratings, left_on='movie_id', right_index=True, how='left')

# Fill null values with 3
item1 = item1.fillna(3)

item1

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,Children,...,10.0_avg_rating,8.0_avg_rating,1.0_avg_rating,4.0_avg_rating,11.0_avg_rating,12.0_avg_rating,7.0_avg_rating,15.0_avg_rating,16.0_avg_rating,2.0_avg_rating
0,1,Toy Story (1995),01-Jan-1995,3.0,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,3.647059,4.0,4.083333,4.043478,3.8,4.166667,3.000000,3.666667,4.00,3.5
1,2,GoldenEye (1995),01-Jan-1995,3.0,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,3.000000,3.0,3.000000,3.125000,3.0,4.500000,2.666667,2.000000,3.00,3.0
2,3,Four Rooms (1995),01-Jan-1995,3.0,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,2.333333,3.0,3.000000,3.363636,2.0,3.000000,2.000000,3.000000,1.00,3.0
3,4,Get Shorty (1995),01-Jan-1995,3.0,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,3.500000,3.0,4.666667,3.739130,3.5,3.500000,3.000000,3.600000,3.75,3.0
4,5,Copycat (1995),01-Jan-1995,3.0,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,3.333333,3.0,3.000000,3.000000,3.0,4.500000,3.000000,3.500000,4.00,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,3.0,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,3.000000,3.0,3.000000,3.000000,3.0,3.000000,3.000000,3.000000,3.00,3.0
1678,1679,B. Monkey (1998),06-Feb-1998,3.0,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,3.000000,3.0,3.000000,3.000000,3.0,3.000000,3.000000,3.000000,3.00,3.0
1679,1680,Sliding Doors (1998),01-Jan-1998,3.0,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,3.000000,3.0,3.000000,3.000000,3.0,3.000000,3.000000,3.000000,3.00,3.0
1680,1681,You So Crazy (1994),01-Jan-1994,3.0,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,3.000000,3.0,3.000000,3.000000,3.0,3.000000,3.000000,3.000000,3.00,3.0


In [22]:
data

Unnamed: 0,user_id,rating,movie_id,unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Sci-Fi,Thriller,War,Western,age,occupation,gender_F,gender_M,release_year,age_group
0,1,5,1,0,0,0,1,1,1,0,...,0,0,0,0,24,19.0,0,1,1995,19-25
1,1,3,2,0,1,1,0,0,0,0,...,0,1,0,0,24,19.0,0,1,1995,19-25
2,1,4,3,0,0,0,0,0,0,0,...,0,1,0,0,24,19.0,0,1,1995,19-25
3,1,3,4,0,1,0,0,0,1,0,...,0,0,0,0,24,19.0,0,1,1995,19-25
4,1,3,5,0,0,0,0,0,0,1,...,0,1,0,0,24,19.0,0,1,1995,19-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,458,4,648,0,0,0,0,0,1,0,...,0,0,0,0,47,19.0,0,1,1952,46-55
99996,458,4,1101,0,0,0,0,0,0,0,...,0,0,0,0,47,19.0,0,1,1993,46-55
99997,459,3,934,0,0,0,0,0,0,0,...,0,0,0,0,22,18.0,0,1,1996,19-25
99998,460,3,10,0,0,0,0,0,0,0,...,0,0,1,0,44,13.0,1,0,1996,36-45


In [47]:
average_ratings

Unnamed: 0,user_id,gender,age,occupation,1922-1950_avg_rating,1951-1970_avg_rating,1971-1980_avg_rating,1981-1990_avg_rating,1991-1998_avg_rating,male_avg_rating,female_avg_rating


In [53]:
data1 = data.copy()
bins = [1922, 1950, 1970, 1980, 1990, np.inf]
labels = ['1922-1950', '1951-1970', '1971-1980', '1981-1990', '1991-1998']
data1['year_group'] = pd.cut(data1['release_year'], bins=bins, labels=labels)
data1['gender'] = np.where(data1['gender_F']==1, 'female', 'male')

genre_cols = ['unknown','Action', 'Adventure', 'Animation', 'Children',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
          'Sci-Fi', 'Thriller', 'War', 'Western']

# Initialize a DataFrame to store the average ratings
average_ratings = pd.DataFrame(columns=['user_id', 'gender', 'age', 'occupation'] + [f'{label}_avg_rating' for label in labels] + [f'{genre}_avg_rating' for genre in genre_cols])

# For each user
for user_id in data1['user_id'].unique():
    # Select the rows for this user
    user_rows = data1[data1['user_id'] == user_id]
    
    # Get the user's gender, age, and occupation
    gender = "male" if user_rows['gender_M'].iloc[0] == 1 else "female"
    age = user_rows['age'].iloc[0]
    occupation = user_rows['occupation'].iloc[0]
    
    # Calculate the average rating for each release period
    period_avg_ratings = [user_rows[user_rows['year_group'] == label]['rating'].mean() for label in labels]
    
    # Calculate the average rating for each genre
    genre_avg_ratings = [(user_rows[user_rows[genre] == 1]['rating'].mean()) for genre in genre_cols]
    
    # Append the average ratings to the DataFrame
    average_ratings = average_ratings.append(pd.Series([user_id, gender, age, occupation] + period_avg_ratings + genre_avg_ratings, index=average_ratings.columns), ignore_index=True)

# Fill null values with 3
average_ratings = average_ratings.fillna(3)

average_ratings

  average_ratings = average_ratings.append(pd.Series([user_id, gender, age, occupation] + period_avg_ratings + genre_avg_ratings, index=average_ratings.columns), ignore_index=True)


Unnamed: 0,user_id,gender,age,occupation,1922-1950_avg_rating,1951-1970_avg_rating,1971-1980_avg_rating,1981-1990_avg_rating,1991-1998_avg_rating,unknown_avg_rating,...,Fantasy_avg_rating,Film-Noir_avg_rating,Horror_avg_rating,Musical_avg_rating,Mystery_avg_rating,Romance_avg_rating,Sci-Fi_avg_rating,Thriller_avg_rating,War_avg_rating,Western_avg_rating
0,1,male,24,19.0,3.600000,3.133333,4.050000,3.934783,3.521505,4.0,...,3.5,5.0,3.461538,2.923077,3.600000,3.931818,4.000000,3.615385,3.680000,3.666667
1,2,female,53,13.0,3.000000,3.000000,5.000000,3.000000,3.666667,3.0,...,3.0,4.5,3.000000,3.000000,3.500000,4.125000,3.750000,3.583333,3.666667,3.000000
2,3,male,23,20.0,3.000000,3.000000,3.000000,3.000000,2.796296,3.0,...,3.0,2.5,2.400000,2.000000,3.181818,3.400000,2.750000,2.523810,2.800000,3.000000
3,4,male,24,19.0,3.000000,3.000000,4.500000,3.000000,4.380952,3.0,...,3.0,3.0,4.000000,5.000000,4.000000,4.333333,3.833333,3.909091,4.500000,3.000000
4,5,female,33,13.0,3.333333,3.187500,3.000000,3.062500,2.707071,4.0,...,2.5,5.0,2.535714,3.333333,3.000000,2.315789,3.515152,2.947368,3.214286,2.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,939,female,26,18.0,3.000000,3.000000,5.000000,3.000000,4.250000,3.0,...,4.0,3.0,3.000000,4.000000,4.000000,4.800000,4.125000,4.083333,5.000000,3.000000
939,940,male,32,0.0,4.000000,3.571429,3.666667,3.538462,3.375000,3.0,...,3.0,4.0,3.000000,3.142857,4.333333,3.541667,3.052632,3.350000,3.266667,3.000000
940,941,male,20,18.0,3.000000,3.000000,3.000000,3.000000,4.045455,3.0,...,3.0,3.0,3.000000,4.000000,5.000000,5.000000,3.875000,4.000000,5.000000,3.000000
941,942,female,48,10.0,4.600000,4.466667,4.125000,4.700000,4.000000,3.0,...,4.0,5.0,3.666667,4.400000,4.000000,4.411765,4.166667,4.000000,4.700000,4.666667


In [93]:
data1

Unnamed: 0,user_id,rating,movie_id,unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,War,Western,age,occupation,gender_F,gender_M,release_year,age_group,year_group,gender
0,1,5,1,0,0,0,1,1,1,0,...,0,0,24,19.0,0,1,1995,19-25,1991-1998,male
1,1,3,2,0,1,1,0,0,0,0,...,0,0,24,19.0,0,1,1995,19-25,1991-1998,male
2,1,4,3,0,0,0,0,0,0,0,...,0,0,24,19.0,0,1,1995,19-25,1991-1998,male
3,1,3,4,0,1,0,0,0,1,0,...,0,0,24,19.0,0,1,1995,19-25,1991-1998,male
4,1,3,5,0,0,0,0,0,0,1,...,0,0,24,19.0,0,1,1995,19-25,1991-1998,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,458,4,648,0,0,0,0,0,1,0,...,0,0,47,19.0,0,1,1952,46-55,1951-1970,male
99996,458,4,1101,0,0,0,0,0,0,0,...,0,0,47,19.0,0,1,1993,46-55,1991-1998,male
99997,459,3,934,0,0,0,0,0,0,0,...,0,0,22,18.0,0,1,1996,19-25,1991-1998,male
99998,460,3,10,0,0,0,0,0,0,0,...,1,0,44,13.0,1,0,1996,36-45,1991-1998,female


In [89]:
recommendations = {}
for i in range(test_scaled.shape[0]):
    distances, indices = knn.kneighbors(np.array([test_scaled[i]]))
    recommended_movie_ids = train_data.iloc[indices[0]]['movie_id'].tolist()
    user_id = test_data.iloc[i]['user_id']
    recommendations[user_id] = list(set(recommended_movie_ids))

In [88]:
def calculate_film_rating(user_id, movie_id):
    # Get the user and movie data
    user_ratings = average_ratings[average_ratings['user_id'] == user_id]
    movie = item1[item1['movie_id'] == movie_id]

    # Calculate the genre part of the formula
    genre_cols = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    genre_ratings = [movie[genre].values[0] * user_ratings[f'{genre}_avg_rating'].values[0] for genre in genre_cols if movie[genre].values[0] == 1]
    genre_part = sum(genre_ratings) / len(genre_ratings) if genre_ratings else 0

    # Calculate gender rating
    gender_part = movie['Female_avg_rating'].values[0] if user_ratings['gender'].values[0] == "female" else movie['Male_avg_rating'].values[0]
    
    # Calculate age rating
    age = user_ratings["age"].values[0]
    if (age >= 7 and age <= 10):
        age_part = movie['7-10_avg_rating'].values[0]
    elif (age >= 11 and age <= 14):
        age_part = movie['11-14_avg_rating'].values[0]
    elif (age >= 15 and age <= 18):
        age_part = movie['15-18_avg_rating'].values[0]
    elif (age >= 19 and age <= 25):
        age_part = movie['19-25_avg_rating'].values[0]
    elif (age >= 26 and age <= 35):
        age_part = movie['26-35_avg_rating'].values[0]
    elif (age >= 36 and age <= 45):
        age_part = movie['36-45_avg_rating'].values[0]
    elif (age >= 46 and age <= 55):
        age_part = movie['46-55_avg_rating'].values[0]
    elif (age >= 56 and age <= 65):
        age_part = movie['56-65_avg_rating'].values[0]
    else:
        age_part = movie['65+_avg_rating'].values[0]
    
    # Calculate occupation rating
    occupation_part = movie[str(user_ratings["occupation"].values[0]) + "_avg_rating"].values[0]
    
    # Calculate release year rating
    try:
        year = int((movie["release_date"].values[0])[-4:])
    except:
        year = 1980
    if (year >= 1922 and year <= 1950):
        year_part = average_ratings['1922-1950_avg_rating'].values[0]
    elif (year >= 1951 and year <= 1970):
        year_part = average_ratings['1951-1970_avg_rating'].values[0]
    elif (year >= 1971 and year <= 1980):
        year_part = average_ratings['1971-1980_avg_rating'].values[0]
    elif (year >= 1981 and year <= 1990):
        year_part = average_ratings['1981-1990_avg_rating'].values[0]
    elif (year >= 1991 and year <= 1998):
        year_part = average_ratings['1991-1998_avg_rating'].values[0]

    # Calculate the final film rating
    film_rating = 0.6 * genre_part + 0.1 * gender_part + 0.1 * age_part + 0.1 * occupation_part + 0.1 * year_part

    return film_rating

In [90]:
best_ratings = []
worst_ratings = []
for index in recommendations:
    ratings = []
    for i in range(1, 1682):
        ratings.append(calculate_film_rating(index, i))
    ratings.sort()
    best_ratings.append(np.mean(ratings[:5]))
    worst_ratings.append(np.mean(ratings[-5:]))
    print(index, np.mean(ratings[:5]), np.mean(ratings[-5:]))
print(np.mean(best_ratings))
print(np.mean(worst_ratings))

1.0 2.445961060444932 4.514129391847691
2.0 2.4886816269284715 4.175646498978567
3.0 2.1083410138248846 4.588772415756287
4.0 2.7768870967741934 4.664089779932228
5.0 2.218719220913434 4.238324056910939
6.0 2.684067792936376 4.306321654325752
7.0 2.945262248934528 4.326580032910405
8.0 1.9927562301874695 4.463817287665161
9.0 2.6196505376344086 4.606536021291029
10.0 3.1115533110510127 4.457312611667605
11.0 2.5788025880929104 3.928297020765867
12.0 1.3172530554998996 4.490178703961556
13.0 1.6856743471582185 4.007877934667881
14.0 3.077652641421225 4.411942852913575
15.0 1.2571505376344085 3.698415814632166
16.0 2.772150537634408 4.519482434680937
17.0 2.1703172043010754 3.9434082169061546
18.0 2.61741608228144 4.2314988008065875
19.0 2.697452124935996 4.540381431126329
20.0 1.893720430107527 3.8074711719920744
21.0 2.2979166158185196 3.6206453868674315
22.0 1.686152641421225 4.41046468615823
23.0 2.3054160822814405 4.109115043432936
24.0 2.671985974754558 4.57173329543344
25.0 2.7821

197.0 2.3137211084464653 4.248860928761424
198.0 2.2739875108528684 4.679743611227482
199.0 1.81391608228144 3.9515148176835586
200.0 2.2260899315738025 4.39041657416029
201.0 2.311809848513232 3.6914764891513934
202.0 2.058817204301075 4.030812790805711
203.0 2.6356236559139785 4.5076415639482335
204.0 2.3911505376344087 4.6380789210239755
205.0 2.5199297584136295 3.7856985438293456
206.0 1.6045747800586512 3.553773154827168
207.0 2.607160107758957 3.7787308992463955
208.0 1.8825518250310878 4.201896543936212
209.0 1.8654160822814405 4.402747345704387
210.0 2.8593870967741934 4.363387745155
211.0 1.657048161199964 4.000235348666723
212.0 2.21273783922171 4.403987188580011
213.0 2.8729569892473115 4.590030535990804
214.0 2.836555867227676 4.3480908790294075
215.0 2.7127204301075265 4.256769983814215
216.0 2.728958618442489 4.2802586681451595
217.0 1.9008151339077006 3.6322822293419876
218.0 2.7011455426294138 4.079902699946283
219.0 2.6521505376344083 4.503809646733471
220.0 2.64215053

389.0 2.8465347367686076 4.224325332746943
390.0 2.6201505376344087 4.591390764427664
391.0 2.6990136744975457 4.199527394955111
392.0 2.7616743471582184 4.327047703691863
393.0 2.5079398918052496 3.8801716790490333
394.0 2.812150537634408 4.360464206067084
395.0 2.831566122049993 4.536065798735174
396.0 2.21093051414114 4.048393953526026
397.0 2.5289545669384377 4.341053069364922
398.0 2.8295178506866527 4.147507669811058
399.0 2.109947171575503 3.6483641874941157
400.0 2.514126728110599 4.174115929685128
401.0 1.46041608228144 3.9275540364085133
402.0 2.900744868035191 4.532347738015892
403.0 2.605187114584029 4.3266934889094335
404.0 1.9782407405351052 3.850088178889583
405.0 1.5700510230713018 3.2362491551459294
406.0 2.6089882630158234 4.232310416970759
407.0 2.7181783055654023 3.928949369641245
408.0 1.8613172043010753 4.460805914153082
409.0 2.1268287806941384 4.2585732662742455
410.0 2.411916105399976 3.7936423052843202
411.0 2.7052204301075267 4.169736544605834
412.0 2.8068602

In [87]:
best_ratings

[2.2660899315738026]

In [58]:
recommendations[1]
r_best = recommendations.copy()
r_worst = recommendations.copy()
r_random = recommendations.copy()

[282, 340, 365, 303]

In [91]:
counter = 0
summator = 0

for i in range(len(recommendations)):
    if (i+1 != 441 and i+1 != 443):
        for movie_id in recommendations[i+1]:
            counter += 1
            summator += calculate_film_rating(i+1, movie_id)
        
summator/counter

3.4820578314590835

In [92]:

(3.4820578314590835 - 2.43479518969462) /(4.244778095778713 - 2.43479518969462)


0.5786036090419336