# 1) Data Collection & Exploring

In [1]:
import os

# Base path to the dataset directory.
# The original author's location is kept as the default; set the IRS_DATA_DIR
# environment variable to run the notebook on another machine without editing it.
path = os.environ.get('IRS_DATA_DIR', 'C:/Users/Hamed/Desktop/asgmnt 1 (IRS)')

Importing libraries

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

The datasets are provided as `.dat` files, which do not fit the rest of the workflow. Hence, each dataset is converted once from `.dat` to `.csv`.

In [None]:
# Locations of the raw '::'-delimited source files.
movies_path = path + '/dataset/movies.dat'
ratings_path = path + '/dataset/ratings.dat'
users_path = path + '/dataset/users.dat'

# Reader settings shared by all three files; column headers are supplied
# explicitly through `names=` on each call.
dat_reader_options = dict(delimiter='::', engine='python', encoding='ISO-8859-1')

movies = pd.read_csv(
    movies_path,
    names=['MovieID', 'Title', 'Genres'],
    **dat_reader_options,
)
ratings = pd.read_csv(
    ratings_path,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **dat_reader_options,
)
users = pd.read_csv(
    users_path,
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    **dat_reader_options,
)

# Write each table next to its source as CSV, without the row index.
movies.to_csv(path + '/dataset/movies.csv', index=False)
ratings.to_csv(path + '/dataset/ratings.csv', index=False)
users.to_csv(path + '/dataset/users.csv', index=False)

Exploring movies dataset

In [3]:
# Load the converted movies table from CSV.
movies = pd.read_csv( path + '/dataset/movies.csv') 
movies.shape         # (rows, columns): 3883 rows (one per movie) and 3 columns

(3883, 3)

In [4]:
# Preview the first five rows of the movies table.
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.info() # column names, non-null counts and dtypes of the movies table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [6]:
# Distinct MovieID values. Their count matches the number of rows (3883),
# so every MovieID appears exactly once in the movies table.
data1 = movies['MovieID'].unique().tolist()
len(data1)

3883

Now, with the ratings dataset

In [7]:
# Load the converted ratings table from CSV and show its (rows, columns) shape.
ratings = pd.read_csv( path + '/dataset/ratings.csv') 
ratings.shape

(1000209, 4)

In [8]:
# Preview the first five rows of the ratings table.
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
# Column names, non-null counts and dtypes of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [10]:
ratings.describe().T # summary statistics for each column of ratings.csv (transposed: one row per column)
# As shown, the minimum rating is 1 and the maximum is 5 (ordinal rating scale)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UserID,1000209.0,3024.512,1728.413,1.0,1506.0,3070.0,4476.0,6040.0
MovieID,1000209.0,1865.54,1096.041,1.0,1030.0,1835.0,2770.0,3952.0
Rating,1000209.0,3.581564,1.117102,1.0,3.0,4.0,4.0,5.0
Timestamp,1000209.0,972243700.0,12152560.0,956703932.0,965302637.0,973018006.0,975220939.0,1046455000.0


Finally, the users dataset

In [11]:
# Load the converted users table from CSV and show its (rows, columns) shape.
users = pd.read_csv( path + '/dataset/users.csv') 
users.shape

(6040, 5)

In [12]:
# Preview the first five rows of the users table.
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [13]:
# Column names, non-null counts and dtypes of the users table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [None]:
# Smallest value stored in the Age column.
# NOTE(review): in MovieLens 1M, Age appears to be a bracket code
# (1, 18, 25, 35, 45, 50, 56) rather than an exact age - confirm against the dataset README.
users['Age'].min()

1

In [15]:
# Largest value stored in the Age column.
# NOTE(review): presumably the code of the oldest age bracket, not a literal age - confirm against the dataset README.
users['Age'].max()

56

# 2) Data Preprocessing

# 2.1 Data Cleaning

Checking whether there are any NaN values

In [16]:
# Per column: True if the movies table has at least one missing value in it.
movies.isnull().any()

MovieID    False
Title      False
Genres     False
dtype: bool

In [17]:
# Per column: True if the ratings table has at least one missing value in it.
ratings.isnull().any()

UserID       False
MovieID      False
Rating       False
Timestamp    False
dtype: bool

In [18]:
users.isnull().any() # as displayed, no column contains any NaN values

UserID        False
Gender        False
Age           False
Occupation    False
Zip-code      False
dtype: bool

# 2.2 Data Visualization

Displaying how often each genre occurs across all movies

In [19]:
# A movie's Genres field is a '|'-separated list, so it is expanded into one
# indicator column per genre; summing each column counts the movies tagged with it.
genre_counts = (
    movies['Genres']
    .str.get_dummies(sep='|')
    .sum()
    .rename_axis('Genre')
    .reset_index(name='Count')
)

fig = px.bar(
    genre_counts,
    x='Genre',
    y='Count',
    title='Distribution of Movie Genres',
    labels={'Count': 'Number of Movies'},
    color='Count',
    color_continuous_scale='Viridis',
)
fig.show()

Displaying average rating by gender

In [20]:
# Attach each rating to its user's demographics, then average the ratings per gender.
average_rating_gender = (
    ratings.merge(users, on='UserID')
    .groupby('Gender')['Rating']
    .mean()
    .reset_index()
)

# One bar colour per result row, in row order: cyan for 'M', pink otherwise.
colors = (
    average_rating_gender['Gender']
    .map(lambda g: 'cyan' if g == 'M' else 'pink')
    .tolist()
)

fig = px.bar(
    average_rating_gender,
    x='Gender',
    y='Rating',
    title='Average Rating by Gender',
    labels={'Rating': 'Average Rating'},
    color=average_rating_gender['Gender'],
    color_discrete_sequence=colors,
)
fig.show()


Displaying Average Rating based on Age group

In [21]:
# In MovieLens 1M the Age column holds a bracket code (the bracket's lower
# bound), not an exact age, so each code is mapped to its documented bracket.
# Range-binning the codes with right-closed intervals would put code 18
# ("18-24") into a "0-18" bucket and shift every later bracket by one label.
age_code_labels = {
    1: 'Under 18',
    18: '18-24',
    25: '25-34',
    35: '35-44',
    45: '45-49',
    50: '50-55',
    56: '56+',
}
age_group_order = list(age_code_labels.values())

# Ordered categorical so group-by results and the chart follow age order
# instead of alphabetical order. Codes missing from the mapping become NaN.
users['AgeGroup'] = pd.Categorical(
    users['Age'].map(age_code_labels),
    categories=age_group_order,
    ordered=True,
)

# Mean rating per age bracket; observed=True keeps only brackets that occur.
average_rating_age = (
    ratings.merge(users, on='UserID')
    .groupby('AgeGroup', observed=True)['Rating']
    .mean()
    .reset_index()
)
fig = px.bar(average_rating_age, x='AgeGroup', y='Rating',
             title='Average Rating by Age Group', labels={'Rating': 'Average Rating'},
             category_orders={'AgeGroup': age_group_order})
fig.show()





# 3) Applying Collaborative Filtering algorithms. 

User-item Matrix

In [None]:
# Rows are users, columns are movies, cells are ratings;
# user/movie pairs without a rating are stored as 0.
user_item_matrix = (
    ratings
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)
print(user_item_matrix)

# Keep a copy on disk next to the other dataset files.
user_item_matrix.to_csv(path + '/dataset/user_item_matrix.csv')

Similarity Calculation (Cosine Similarity & Pearson Similarity) between users

In [None]:
# user_item_matrix (users x movies, unrated = 0) is built and written to CSV by
# the preceding "User-item Matrix" cell, so it is reused here rather than being
# pivoted and saved a second time.

# Cosine similarity between every pair of user rating vectors.
cosine_similarities = cosine_similarity(user_item_matrix)

cosine_similarity_df = pd.DataFrame(cosine_similarities,
                                    index=user_item_matrix.index,
                                    columns=user_item_matrix.index)
print(cosine_similarity_df)

# Pearson correlation between users: corr() correlates columns, hence the transpose.
# NOTE(review): because unrated cells were filled with 0, the correlation is taken
# over all movies rather than only co-rated ones; the user-based recommendations
# printed below are identical for cosine and Pearson. Consider computing Pearson on
# the un-filled pivot (with min_periods) if a co-rated Pearson is intended.
pearson_similarity_df = user_item_matrix.T.corr(method='pearson')
print(pearson_similarity_df)

cosine_similarity_df.to_csv( path + '/dataset/cosine_similarity.csv')
pearson_similarity_df.to_csv(path + '/dataset/pearson_similarity.csv')

User-based CF

In [None]:
def get_recommendations(user_id, similarity_matrix, user_item_matrix, num_recommendations, num_neighbors=None):
    """Recommend unrated items to a user from the ratings of their most similar users.

    Parameters
    ----------
    user_id
        Label of the target user; must be present in both matrices.
    similarity_matrix : pandas.DataFrame
        Square user-by-user similarity table, indexed and columned by user label.
    user_item_matrix : pandas.DataFrame
        Users in rows, items in columns; 0 marks an item the user has not rated.
    num_recommendations : int
        Maximum number of items to return.
    num_neighbors : int, optional
        Number of most-similar users to average over. Defaults to
        ``num_recommendations``, which is the neighbourhood size this function
        has always used.

    Returns
    -------
    pandas.Series
        Mean neighbour rating (unrated counted as 0) for items the target user
        has not rated, sorted from highest to lowest.

    Raises
    ------
    ValueError
        If ``user_id`` is missing from either matrix.
    """
    if user_id not in similarity_matrix.index:
        raise ValueError(f"User ID {user_id} not found in the similarity matrix")
    if user_id not in user_item_matrix.index:
        raise ValueError(f"User ID {user_id} not found in the user-item matrix")

    if num_neighbors is None:
        num_neighbors = num_recommendations

    # Remove the target user by label instead of assuming they sort first:
    # another user with an equal top similarity could otherwise take position 0,
    # leaving the target user inside their own neighbourhood.
    neighbor_similarity = similarity_matrix[user_id].drop(index=user_id)
    similar_users = neighbor_similarity.sort_values(ascending=False).head(num_neighbors).index

    # Column-wise mean over the neighbours' rows; zeros (unrated) are included.
    similar_users_ratings = user_item_matrix.loc[similar_users].mean()
    user_ratings = user_item_matrix.loc[user_id]

    # Only items the target user has not rated yet are candidates.
    recommendations = similar_users_ratings[user_ratings == 0].sort_values(ascending=False).head(num_recommendations)
    return recommendations
# Target user and number of recommendations for the demo run.
user_id = 1
num_recommendations = 10

try:
    # Compute both result sets first so nothing is printed if either lookup fails.
    recommendations_cosine = get_recommendations(
        user_id,
        cosine_similarity_df,
        user_item_matrix,
        num_recommendations,
    )
    recommendations_pearson = get_recommendations(
        user_id,
        pearson_similarity_df,
        user_item_matrix,
        num_recommendations,
    )

    print("Recommendations using Cosine Similarity:", recommendations_cosine, sep="\n")
    print("Recommendations using Pearson Similarity:", recommendations_pearson, sep="\n")
except ValueError as err:
    # Unknown user: report the message instead of a traceback.
    print(err)

Recommendations using Cosine Similarity:
MovieID
2081    4.3
2078    3.6
364     3.3
2096    3.2
1282    3.0
593     2.9
596     2.8
2137    2.6
2085    2.6
2087    2.5
dtype: float64
Recommendations using Pearson Similarity:
MovieID
2081    4.3
2078    3.6
364     3.3
2096    3.2
1282    3.0
593     2.9
596     2.8
2137    2.6
2085    2.6
2087    2.5
dtype: float64


Similarity Calculation (Cosine Similarity & Pearson Similarity) between items

In [None]:
# Rows are movies, columns are users, cells are ratings; missing ratings become 0.
item_user_matrix = (
    ratings
    .pivot(index='MovieID', columns='UserID', values='Rating')
    .fillna(0)
)
item_user_matrix.to_csv( path + '/dataset/item_user_matrix.csv')

# Cosine similarity between every pair of movie rating vectors.
# NOTE: cosine_similarity_df and pearson_similarity_df are the same names the
# user-based section assigns; from this cell on they hold movie-to-movie values.
cosine_similarities = cosine_similarity(item_user_matrix)
movie_ids = item_user_matrix.index
cosine_similarity_df = pd.DataFrame(cosine_similarities, index=movie_ids, columns=movie_ids)
print(cosine_similarity_df)

# corr() correlates columns, so the matrix is transposed to put movies in columns.
pearson_similarity_df = item_user_matrix.T.corr(method='pearson')
print(pearson_similarity_df)

cosine_similarity_df.to_csv( path + '/dataset/item_cosine_similarity.csv')
pearson_similarity_df.to_csv( path + '/dataset/item_pearson_similarity.csv')

MovieID      1         2         3         4         5         6         7     \
MovieID                                                                         
1        1.000000  0.390349  0.267943  0.178789  0.256569  0.347373  0.301490   
2        0.390349  1.000000  0.240946  0.155457  0.249970  0.244827  0.262772   
3        0.267943  0.240946  1.000000  0.192788  0.308290  0.187020  0.292230   
4        0.178789  0.155457  0.192788  1.000000  0.271990  0.125170  0.220024   
5        0.256569  0.249970  0.308290  0.271990  1.000000  0.148114  0.305107   
...           ...       ...       ...       ...       ...       ...       ...   
3948     0.309676  0.213650  0.190575  0.118902  0.174554  0.236447  0.191689   
3949     0.186633  0.140781  0.104837  0.096318  0.092403  0.201419  0.117660   
3950     0.093479  0.087013  0.062258  0.022588  0.051633  0.115331  0.059262   
3951     0.042829  0.026063  0.010073  0.024769  0.010750  0.029136  0.036102   
3952     0.182691  0.122185 

Item-based CF

In [None]:
def get_item_recommendations(user_id, item_similarity_matrix, user_item_matrix, num_recommendations):
    """Recommend unrated items to a user from the items they have already rated.

    Each candidate item is scored as the sum, over the items the user rated, of
    (similarity to that rated item) x (the user's rating of it). Scores are
    unnormalised sums, so they rank items but are not on the rating scale.

    Parameters
    ----------
    user_id
        Label of the target user; must be present in ``user_item_matrix``.
    item_similarity_matrix : pandas.DataFrame
        Square item-by-item similarity table, indexed and columned by item label.
    user_item_matrix : pandas.DataFrame
        Users in rows, items in columns; values > 0 are ratings, 0 means unrated.
    num_recommendations : int
        Maximum number of items to return.

    Returns
    -------
    pandas.Series
        Scores of the best-scoring items the user has not rated, highest first.
        Empty when the user has no ratings.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``user_item_matrix``.
    """
    if user_id not in user_item_matrix.index:
        raise ValueError(f"User ID {user_id} not found in the user-item matrix")

    user_ratings = user_item_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings > 0].index
    if len(rated_items) == 0:
        # Nothing to base a score on.
        return pd.Series(dtype='float64')

    # One similarity column per rated item, each scaled by the user's rating of
    # that item, then summed across columns. This replaces a per-item loop that
    # sorted every column (the order is irrelevant to a label-aligned sum) and
    # re-aligned a running Series on each pass.
    weighted_similarities = item_similarity_matrix[rated_items].mul(
        user_ratings.loc[rated_items], axis='columns'
    )
    # min_count=1: skip NaN similarities, but stay NaN if every term is NaN.
    item_scores = weighted_similarities.sum(axis=1, min_count=1)

    # Never recommend something the user has already rated.
    item_scores = item_scores[~item_scores.index.isin(rated_items)]

    recommendations = item_scores.sort_values(ascending=False).head(num_recommendations)
    return recommendations

# Target user and number of recommendations for the demo run.
user_id = 1
num_recommendations = 10

try:
    # Compute both result sets first so nothing is printed if the lookup fails.
    recommendations_cosine = get_item_recommendations(
        user_id,
        cosine_similarity_df,
        user_item_matrix,
        num_recommendations,
    )
    recommendations_pearson = get_item_recommendations(
        user_id,
        pearson_similarity_df,
        user_item_matrix,
        num_recommendations,
    )

    print("Item-Based Recommendations using Cosine Similarity:", recommendations_cosine, sep="\n")
    print("Item-Based Recommendations using Pearson Similarity:", recommendations_pearson, sep="\n")
except ValueError as err:
    # Unknown user: report the message instead of a traceback.
    print(err)

Item-Based Recommendations using Cosine Similarity:
MovieID
1196    90.220575
364     87.477004
1198    86.821730
1265    86.334944
318     84.839760
2716    84.231164
2081    84.163843
1210    84.091455
593     83.944310
2987    82.569072
dtype: float64
Item-Based Recommendations using Pearson Similarity:
MovieID
364     60.954215
2081    58.396864
2096    55.293823
2080    55.090734
2087    54.978783
2078    54.308232
1282    54.270739
596     53.630219
1073    51.711382
2137    50.472960
dtype: float64
