In [1]:
#Import 
import os
import numpy as np
import pandas as pd
import jupyter_client
from sklearn.model_selection import train_test_split

In [2]:
#Load the Books dataset
Book_users = os.path.join("..", "Data", "Book Recommendation Raw Dataset", "Users.csv")
Book_ratings = os.path.join("..", "Data", "Book Recommendation Raw Dataset", "Ratings.csv")
Books = os.path.join("..", "Data", "Book Recommendation Raw Dataset", "Books.csv")

#Turn each dataset into a dataframe
Book_users_df = pd.read_csv(Book_users, nrows=10000)
Book_ratings_df = pd.read_csv(Book_ratings, nrows=10000)
Books_df = pd.read_csv(Books, low_memory=False, nrows=10000)

# Remove the 'Image-URL-S', 'Image-URL-M', and 'Image-URL-L' columns
Books_df = Books_df.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)

In [3]:
Books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [4]:
Book_ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
Book_users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
#Start by cleaning the Users Dataframe
Book_users_df.isnull().sum()

User-ID        0
Location       0
Age         3727
dtype: int64

In [7]:
#Since Age is the only column with missing values we can just remove those rows (This might cause issues later around merging)
Book_users_df.dropna(inplace=True)

In [8]:
#Now we deal with the missing values in the Book Ratings df
Book_ratings_df.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [9]:
#No missing values!!!!!

In [10]:
#Now we deal with the books df itself
Books_df.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
dtype: int64

In [11]:
#Since there are only two missing values we are going to replace them with unknown 
#Note here that I think a better option would be to find the missing entires and then look them up online to have all of the correct information
Books_df['Book-Author'].fillna('Unknown', inplace=True)
Books_df['Publisher'].fillna('Unknown', inplace=True)

In [12]:
Books_df.dtypes

ISBN                   object
Book-Title             object
Book-Author            object
Year-Of-Publication     int64
Publisher              object
dtype: object

In [13]:
Book_ratings_df.dtypes

User-ID         int64
ISBN           object
Book-Rating     int64
dtype: object

In [14]:
Book_users_df.dtypes

User-ID       int64
Location     object
Age         float64
dtype: object

In [15]:
Book_ratings_df['Book-Rating'].describe()

count    10000.000000
mean         1.974700
std          3.424884
min          0.000000
25%          0.000000
50%          0.000000
75%          4.000000
max         10.000000
Name: Book-Rating, dtype: float64

In [16]:
#Now we begin to merge the Books Data into one dataframe
# Merge Book_ratings_df with Books_df based on the 'ISBN' column
Books_merged_df = pd.merge(Book_ratings_df, Books_df, on='ISBN', how='inner')

In [17]:
# Merge the resulting DataFrame with Book_users_df based on the 'User-ID' column
Books_final_df = pd.merge(Books_merged_df, Book_users_df, on='User-ID', how='inner')

Books_final_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location,Age
0,99,0451166892,3,The Pillars of the Earth,Ken Follett,1996,Signet Book,"franktown, colorado, usa",42.0
1,99,0786868716,0,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,"franktown, colorado, usa",42.0
2,99,067976397X,0,Corelli's Mandolin : A Novel,LOUIS DE BERNIERES,1995,Vintage,"franktown, colorado, usa",42.0
3,99,0312252617,8,Fast Women,Jennifer Crusie,2001,St. Martin's Press,"franktown, colorado, usa",42.0
4,99,0312261594,8,Female Intelligence,Jane Heller,2001,St. Martin's Press,"franktown, colorado, usa",42.0


In [18]:
#Want to change the location here so that its just the country.
# Split the 'Location' column and extract the country (last part)
Books_final_df['Country'] = Books_final_df['Location'].str.split(',').str[-1].str.strip()

In [19]:
# Drop the original 'Location' column
Books_final_df.drop('Location', axis=1, inplace=True)

In [20]:
#Change the age column into an int rounded up for merging later
Books_final_df['Age'] = Books_final_df['Age'].apply(np.ceil).astype(int)

In [21]:
Books_final_df.isnull().sum()

User-ID                0
ISBN                   0
Book-Rating            0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Age                    0
Country                0
dtype: int64

In [22]:
Books_final_df['Age'].unique()

array([42, 16, 43, 23, 37, 34, 62, 57, 17, 49, 51, 18, 26, 14, 19, 24, 32,
       44, 20, 41, 47, 38, 45, 35, 28, 27, 33, 15, 21])

In [23]:
Books_final_df['Country'].unique()

array(['usa', 'india', 'germany', 'spain', '', 'canada', 'france',
       'united kingdom', 'portugal', 'belgium', 'philippines'],
      dtype=object)

In [24]:
# Remove rows with invalid ages
Books_final_df = Books_final_df[(Books_final_df['Age'] >= 0) & (Books_final_df['Age'] <= 120)]

In [25]:
# Convert 'Age' column to integer
Books_final_df.loc[:, 'Age'] = Books_final_df['Age'].astype(int)

In [26]:
# Convert country names to lowercase and remove leading/trailing whitespace
Books_final_df.loc[:, 'Country'] = Books_final_df['Country'].str.lower().str.strip()

# Handle inconsistent country names and spellings
country_mapping = {
    'usa': 'united states',
    'united state': 'united states',
    'u.s.a.': 'united states',
    'america': 'united states',
    'ysa': 'united states',
    'uk': 'united kingdom',
    'u.k.': 'united kingdom',
    'españa': 'spain',
    'la france': 'france',
    'deutschland': 'germany',
    'italia': 'italy',
    'nz': 'new zealand',
    'phillipines': 'philippines',
    'philippines"': 'philippines',
    'quit': np.nan,
    'n/a': np.nan,
    'n/a - on the road': np.nan,
    '\\"n/a\\""': np.nan,
    'far away...': np.nan,
    'universe': np.nan,
    'everywhere and anywhere': np.nan,
    '': np.nan,
    # Add more mappings as needed
}

Books_final_df.loc[:, 'Country'] = Books_final_df['Country'].map(country_mapping).fillna(Books_final_df['Country'])

In [27]:
Books_final_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,Country
0,99,0451166892,3,The Pillars of the Earth,Ken Follett,1996,Signet Book,42,united states
1,99,0786868716,0,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,42,united states
2,99,067976397X,0,Corelli's Mandolin : A Novel,LOUIS DE BERNIERES,1995,Vintage,42,united states
3,99,0312252617,8,Fast Women,Jennifer Crusie,2001,St. Martin's Press,42,united states
4,99,0312261594,8,Female Intelligence,Jane Heller,2001,St. Martin's Press,42,united states


In [28]:
Books_final_df['Book-Rating'].unique()

array([ 3,  0,  8,  7,  6, 10,  9,  4,  5])

In [29]:
Books_final_df['Year-Of-Publication'].unique()

array([1996, 2003, 1995, 2001, 2002, 1987, 2000, 1986, 1997, 1998, 1994,
       1978, 1980, 2004, 1984,    0, 1981, 1999, 1988, 1993, 1991, 1968,
       1992, 1983, 1952, 1979, 1990, 1989])

In [30]:
# Remove non-numeric values and invalid years
#Books_final_df = Books_final_df[Books_final_df['Year-Of-Publication'].str.isdigit()]
#Books_final_df = Books_final_df[Books_final_df['Year-Of-Publication'].astype(int) <= 2023]

In [31]:
# Convert 'Year-Of-Publication' column to integer
Books_final_df.loc[:, 'Year-Of-Publication'] = Books_final_df['Year-Of-Publication'].astype(int)

In [32]:
Books_final_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,Country
0,99,0451166892,3,The Pillars of the Earth,Ken Follett,1996,Signet Book,42,united states
1,99,0786868716,0,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,42,united states
2,99,067976397X,0,Corelli's Mandolin : A Novel,LOUIS DE BERNIERES,1995,Vintage,42,united states
3,99,0312252617,8,Fast Women,Jennifer Crusie,2001,St. Martin's Press,42,united states
4,99,0312261594,8,Female Intelligence,Jane Heller,2001,St. Martin's Press,42,united states


In [33]:
Books_final_df = Books_final_df.rename(columns={'Age': 'age', 'Country':'country'})

In [34]:
#Now we will start to look at the movies dataset
# File paths
movies_file = os.path.join("..", "Data", "Movie Recommendation Raw Dataset", "movies.dat")
ratings_file = os.path.join("..", "Data", "Movie Recommendation Raw Dataset", "ratings.dat")
users_file = os.path.join("..", "Data", "Movie Recommendation Raw Dataset", "users.dat")

# Read the first few lines of each file to check the encoding as utf-8 did not work
with open(movies_file, 'r', encoding='latin1') as file:
    print("movies.dat:")
    for _ in range(5):
        print(file.readline().strip())

with open(ratings_file, 'r', encoding='latin1') as file:
    print("\nratings.dat:")
    for _ in range(5):
        print(file.readline().strip())

with open(users_file, 'r', encoding='latin1') as file:
    print("\nusers.dat:")
    for _ in range(5):
        print(file.readline().strip())

movies.dat:
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy

ratings.dat:
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291

users.dat:
1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455


In [35]:
# Read movies.dat
movies_df = pd.read_csv(movies_file, delimiter='::', header=None, names=['movie_id', 'title', 'genres'], encoding='latin1', engine='python', nrows=10000)

# Read ratings.dat
movie_ratings_df = pd.read_csv(ratings_file, delimiter='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], encoding='latin1', engine='python', nrows=10000)

# Read users.dat
movie_users_df = pd.read_csv(users_file, delimiter='::', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], encoding='latin1', engine='python', nrows=10000)


In [36]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [37]:
movie_ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [38]:
movie_users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [39]:
movies_df.isnull().sum()

movie_id    0
title       0
genres      0
dtype: int64

In [40]:
movies_df.duplicated(subset='movie_id').sum()

0

In [41]:
movies_df.duplicated().sum()

0

In [42]:
movies_df.dtypes

movie_id     int64
title       object
genres      object
dtype: object

In [43]:
movie_ratings_df.isnull().sum()

user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

In [44]:
movie_ratings_df['rating'].describe()

count    10000.000000
mean         3.633900
std          1.077213
min          1.000000
25%          3.000000
50%          4.000000
75%          4.000000
max          5.000000
Name: rating, dtype: float64

In [45]:
movie_ratings_df.dtypes

user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object

In [46]:
movie_users_df.isnull().sum()

user_id       0
gender        0
age           0
occupation    0
zip_code      0
dtype: int64

In [47]:
movie_users_df.duplicated(subset='user_id').sum()

0

In [48]:
movie_users_df.dtypes

user_id        int64
gender        object
age            int64
occupation     int64
zip_code      object
dtype: object

In [49]:
# Merge movie_users_df with movie_ratings_df
movies_merged_df = pd.merge(movie_users_df, movie_ratings_df, on='user_id', how='left')

In [50]:
# Merge the resulting DataFrame with movies_df
movies_final_df = pd.merge(movies_merged_df, movies_df, on='movie_id', how='left')

movies_final_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code,movie_id,rating,timestamp,title,genres
0,1,F,1,10,48067,1193.0,5.0,978300760.0,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,F,1,10,48067,661.0,3.0,978302109.0,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,F,1,10,48067,914.0,3.0,978301968.0,My Fair Lady (1964),Musical|Romance
3,1,F,1,10,48067,3408.0,4.0,978300275.0,Erin Brockovich (2000),Drama
4,1,F,1,10,48067,2355.0,5.0,978824291.0,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [51]:
movies_final_df.isnull().sum()

user_id          0
gender           0
age              0
occupation       0
zip_code         0
movie_id      5970
rating        5970
timestamp     5970
title         5970
genres        5970
dtype: int64

In [52]:
movies_final_df['gender'].unique()

array(['F', 'M'], dtype=object)

In [53]:
movies_final_df['age'].unique()

array([ 1, 56, 25, 45, 50, 35, 18])

In [54]:
movies_final_df['rating'].unique()

array([ 5.,  3.,  4.,  2.,  1., nan])

In [55]:
#Lastly we look at the music dataset
# File paths
music_file = os.path.join("..", "Data", "Music Recommendation Raw Dataset", "usersha1-artmbid-artname-plays.tsv")
users_file = os.path.join("..", "Data", "Music Recommendation Raw Dataset", "usersha1-profile.tsv")

In [56]:
# Read music.tsv file
music_df = pd.read_csv(music_file, sep='\t', header=None, names=['user_id', 'artist_id', 'artist_name', 'plays'], nrows=10000)


# Read users.tsv file
music_users_df = pd.read_csv(users_file, sep='\t', header=None, names=['user_id', 'gender', 'age', 'country', 'signup'], nrows=10000)

In [57]:
music_df.head()

Unnamed: 0,user_id,artist_id,artist_name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [58]:
music_users_df.head()

Unnamed: 0,user_id,gender,age,country,signup
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"


In [59]:
music_df['plays'] = music_df['plays'].astype(int)
music_users_df['age'] = pd.to_numeric(music_users_df['age'], errors='coerce')
music_users_df['signup'] = pd.to_datetime(music_users_df['signup'], format='%b %d, %Y', errors='coerce')
music_users_df.replace('', np.nan, inplace=True)

In [60]:
music_df.isnull().sum()

user_id          0
artist_id      123
artist_name      0
plays            0
dtype: int64

In [61]:
music_users_df.isnull().sum()

user_id       0
gender      941
age        2107
country       0
signup        0
dtype: int64

In [62]:
# Drop unwanted columns from music
music_df = music_df[['user_id', 'artist_name', 'plays']]

# Drop unwanted columns from music_users
music_users_df = music_users_df[['user_id', 'age', 'country']]

In [63]:
music_df.isnull().sum()

user_id        0
artist_name    0
plays          0
dtype: int64

In [64]:
music_users_df.isnull().sum()

user_id       0
age        2107
country       0
dtype: int64

In [65]:
# Handle missing values
music_df.dropna(inplace=True)
music_users_df.dropna(inplace=True)

In [66]:
# Merge on user_id
music_final_df = pd.merge(music_df, music_users_df, on='user_id', how='inner')

In [67]:
music_final_df.head()

Unnamed: 0,user_id,artist_name,plays,age,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,22.0,Germany
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,22.0,Germany
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,22.0,Germany
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,22.0,Germany
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,22.0,Germany


In [68]:
music_final_df['country'].unique()

array(['Germany', 'Mexico', 'United States', 'United Kingdom', 'Finland',
       'Poland', 'Spain', 'Sweden', 'Portugal', 'Australia', 'Belgium',
       'Guatemala', 'Netherlands', 'Bulgaria', 'Italy', 'Austria',
       'Korea, Republic of', 'Brazil', 'South Africa', 'Japan', 'Greece',
       'Russian Federation', 'Iran, Islamic Republic of', 'Ukraine',
       'Chile', 'Thailand', 'Norway', 'Slovakia', 'Turkey',
       'Czech Republic', 'France', 'El Salvador', 'Belarus', 'Canada',
       'Estonia', 'Uruguay', 'Lithuania', 'Ireland', 'Croatia',
       'Argentina', 'Lebanon'], dtype=object)

In [69]:
music_country_mapping = {
    'united states': 'united states',
    'united kingdom': 'united kingdom',
    'korea, republic of': 'south korea',
    'korea, democratic people\'s republic of': 'north korea',
    'iran, islamic republic of': 'iran',
    'syrian arab republic': 'syria',
    'virgin islands, u.s.': 'united states virgin islands',
    'tanzania, united republic of': 'tanzania',
    'virgin islands, british': 'british virgin islands',
    'united states minor outlying islands': 'united states',
    'palestinian territory, occupied': 'palestine',
    'holy see (vatican city state)': 'vatican city',
    "cote d'ivoire": 'ivory coast',
    'congo, the democratic republic of the': 'democratic republic of the congo',
    'falkland islands (malvinas)': 'falkland islands',
    'saint kitts and nevis': 'saint kitts and nevis',
    'marshall islands': 'marshall islands',
}

music_final_df['country'] = music_final_df['country'].str.lower().map(music_country_mapping).fillna(music_final_df['country'].str.lower())

In [70]:
music_final_df['age'].unique()

array([22., 19., 28., 20., 17., 24., 27., 23., 18., 30., 26., 14., 34.,
       31., 21., 32., 16., 47., 29., 54., 33., 57., 36., 25., 45., 15.,
       38., 43.])

In [71]:
# Convert age values to integers
music_final_df['age'] = music_final_df['age'].astype(int)

# Remove negative or unrealistic age values
music_final_df = music_final_df[(music_final_df['age'] >= 0) & (music_final_df['age'] <= 120)]

In [72]:
music_final_df.head()

Unnamed: 0,user_id,artist_name,plays,age,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,22,germany
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,22,germany
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,22,germany
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,22,germany
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,22,germany


In [73]:
music_final_df.dtypes

user_id        object
artist_name    object
plays           int64
age             int64
country        object
dtype: object

In [74]:
movies_final_df.dtypes

user_id         int64
gender         object
age             int64
occupation      int64
zip_code       object
movie_id      float64
rating        float64
timestamp     float64
title          object
genres         object
dtype: object

In [75]:
Books_final_df.dtypes

User-ID                 int64
ISBN                   object
Book-Rating             int64
Book-Title             object
Book-Author            object
Year-Of-Publication     int64
Publisher              object
age                     int64
country                object
dtype: object

In [76]:
# Create user-item matrix for books
books_matrix = Books_final_df.reset_index().pivot_table(index='index', columns='Book-Title', values='Book-Rating', fill_value=0)
books_matrix = books_matrix / 10.0  # Normalize book ratings to 0-1 range


# Create user-item matrix for music
music_matrix = music_final_df.reset_index().pivot_table(index='index', columns='artist_name', values='plays', fill_value=0)
music_matrix = music_matrix / music_matrix.max().max()  # Normalize music plays to 0-1 range


# Create user-item matrix for movies
movies_matrix = movies_final_df.reset_index().pivot_table(index='index', columns='title', values='rating', fill_value=0)
movies_matrix = movies_matrix / 5.0  # Normalize movie ratings to 0-1 range

In [77]:
# Merge user-item matrices
merged_matrix = pd.merge(books_matrix, music_matrix, left_index=True, right_index=True, how='outer')
merged_matrix = pd.merge(merged_matrix, movies_matrix, left_index=True, right_index=True, how='outer')
merged_matrix.fillna(0, inplace=True)

In [78]:
merged_matrix.head()

Unnamed: 0_level_0,A Kiss Remembered,All He Ever Wanted: A Novel,Always Daddy's Girl: Understanding Your Father's Impact on Who You Are,Angels &amp; Demons,Atonement : A Novel,Before I Say Good-Bye,Black Beauty (Illustrated Classics),Bless The Beasts And Children : Bless The Beasts And Children,Blood Oath,Breathing Lessons,...,Yojimbo (1961),You Can't Take It With You (1938),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),Young Sherlock Holmes (1985),Your Friends and Neighbors (1998),Zero Effect (1998),eXistenZ (1999)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
merged_matrix.describe()

Unnamed: 0,A Kiss Remembered,All He Ever Wanted: A Novel,Always Daddy's Girl: Understanding Your Father's Impact on Who You Are,Angels &amp; Demons,Atonement : A Novel,Before I Say Good-Bye,Black Beauty (Illustrated Classics),Bless The Beasts And Children : Bless The Beasts And Children,Blood Oath,Breathing Lessons,...,Yojimbo (1961),You Can't Take It With You (1938),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),Young Sherlock Holmes (1985),Your Friends and Neighbors (1998),Zero Effect (1998),eXistenZ (1999)
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,8e-05,7e-05,0.0,0.0001,0.0,0.0,0.0,5e-05,7e-05,0.0,...,0.00038,8e-05,0.00072,0.00116,0.00036,0.00028,0.00028,0.0001,0.0002,0.00026
std,0.008,0.007,0.0,0.01,0.0,0.0,0.0,0.005,0.007,0.0,...,0.018218,0.008,0.023655,0.031855,0.016489,0.01414,0.012959,0.01,0.012328,0.012488
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.8,0.7,0.0,1.0,0.0,0.0,0.0,0.5,0.7,0.0,...,1.0,0.8,1.0,1.0,1.0,0.8,0.8,1.0,1.0,0.8


In [80]:
#Formula for calculating sparsity
sparsity = 1.0 - (merged_matrix.astype(bool).sum().sum() / (merged_matrix.shape[0] * merged_matrix.shape[1]))
print(f"Sparsity: {sparsity:.2%}")

Sparsity: 99.97%


I will need to think of a way to deal with this sparsity later

In [81]:
# Create a copy of the merged matrix for implicit ratings
merged_matrix_implicit = merged_matrix.copy()

# Set non-zero values to 1 (implicit positive interaction)
merged_matrix_implicit[merged_matrix_implicit > 0] = 1

In [82]:
merged_matrix_implicit.describe()

Unnamed: 0,A Kiss Remembered,All He Ever Wanted: A Novel,Always Daddy's Girl: Understanding Your Father's Impact on Who You Are,Angels &amp; Demons,Atonement : A Novel,Before I Say Good-Bye,Black Beauty (Illustrated Classics),Bless The Beasts And Children : Bless The Beasts And Children,Blood Oath,Breathing Lessons,...,Yojimbo (1961),You Can't Take It With You (1938),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),Young Sherlock Holmes (1985),Your Friends and Neighbors (1998),Zero Effect (1998),eXistenZ (1999)
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.0001,0.0001,0.0,0.0001,0.0,0.0,0.0,0.0001,0.0001,0.0,...,0.0005,0.0001,0.001,0.0014,0.0005,0.0004,0.0005,0.0001,0.0003,0.0005
std,0.01,0.01,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,...,0.022356,0.01,0.031609,0.037392,0.022356,0.019997,0.022356,0.01,0.017319,0.022356
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [83]:
# Define the path to the directory where you want to save the CSV files
save_directory = os.path.join("..", "Data", "Cleaned-Data")

In [84]:
# Define the file path
implicit_path = os.path.join(save_directory, 'merged_matrix_implicit.csv')

In [86]:
# Save the data to CSV files
merged_matrix_implicit.to_csv(implicit_path, index=False)

In [89]:
# Verify shape
print(f"Implicit Matrix Shape: {merged_matrix_implicit.shape}")

Training Data Shape: (8000, 6855)
Testing Data Shape: (2000, 6855)
Implicit Matrix Shape: (10000, 6855)
