Import the pandas, requests, and re libraries

In [15]:
import pandas as pd
import requests, re

# Load data
We read the ratings and books CSV files and load each one into a pandas DataFrame.

In [16]:
# Read both source tables from the training/ folder in one pass:
# the first path becomes `ratings`, the second becomes `books`.
ratings, books = (
    pd.read_csv(csv_path)
    for csv_path in ('training/ratings.csv', 'training/books.csv')
)

We print the first few records of the ratings table, followed by summary statistics for each of its columns, for a quick examination.

In [17]:
# Show the first rows of the ratings table, then its per-column summary statistics.
for preview in (ratings.head(), ratings.describe()):
    print(preview)

   book_id  user_id  rating
0        1      314       5
1        1      439       3
2        1      588       5
3        1     1169       4
4        1     1185       4
             book_id        user_id         rating
count  981756.000000  981756.000000  981756.000000
mean     4943.275636   25616.759933       3.856534
std      2873.207415   15228.338826       0.983941
min         1.000000       1.000000       1.000000
25%      2457.000000   12372.000000       3.000000
50%      4921.000000   25077.000000       4.000000
75%      7414.000000   38572.000000       5.000000
max     10000.000000   53424.000000       5.000000


In [18]:
# Number of ratings submitted by each user (index: user_id).
user_ratings = ratings.groupby('user_id')['rating'].count()
# The same per-user counts as produced by value_counts (ordered by count);
# the user-filtering cell further down reads this Series.
user_rating_counts = ratings['user_id'].value_counts()
# Number of users that share each distinct rating count.
users_with_ratings = user_rating_counts.groupby(user_ratings).count()

# Exclusive upper bounds of the rating-count buckets: 5, 10, ..., 95.
rating_thresholds = list(range(5, 100, 5))

# Derive the size of the user base from the data rather than hard-coding it,
# so the percentages stay correct if the ratings file changes.
total_users = user_ratings.size

# Users with strictly fewer than each threshold (cumulative) ...
cumulative_counts = [
    int((user_ratings < threshold).sum()) for threshold in rating_thresholds
]
# ... turned into per-bucket counts: each bucket holds the users that fall
# below its threshold but were not already counted in a lower bucket.
count_per_threshold = [
    current - previous
    for previous, current in zip([0] + cumulative_counts[:-1], cumulative_counts)
]

# Share of the whole user base in each bucket, rounded to a whole percent.
percent_per_threshold = [
    round((count / total_users) * 100) if total_users else 0
    for count in count_per_threshold
]

# Summary table: one row per bucket.
df = pd.DataFrame({
    "fewer than X": rating_thresholds,
    "count": count_per_threshold,
    "percent": percent_per_threshold,
})

# Display the table as the cell's output.
df


Unnamed: 0,fewer than X,count,percent
0,5,17714,33
1,10,11305,21
2,15,5859,11
3,20,3907,7
4,25,2759,5
5,30,2082,4
6,35,1671,3
7,40,1305,2
8,45,1020,2
9,50,875,2


In [19]:
# Users with fewer than this many ratings are dropped.
filter_out = 15

# IDs of the users that rated fewer than `filter_out` books.
sparse_user_ids = user_rating_counts[user_rating_counts < filter_out].index

# Take an explicit copy so that the re-numbering below writes to an independent
# frame instead of a slice of `ratings` (this is what triggers pandas'
# SettingWithCopyWarning).
filtered_ratings = ratings[~ratings['user_id'].isin(sparse_user_ids)].copy()

# Replace the original user IDs with consecutive 0-based group numbers.
filtered_ratings['user_id'] = filtered_ratings.groupby('user_id').ngroup()

# Count the number of ratings per remaining user.
rating_counts = filtered_ratings.groupby('user_id').size().reset_index(name='rating_count')

# Users table: one row per remaining user; `new_data` is False for every row.
# `.assign` returns a new frame, so `rating_counts` itself is left unchanged.
users = rating_counts.assign(new_data=False)

# From here on `ratings` refers to the filtered, re-numbered table.
# NOTE: `user_rating_counts` is keyed by the ORIGINAL user IDs, so this cell
# must be run on freshly loaded data; re-running it after this rebinding would
# compare old IDs against the new 0-based ones.
ratings = filtered_ratings
users.head()

Unnamed: 0,user_id,rating_count,new_data
0,0,76,False
1,1,16,False
2,2,24,False
3,3,19,False
4,4,42,False


In [20]:
# Credentials for the Google Custom Search API are read from files in the
# parent directory so that they are not stored in the notebook itself.
def _read_secret(path):
    """Return the decoded contents of the file at ``path`` without surrounding whitespace.

    Stripping matters because the value is placed in a request URL: a trailing
    newline left by a text editor would otherwise become part of the key.
    """
    with open(path, 'rb') as secret_file:
        return secret_file.read().decode().strip()


api_key = _read_secret('../api_key')
search_engine_id = _read_secret('../search_engine_id')


In [21]:
# Endpoint of the Google Custom Search JSON API.
SEARCH_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
# Only rows whose current image URL starts with this prefix are looked up
# (presumably these are placeholder covers -- confirm against the data).
GR_ASSETS_PREFIX = 'https://s.gr-assets.com/'
# Seconds to wait for the API before giving up on a request.
REQUEST_TIMEOUT = 30


def _build_search_query(title):
    """Turn a book title into the search phrase sent to the API.

    Everything except word characters, whitespace and hyphens is removed, each
    whitespace character becomes a plain space, the text is lower-cased and the
    words "book cover amazon" are appended.
    """
    cleaned = re.sub(r'[^\w\s-]', '', title)
    # `requests` encodes spaces as '+' when it builds the query string, which is
    # what the search expects between words.
    cleaned = re.sub(r'\s', ' ', cleaned).lower()
    return f"{cleaned} book cover amazon"


def _pick_cover_and_link(items):
    """Return ``(image_url, link)`` chosen from the search result ``items``.

    Items are scanned in order and only those with a non-empty
    ``pagemap["scraped"]`` list are considered. The ``image_link`` of each such
    item, when present, replaces any earlier one; scanning stops at the first
    such item that also has a ``link``. Either value is ``None`` if not found.
    """
    image_url = None
    amazon_link = None
    for item in items:
        scraped = item.get("pagemap", {}).get("scraped", [])
        if not scraped:
            continue
        image_link = scraped[0].get("image_link")
        if image_link:
            image_url = image_link
        link = item.get("link")
        if link:
            amazon_link = link
            break
    return image_url, amazon_link


# Create the link column up front so every row has it, even if no row is searched.
if 'amazon_link' not in books.columns:
    books['amazon_link'] = None

# Titles whose lookup failed; their rows are left untouched so a re-run can retry them.
failed_titles = []

for index, row in books.iterrows():
    image_url = row['image_url']

    # Rows handled by an earlier (possibly interrupted) run may hold None/NaN
    # here; skipping non-strings keeps the cell safe to re-run.
    if not isinstance(image_url, str) or not image_url.startswith(GR_ASSETS_PREFIX):
        continue

    book_title = row['title']
    # Pass the query as `params` so that `requests` URL-encodes every value.
    response = requests.get(
        SEARCH_ENDPOINT,
        params={
            "key": api_key,
            "cx": search_engine_id,
            "q": _build_search_query(book_title),
        },
        timeout=REQUEST_TIMEOUT,
    )

    # An error response (e.g. exhausted quota, bad credentials) carries no
    # results; do not overwrite the existing URL with None in that case.
    if not response.ok:
        failed_titles.append(book_title)
        continue

    items = response.json().get("items", [])  # absent when the search found nothing
    image_url, amazon_link = _pick_cover_and_link(items)

    books.at[index, 'image_url'] = image_url
    books.at[index, 'amazon_link'] = amazon_link

if failed_titles:
    print(f"{len(failed_titles)} lookups failed and were left unchanged; first: {failed_titles[0]!r}")


KeyboardInterrupt: 

In [None]:
# Write the three working tables to CSV files in the current directory,
# leaving out the DataFrame index.
for filename, frame in (
    ('ratings.csv', ratings),
    ('books.csv', books),
    ('users.csv', users),
):
    frame.to_csv(filename, index=False)
