# **A20000882_Maryam Elgohary**





# **Web Scraping dataset**




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen
import time

# Goodreads book pages to scrape, keyed by the short label that later becomes
# the column name of that book in the ratings table.
book_urls = {
    "jane eyre": "https://www.goodreads.com/book/show/10210.Jane_Eyre",
    "1984": "https://www.goodreads.com/book/show/61439040-1984",
    "wuthering": "https://www.goodreads.com/book/show/6185.Wuthering_Heights",
    "picture of dorian ": "https://www.goodreads.com/book/show/5297.The_Picture_of_Dorian_Gray",
    "catcher in rye": "https://www.goodreads.com/book/show/5107.The_Catcher_in_the_Rye",
    "sense and sensibility": "https://www.goodreads.com/book/show/14935.Sense_and_Sensibility",
    "great expectations": "https://www.goodreads.com/book/show/2623.Great_Expectations",
    "tale of cities": "https://www.goodreads.com/book/show/1953.A_Tale_of_Two_Cities",
    "brave new world": "https://www.goodreads.com/book/show/5129.Brave_New_World",
    "macbeth": "https://www.goodreads.com/book/show/43913694-macbeth",
    "The Great Gatsby": "https://www.goodreads.com/book/show/4671.The_Great_Gatsby",
}

# book label -> {reviewer name -> rating (float) or None}
book_reviews = {}

for book_name, book_url in book_urls.items():
    # A timeout keeps one stalled request from hanging the whole cell.
    response = requests.get(book_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    users = soup.find_all("div", class_="ReviewerProfile__name")
    ratings = soup.find_all("span", class_="RatingStars")

    reviews = {}
    # Pair reviewers with rating widgets by position; zip stops at the shorter list.
    # NOTE(review): this assumes the n-th RatingStars span belongs to the n-th
    # reviewer. Confirm the page has no extra RatingStars span (e.g. the book's
    # overall score) ahead of the reviews.
    for user, rating in zip(users, ratings):
        user_name = user.text.strip()

        # The original cell used `rating_text` without ever assigning it (NameError).
        # Presumably the star widget carries its value in an aria-label such as
        # "Rating 4 out of 5", which matches the parsing below.
        # TODO confirm against the live markup.
        rating_text = rating.get("aria-label", "") or ""

        # The second whitespace-separated token is the numeric score.
        rating_value = float(rating_text.split()[1]) if "out of" in rating_text else None

        # A reviewer appearing twice on a page keeps only the last rating seen.
        reviews[user_name] = rating_value

    book_reviews[book_name] = reviews

    # Pause between pages to avoid hammering the site.
    time.sleep(2)


# One single-column frame per book: index = reviewer name, column = book label.
book_dfs = [
    pd.DataFrame.from_dict(book_reviews[book], orient="index", columns=[book])
    for book in book_urls
]

# Align all books on reviewer name in a single outer concat instead of growing
# the frame one column at a time inside a loop.
common_df = pd.concat(book_dfs, axis=1)

# Keep only reviewers who rated at least 5 of the books; the helper column is
# dropped again so the result contains ratings only.
common_df['Review Count'] = common_df.notnull().sum(axis=1)
common_users_df = common_df[common_df['Review Count'] >= 5].drop(columns=['Review Count'])


print(common_users_df)


                         jane eyre  1984  wuthering  picture of dorian   \
Sean Barrs                     0.0  5.00       1.00                4.13   
emma                           0.0  5.00       0.00                5.00   
Matthew                        5.0  5.00        NaN                 NaN   
Lisa of Troy                   5.0   NaN       1.00                4.00   
Emily May                      5.0  4.00       3.89                0.00   
Nayra.Hassan                   1.0   NaN        NaN                5.00   
Anne                           1.0   NaN        NaN                5.00   
Ahmad Sharabiani               3.0  4.00       1.00                5.00   
Henry Avila                    2.0   NaN       4.00                5.00   
هدى يحيى                       NaN  4.19        NaN                5.00   
Stephen                        NaN  5.00        NaN                0.00   
Luís                           NaN  4.00        NaN                 NaN   
Mario the lone bookwolf  

# **Handling missing data**

In [None]:
import pandas as pd
import numpy as np

# Load the ratings workbook from its hard-coded location.
file_path = '/content/dl33lataset.xlsx'
df = pd.read_excel(file_path)

# Everything after the first column is treated as rating data
# (presumably column 0 holds the reviewer name; verify against the file).
rating_block = df.iloc[:, 1:]

# Number of rating cells, and each row's mean over its non-missing ratings.
total_elements = rating_block.size
row_means = rating_block.mean(axis=1, skipna=True)

def fill_row_with_mean(row):
    """Return ``row`` with every missing entry replaced by the row's own mean.

    Parameters
    ----------
    row : pandas.Series
        One row of ratings, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    pandas.Series
        ``row`` unchanged when it has no missing values; otherwise a copy whose
        NaN entries are set to the mean of the non-missing entries. A row that
        is entirely NaN has no mean and is returned still all-NaN.
    """
    if not pd.isna(row).any():
        return row
    # The original computed the mean but never applied it, so rows came back
    # unfilled. The mean is taken from the row itself so the function does not
    # depend on module-level state.
    mean_value = row.mean(skipna=True)
    return row.fillna(mean_value)

# Apply the row-wise fill to every rating column (all columns after the first).
df.iloc[:, 1:] = df.iloc[:, 1:].apply(fill_row_with_mean, axis=1)

# Snapshot the frame as it stands after the fill step, before any cell is
# blanked, so the "Filled" printout below shows the pre-blanking table rather
# than the NaN-punched one a second time.
df_after_fill = df.copy()

# Blank out 10% of the rating cells.
num_missing = int(total_elements * 0.1)

# Fixed seed so Restart & Run All removes the same cells every time.
rng = np.random.default_rng(42)

# Candidate (row, column) positions; column 0 is excluded so it is never blanked.
all_indices = [(i, j) for i in range(df.shape[0]) for j in range(1, df.shape[1])]
missing_indices = rng.choice(len(all_indices), size=num_missing, replace=False)


for index in missing_indices:
    row, col = all_indices[index]
    df.iat[row, col] = np.nan


print("DataFrame with Missing Values (NaN):")
print(df)

# NOTE(review): this writes back to the same path the data was loaded from, so
# each re-run starts from the previous run's output. Confirm that is intended.
output_file_path = '/content/dl33lataset.xlsx'
df.to_excel(output_file_path, index=False)

print("\nDataFrame with Filled Missing Values:")
print(df_after_fill)


DataFrame with Missing Values (NaN):
                 Unnamed: 0  jane eyre      1984  wuthering  \
0                Sean Barrs   0.000000  5.000000       1.00   
1                      emma   0.000000  5.000000       3.20   
2                   Matthew   5.000000  5.000000       3.00   
3              Lisa of Troy   5.000000  2.000000       1.00   
4                 Emily May   5.000000  4.000000       3.89   
5              Nayra.Hassan   1.000000  3.333333       3.00   
6                      Anne   3.000000  2.000000       3.00   
7          Ahmad Sharabiani   3.000000  4.000000       1.00   
8               Henry Avila   5.000000  2.000000       4.00   
9                  هدى يحيى   3.465556  4.190000       3.00   
10                      NaN   2.000000  3.010000       3.00   
11                     Luís   2.000000  4.000000       3.00   
12  Mario the lone bookwolf   3.700000  4.000000        NaN   

    picture of dorian   catcher in rye  sense and sensibility  \
0              

In [None]:
# Reload the user-by-book table and flip it so rows and columns swap
# (every column of the loaded sheet, including its first one, becomes a row).
common_users_df = pd.read_excel('/content/dl33lataset.xlsx')
item_based = common_users_df.T

# Written relative to the working directory. Later cells read
# '/content/itemBased.xlsx', so this presumably runs with /content as cwd.
item_based.to_excel('itemBased.xlsx', index=True)

# **Computing Average Ratings**

In [None]:
import pandas as pd
import numpy as np


# Load the user-by-book ratings and discard the leading "Unnamed: 0" column,
# leaving only the rating columns.
user_based_df = pd.read_excel('/content/ddataset.xlsx').drop(columns=["Unnamed: 0"])

# Mean rating per row (pandas skips missing ratings by default).
row_averages = user_based_df.mean(axis=1)

# Rows carry the default 0-based index after read_excel, so label + 1 gives
# the 1-based user number shown in the report.
for index, avg in row_averages.items():
    print(f"Average rating for User {index + 1}: {avg}")

Average rating for User 1: 2.34
Average rating for User 2: 2.909090909090909
Average rating for User 3: 3.8
Average rating for User 4: 2.4444444444444446
Average rating for User 5: 3.1709090909090913
Average rating for User 6: 3.2869999999999995
Average rating for User 7: 3.8888888888888884
Average rating for User 8: 3.489
Average rating for User 9: 4.1000000000000005
Average rating for User 10: 3.380909090909091
Average rating for User 11: 3.2627272727272727
Average rating for User 12: 3.666666666666667
Average rating for User 13: 3.4


# **Cosine Similarity User-based**

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

user_based_df = pd.read_excel('/content/ddataset.xlsx')

user_based_df = user_based_df.drop(columns=["Unnamed: 0"])

# sklearn's cosine_similarity does not accept NaN, so missing ratings are
# zero-filled for the similarity computation only; `user_based_df` keeps its NaNs.
filled_df = user_based_df.fillna(0)

# Positional index of the user whose missing ratings are being predicted.
target_position = 2
target_label = filled_df.index[target_position]

target_row = filled_df.iloc[target_position].values.reshape(1, -1)

# Similarity of the target user to every user (including itself).
similarities = cosine_similarity(target_row, filled_df)

similarity_df = pd.DataFrame(similarities.flatten(), columns=["Cosine Similarity"])
similarity_df['User'] = filled_df.index

# Drop the target's self-similarity so it cannot be its own neighbour.
similarity_df = similarity_df[similarity_df['User'] != target_label]

# Two nearest neighbours.
top_similar_users = similarity_df.nlargest(2, "Cosine Similarity")

# Books the target user has not rated.
null_columns = user_based_df.columns[user_based_df.iloc[target_position].isnull()]


ratings_from_neighbors = user_based_df.loc[top_similar_users['User'].values, null_columns]


# Similarity-weighted average of the neighbours' ratings for each missing book.
weights = top_similar_users['Cosine Similarity'].values
weighted_ratings = ratings_from_neighbors.T.dot(weights) / weights.sum()


# Write the predictions into the target user's row. The original wrote to row
# label 3, although neighbours and missing columns were computed for row 2.
user_based_df.loc[target_label, null_columns] = weighted_ratings


print(user_based_df)


    jane eyre      1984  wuthering  picture of dorian   catcher in rye  \
0    0.000000  5.000000      1.000            4.130000        3.000000   
1    0.000000  5.000000      0.000            5.000000        2.000000   
2    5.000000  5.000000      3.000            3.000000        3.000000   
3    2.444444  2.000000      1.000            2.444444        1.000000   
4    5.000000  4.000000      3.890            0.000000        3.000000   
5    1.000000  2.000000      3.287            5.000000        3.000000   
6    3.000000  3.888889      3.000            5.000000        3.000000   
7    3.000000  4.000000      3.489            5.000000        1.000000   
8    5.000000  2.000000      4.000            5.000000        5.000000   
9    2.000000  4.190000      3.000            5.000000        3.000000   
10   2.000000  5.000000      3.000            0.000000        3.800000   
11   2.000000  4.000000      3.000            3.000000        3.666667   
12   2.000000  4.000000      3.000    

# **Cosine Similarity Item-based**





In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


item_based_df = pd.read_excel('/content/itemBased.xlsx')

# Drop the leading index column if the sheet has one.
item_based_df = item_based_df.drop(columns=["Unnamed: 0"], errors='ignore')

# Coerce every column to numeric; cells that cannot be parsed become NaN.
item_based_df = item_based_df.apply(pd.to_numeric, errors='coerce')
item_based_df = item_based_df.select_dtypes(include=[np.number])

if item_based_df.shape[1] == 0:
    raise ValueError("No numeric columns available for similarity calculation. Please ensure your dataset has numeric rating columns.")

# Zero-fill only for the similarity computation; `item_based_df` keeps its NaNs.
# fillna cannot change the number of columns, so the original's second
# "no columns" check after this line could never fire and has been removed.
filled_df = item_based_df.fillna(0)

# Position of the row whose missing entries are being predicted, and its label.
row_index = 2
row_label = filled_df.index[row_index]
row_ratings = filled_df.iloc[row_index].values.reshape(1, -1)

# Similarity of the target row to every row (including itself).
similarities = cosine_similarity(row_ratings, filled_df)

similarity_df = pd.DataFrame(similarities.flatten(), columns=["Cosine Similarity"])
similarity_df['item'] = filled_df.index

# Exclude the target row itself from the neighbour candidates.
similarity_df = similarity_df[similarity_df['item'] != row_label]

# Two nearest neighbours.
top_similar_item = similarity_df.nlargest(2, "Cosine Similarity")

# Columns where the target row has no value.
null_columns = item_based_df.columns[item_based_df.iloc[row_index].isnull()]
ratings_from_neighbors = item_based_df.loc[top_similar_item['item'].values, null_columns]

# Similarity-weighted average of the neighbours' values for each missing column.
weights = top_similar_item['Cosine Similarity'].values
weighted_ratings = ratings_from_neighbors.T.dot(weights) / weights.sum()

item_based_df.loc[row_label, null_columns] = weighted_ratings


print("Updated DataFrame:\n", item_based_df)


Updated DataFrame:
        0         1    2         3     4      5         6      7    8     9  \
0   2.34  2.909091  NaN       NaN   NaN    NaN       NaN    NaN  NaN   NaN   
1   0.00  0.000000  5.0  2.444444  5.00  1.000  3.000000  3.000  5.0  2.00   
2   5.00  5.000000  5.0  2.000000  4.00  2.000  3.888889  4.000  2.0  4.19   
3   1.00  0.000000  3.0  1.000000  3.89  3.287  3.000000  3.489  4.0  3.00   
4   4.13  5.000000  3.0  2.444444  0.00  5.000  5.000000  5.000  5.0  5.00   
5   3.00  2.000000  3.0  1.000000  3.00  3.000  3.000000  1.000  5.0  3.00   
6   0.00  1.000000  1.0  3.000000  1.00  1.000  5.000000  2.000  5.0  1.00   
7   2.34  0.000000  5.0  2.000000  0.00  5.000  2.000000  4.000  2.0  2.00   
8   4.00  5.000000  5.0  5.000000  5.00  3.870  5.000000  5.000  5.0  5.00   
9   0.00  5.000000  3.8  3.000000  3.99  3.000  4.000000  2.000  3.0  4.00   
10  2.34  5.000000  5.0  5.000000  5.00  5.000  5.000000  3.890  4.1  5.00   
11  3.93  4.000000  3.0  0.000000  4.00  4.0

# **Pearson Correlation User-Based**

In [None]:
import pandas as pd
import numpy as np

# Load the user-by-book ratings, keeping only the rating columns.
user_based_df = pd.read_excel('/content/ddataset.xlsx').drop(columns=["Unnamed: 0"])

# The user (by position) whose unrated books will be predicted.
target_user_index = 2
target_user = user_based_df.iloc[target_user_index]

# Books this user has no rating for.
null_columns = user_based_df.columns[target_user.isna()]

# Per-user mean rating (missing values skipped), used for mean-centring.
user_means = user_based_df.mean(axis=1)


def predict_rating(user_index, item):
    """Predict one user's rating for ``item`` with user-based Pearson CF.

    Reads the module-level ``user_based_df`` (users x items) and ``user_means``
    (per-user mean rating).

    Parameters
    ----------
    user_index : int
        Row of the target user in ``user_based_df``. It is used both as a
        position (``iloc``) and as a label (``drop``), so the frame is assumed
        to carry the default 0..n-1 index.
    item : str
        Column name of the item to predict.

    Returns
    -------
    float
        ``mean(target) + sum(sim_v * (r_v,item - mean_v)) / sum(|sim_v|)`` over
        the (up to) two most-correlated users that rated ``item``; the target
        user's mean when no such neighbour exists or all similarities are zero.
    """
    target_user = user_based_df.iloc[user_index]
    target_user_mean = user_means[user_index]

    # Pearson correlation of the target with every user; drop the target itself
    # and users whose correlation is undefined.
    similarities = user_based_df.apply(lambda other: target_user.corr(other), axis=1)
    similar_users = similarities.drop(user_index).dropna()

    # Only users who actually rated the item can contribute.
    rated_users = user_based_df.index[user_based_df[item].notna()]
    neighbors = similar_users[similar_users.index.isin(rated_users)]
    top_neighbors = neighbors.nlargest(2)

    # Similarity-weighted sum of the neighbours' mean-centred ratings.
    numerator = sum(
        similarity * (user_based_df.loc[neighbor, item] - user_means[neighbor])
        for neighbor, similarity in top_neighbors.items()
    )
    # Normalise by the sum of absolute similarities. A signed sum lets
    # opposite-sign neighbours cancel to ~0 or flip the sign of the correction.
    denominator = top_neighbors.abs().sum()

    if denominator == 0:
        return target_user_mean

    return target_user_mean + (numerator / denominator)


# Compute every prediction from the observed ratings first, then write them
# back. Writing inside the prediction loop would let earlier predictions take
# part in the correlations used for later ones.
predicted_ratings = {
    missing_item: predict_rating(target_user_index, missing_item)
    for missing_item in null_columns
}

for item, predicted_rating in predicted_ratings.items():
    user_based_df.at[target_user_index, item] = predicted_rating


print(user_based_df)


    jane eyre      1984  wuthering  picture of dorian   catcher in rye  \
0    0.000000  5.000000      1.000            4.130000        3.000000   
1    0.000000  5.000000      0.000            5.000000        2.000000   
2    5.000000  5.000000      3.000            3.000000        3.000000   
3    2.444444  2.000000      1.000            2.444444        1.000000   
4    5.000000  4.000000      3.890            0.000000        3.000000   
5    1.000000  2.000000      3.287            5.000000        3.000000   
6    3.000000  3.888889      3.000            5.000000        3.000000   
7    3.000000  4.000000      3.489            5.000000        1.000000   
8    5.000000  2.000000      4.000            5.000000        5.000000   
9    2.000000  4.190000      3.000            5.000000        3.000000   
10   2.000000  5.000000      3.000            0.000000        3.800000   
11   2.000000  4.000000      3.000            3.000000        3.666667   
12   2.000000  4.000000      3.000    

# **Pearson Correlation User-Based: Experimenting with the MAE**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Load the ratings used for the hold-out experiment, keeping only the rating
# columns (the leading "Unnamed: 0" column is discarded).
user_based_df = pd.read_excel('/content/sdataset.xlsx').drop(columns=["Unnamed: 0"])


def predict_rating(user_index, item, df):
    """Predict one user's rating for ``item`` with user-based Pearson CF.

    Parameters
    ----------
    user_index : int
        Row of the target user in ``df``. It is used both as a position
        (``iloc``) and as a label (``drop``), so ``df`` is assumed to carry the
        default 0..n-1 index.
    item : str
        Column name of the item to predict.
    df : pandas.DataFrame
        Users x items rating matrix; missing ratings are NaN.

    Returns
    -------
    float
        ``mean(target) + sum(sim_v * (r_v,item - mean_v)) / sum(|sim_v|)`` over
        the (up to) two most-correlated users that rated ``item``; the target
        user's mean when no such neighbour exists or all similarities are zero.
    """
    target_user = df.iloc[user_index]
    target_user_mean = target_user.mean()

    # Per-user means are the same for every neighbour, so compute them once
    # instead of once per term of the sum.
    user_means = df.mean(axis=1)

    # Pearson correlation of the target with every user; drop the target itself
    # and users whose correlation is undefined.
    similarities = df.apply(lambda other: target_user.corr(other), axis=1)
    similar_users = similarities.drop(user_index).dropna()

    # Only users who actually rated the item can contribute.
    rated_users = df.index[df[item].notna()]
    neighbors = similar_users[similar_users.index.isin(rated_users)]

    top_neighbors = neighbors.nlargest(2)

    # Similarity-weighted sum of the neighbours' mean-centred ratings.
    numerator = sum(
        similarity * (df.loc[neighbor, item] - user_means[neighbor])
        for neighbor, similarity in top_neighbors.items()
    )
    # Normalise by the sum of absolute similarities. A signed sum lets
    # opposite-sign neighbours cancel to ~0 or flip the sign of the correction.
    denominator = top_neighbors.abs().sum()

    if denominator == 0:
        return target_user_mean

    return target_user_mean + (numerator / denominator)

# Hold-out check: hide one known rating, predict it, and compare.
user_index, item = 5, 'jane eyre'
known_rating = user_based_df.loc[user_index, item]

# Blank the cell so the predictor cannot see the true value.
user_based_df.loc[user_index, item] = np.nan
predicted_rating = predict_rating(user_index, item, user_based_df)

# With a single held-out cell this is simply the absolute error.
mae = abs(predicted_rating - known_rating)

# Report the outcome
print(f'Known Rating: {known_rating}')
print(f'Predicted Rating: {predicted_rating}')
print(f'Mean Absolute Error (MAE): {mae}')

# Put the true rating back so the frame is left as it was loaded
user_based_df.loc[user_index, item] = known_rating


Known Rating: 1
Predicted Rating: 1.4184970876609877
Mean Absolute Error (MAE): 0.4184970876609877


# **Item-based Pearson Correlation**





In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

# Load the dataset
item_based_df = pd.read_excel('/content/itemBased.xlsx')  # Adjust the file path as needed

# Drop the first column if it's an unnecessary index column
item_based_df = item_based_df.drop(columns=["Unnamed: 0"], errors='ignore')

# Coerce every column to numeric; cells that cannot be parsed become NaN
for col in item_based_df.columns:
    item_based_df[col] = pd.to_numeric(item_based_df[col], errors='coerce')

item_based_df = item_based_df.select_dtypes(include=[np.number])

# Zero-fill only for the correlation computation; `item_based_df` keeps its NaNs.
filled_df = item_based_df.fillna(0)

# Position of the row whose missing entries are being predicted, and its label.
row_index = 2
row_label = filled_df.index[row_index]
row_ratings = filled_df.iloc[row_index].values

correlation_list = []

# Pearson correlation is undefined for a constant vector. Whether the target row
# is constant does not depend on the candidate, so it is checked once up front.
target_is_constant = np.all(row_ratings == row_ratings[0])

if not target_is_constant:
    for index in filled_df.index:
        if index == row_label:
            continue

        other_ratings = filled_df.loc[index].values

        # Skip constant candidates for the same reason.
        if np.all(other_ratings == other_ratings[0]):
            continue

        try:
            correlation, _ = pearsonr(row_ratings, other_ratings)
        except ValueError:
            continue  # Skip if there's a calculation error

        correlation_list.append({"Pearson Correlation": correlation, "User": index})

correlation_df = pd.DataFrame(correlation_list)

if correlation_df.empty:
    # Without this guard, nlargest raises KeyError because the empty frame has
    # no "Pearson Correlation" column.
    print("No usable neighbours found; missing values were left unchanged.")
else:
    # Two most-correlated rows.
    top_similar_users = correlation_df.nlargest(2, "Pearson Correlation")

    # Columns where the selected row has null values
    null_columns = item_based_df.columns[item_based_df.iloc[row_index].isnull()]
    ratings_from_neighbors = item_based_df.loc[top_similar_users['User'].values, null_columns]

    # Correlation-weighted average of the neighbours' values.
    # NOTE(review): Pearson weights can be negative, so `weights.sum()` may be
    # close to zero. Confirm whether negative neighbours should be excluded.
    weights = top_similar_users['Pearson Correlation'].values
    weighted_ratings = ratings_from_neighbors.T.dot(weights) / weights.sum()

    item_based_df.loc[row_label, null_columns] = weighted_ratings


print("Updated DataFrame:\n", item_based_df)


Updated DataFrame:
        0         1    2         3     4      5         6      7    8     9  \
0   2.34  2.909091  NaN       NaN   NaN    NaN       NaN    NaN  NaN   NaN   
1   0.00  0.000000  5.0  2.444444  5.00  1.000  3.000000  3.000  5.0  2.00   
2   5.00  5.000000  5.0  2.000000  4.00  2.000  3.888889  4.000  2.0  4.19   
3   1.00  0.000000  3.0  1.000000  3.89  3.287  3.000000  3.489  4.0  3.00   
4   4.13  5.000000  3.0  2.444444  0.00  5.000  5.000000  5.000  5.0  5.00   
5   3.00  2.000000  3.0  1.000000  3.00  3.000  3.000000  1.000  5.0  3.00   
6   0.00  1.000000  1.0  3.000000  1.00  1.000  5.000000  2.000  5.0  1.00   
7   2.34  0.000000  5.0  2.000000  0.00  5.000  2.000000  4.000  2.0  2.00   
8   4.00  5.000000  5.0  5.000000  5.00  3.870  5.000000  5.000  5.0  5.00   
9   0.00  5.000000  3.8  3.000000  3.99  3.000  4.000000  2.000  3.0  4.00   
10  2.34  5.000000  5.0  5.000000  5.00  5.000  5.000000  3.890  4.1  5.00   
11  3.93  4.000000  3.0  0.000000  4.00  4.0