In [None]:
import pandas as pd

def clean_and_save_file(file_path, output_path):
    
    columns_to_drop = [
        'ipc-rating-star--maxRating', 'ipc-title-link-wrapper href', 'ipc-title__text', 
        'ipc-html-content-inner-div', 'ipc-voting__label__text', 'ipc-voting__dot-separator', 
        'ipc-voting__label__count', 'ipc-voting__label__count 2', 'ipc-link href', 
        'ipc-inline-list__item', 'ipc-btn__text'
    ]
    
    df = pd.read_csv(file_path)
    
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')
    
    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

files = [
    "D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Black panther.csv",
    "D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/AntMan.csv",
    "D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Deadpool&Wolverine.csv",
    "D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/doctorstrange.csv",
    "D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/spiderman noway home.csv",
    "D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/thor love&thunder.csv"
]
output_paths = [
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_spiderman_noway_home.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_doctorstrange.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_thor_love_thunder.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_black_panther.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_antman.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_deadpool_wolverine.csv'
]

for file_path, output_path in zip(files, output_paths):
    clean_and_save_file(file_path, output_path)


Cleaned data saved to D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_spiderman_noway_home.csv
Cleaned data saved to D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_doctorstrange.csv
Cleaned data saved to D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_thor_love_thunder.csv
Cleaned data saved to D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_black_panther.csv
Cleaned data saved to D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_antman.csv
Cleaned data saved to D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_deadpool_wolverine.csv


In [None]:
import pandas as pd
from fuzzywuzzy import process, fuzz

def find_similar_name_groups(file_paths, threshold=70):
    all_names = []
    
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        if 'names' in df.columns:
            all_names.extend(df['names'].dropna().unique())
        elif 'ipc-link' in df.columns:
            all_names.extend(df['ipc-link'].dropna().unique())
    
    all_names = sorted(set(all_names))
    similar_name_groups = []
    matched = set()
    
    for name in all_names:
        if name not in matched:
            group = [match[0] for match in process.extract(name, all_names, scorer=fuzz.token_sort_ratio) if match[1] >= threshold]
            matched.update(group)
            similar_name_groups.append(group)
    
    return similar_name_groups

def build_user_ratings_matrix(file_paths, similar_name_groups):
    user_ratings = pd.DataFrame(index=["/".join(group) for group in similar_name_groups])
    
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        movie_title = file_path.split('/')[-1].replace('.csv', '').replace('_', ' ') 
        
        if 'names' in df.columns:
            df_filtered = df[['names', 'ipc-rating-star--rating']].rename(columns={'names': 'Name', 'ipc-rating-star--rating': 'Rating'})
        elif 'ipc-link' in df.columns:
            df_filtered = df[['ipc-link', 'ipc-rating-star--rating']].rename(columns={'ipc-link': 'Name', 'ipc-rating-star--rating': 'Rating'})
        
        ratings = {}
        for group in similar_name_groups:
            group_ratings = df_filtered[df_filtered['Name'].isin(group)]['Rating']
            if not group_ratings.empty:
                ratings["/".join(group)] = group_ratings.min()
        
        user_ratings[movie_title] = pd.Series(ratings)
    
    return user_ratings

file_paths = [
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_spiderman_noway_home.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_doctorstrange.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_thor_love_thunder.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_black_panther.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_antman.csv',
    'D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/cleaned_deadpool_wolverine.csv'
]

similar_name_groups = find_similar_name_groups(file_paths, threshold=70)

user_ratings_matrix = build_user_ratings_matrix(file_paths, similar_name_groups)

print(user_ratings_matrix)
user_ratings_matrix.to_csv('D:/Galala UNi/4th Year/AIE425Intelligent recommender systems/Assignments/Assignment1/Cleaned DataSets/user_ratings_matrix.csv', index=True)


                    cleaned spiderman noway home  cleaned doctorstrange  \
007Waffles                                   6.0                    NaN   
11ovz11                                     10.0                    NaN   
18Buddha                                     NaN                    9.0   
3xHCCH                                       9.0                    NaN   
80sHorror                                    NaN                    NaN   
...                                          ...                    ...   
zkzuber                                      NaN                    NaN   
zorba-36271                                  NaN                    NaN   
ztpbrmhw                                     NaN                    NaN   
zwjonas                                      NaN                    NaN   
zzzxxxcccvvv-43202                           1.0                    1.0   

                    cleaned thor love thunder  cleaned black panther  \
007Waffles                 

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr

data = {
    'spiderman no way home': [6, 7, np.nan, 7, 9],
    'doctor strange': [10, 6, 4, 5, np.nan],
    'thor love thunder': [9, np.nan, 8, 8, 5],
    'black panther': [7, 6, 6, 7, 10],
    'antman': [8, 9, 8, 10, 7],
    'deadpool wolverine': [7, 5, np.nan, 6, 5]
}
user_matrix = pd.DataFrame(data, index=['BrnzReviews', 'AvionPrince16', 'tkdlifemagazine', 'Entertainmentsp', 'Carycomic'])


def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

def pearson_similarity(vec1, vec2):
    mask = ~np.isnan(vec1) & ~np.isnan(vec2)
    if mask.sum() == 0:
        return 0  
    return pearsonr(vec1[mask], vec2[mask])[0]

def user_based_cf(matrix, target_user, similarity_measure='cosine'):
    user_similarities = pd.DataFrame(index=matrix.index, columns=matrix.index)
    
    for user1 in matrix.index:
        for user2 in matrix.index:
            if user1 != user2:
                if similarity_measure == 'cosine':
                    user_similarities.loc[user1, user2] = cosine_similarity(
                        matrix.loc[user1].fillna(0), matrix.loc[user2].fillna(0))
                elif similarity_measure == 'pearson':
                    user_similarities.loc[user1, user2] = pearson_similarity(
                        matrix.loc[user1], matrix.loc[user2])
    
    predictions = {}
    for item in matrix.columns[matrix.loc[target_user].isna()]:
        sim_sum = 0
        weighted_sum = 0
        for other_user in matrix.index:
            if other_user != target_user and not np.isnan(matrix.loc[other_user, item]):
                similarity = user_similarities.loc[target_user, other_user]
                rating = matrix.loc[other_user, item]
                sim_sum += similarity
                weighted_sum += similarity * rating
        predictions[item] = weighted_sum / sim_sum if sim_sum != 0 else np.nan
    
    return predictions

def adjusted_cosine_similarity(matrix, item1, item2):
    mean_centered_matrix = matrix.sub(matrix.mean(axis=1), axis=0)
    return cosine_similarity(mean_centered_matrix[item1].fillna(0), mean_centered_matrix[item2].fillna(0))

def item_based_cf(matrix, target_user):
    item_similarities = pd.DataFrame(index=matrix.columns, columns=matrix.columns)
    
    for item1 in matrix.columns:
        for item2 in matrix.columns:
            if item1 != item2:
                item_similarities.loc[item1, item2] = adjusted_cosine_similarity(matrix, item1, item2)
    
    predictions = {}
    for item in matrix.columns[matrix.loc[target_user].isna()]:
        sim_sum = 0
        weighted_sum = 0
        for other_item in matrix.columns:
            if other_item != item and not np.isnan(matrix.loc[target_user, other_item]):
                similarity = item_similarities.loc[item, other_item]
                rating = matrix.loc[target_user, other_item]
                sim_sum += similarity
                weighted_sum += similarity * rating
        predictions[item] = weighted_sum / sim_sum if sim_sum != 0 else np.nan
    
    return predictions

target_user = 'tkdlifemagazine'
user_based_predictions_cosine = user_based_cf(user_matrix, target_user, similarity_measure='cosine')
user_based_predictions_pearson = user_based_cf(user_matrix, target_user, similarity_measure='pearson')
item_based_predictions_adjusted_cosine = item_based_cf(user_matrix, target_user)

print("User-Based CF Predictions (Cosine) for tkdlifemagazine:", user_based_predictions_cosine)
print("User-Based CF Predictions (Pearson) for tkdlifemagazine:", user_based_predictions_pearson)
print("Item-Based CF Predictions (Adjusted Cosine) for tkdlifemagazine:", item_based_predictions_adjusted_cosine)


User-Based CF Predictions (Cosine) for tkdlifemagazine: {'spiderman no way home': 7.18279160508015, 'deadpool wolverine': 5.830992894285834}
User-Based CF Predictions (Pearson) for tkdlifemagazine: {'spiderman no way home': 3.9144297644201482, 'deadpool wolverine': 5.23899996112657}
Item-Based CF Predictions (Adjusted Cosine) for tkdlifemagazine: {'spiderman no way home': 8.356072607196047, 'deadpool wolverine': 7.334821301788396}
