<a href="https://colab.research.google.com/github/Kish-ie/recommender-system/blob/main/recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import joblib  # For saving the model


# Fix CSV file
def fix_csv(file_path, output_path, encoding='utf-8'):
    """Copy a CSV file line by line, closing lines with an odd quote count.

    Any line containing an odd number of double-quote characters gets one
    extra ``"`` appended just before its line ending. This is a per-line
    heuristic: a quoted field that legitimately spans several lines will
    also be "closed" on each of its lines that has an odd quote count.

    Args:
        file_path: Path of the CSV file to read.
        output_path: Path the repaired copy is written to (overwritten).
        encoding: Text encoding used for both files. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.
    """
    with open(file_path, 'r', encoding=encoding) as infile, \
            open(output_path, 'w', encoding=encoding) as outfile:
        for line in infile:
            if line.count('"') % 2 != 0:  # Quotes are unbalanced on this line
                if line.endswith('\n'):
                    # Insert the closing quote before the line terminator.
                    line = line[:-1] + '"\n'
                else:
                    # Last line of a file without a trailing newline: there
                    # is no '\n' to anchor on, so append the quote directly.
                    line += '"'
            outfile.write(line)


# Load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame and print a preview.

    Rows that pandas cannot parse are skipped (``on_bad_lines='skip'``)
    rather than raising. The result of ``head()`` is printed as a quick
    sanity check before the full frame is returned.
    """
    reviews = pd.read_csv(file_path, on_bad_lines='skip')
    preview = reviews.head()
    print(preview)
    return reviews


# Prepare data for Surprise library
def prepare_data(df):
    """Wrap the review ratings in a Surprise ``Dataset``.

    Only the ``UserId``, ``ProductId`` and ``Score`` columns of ``df`` are
    handed to Surprise, in that order, with a rating scale of 1 to 5.
    """
    rating_scale = (1, 5)  # Adjust if the scores use a different range
    reader = Reader(rating_scale=rating_scale)
    rating_columns = ['UserId', 'ProductId', 'Score']
    ratings = df[rating_columns]
    return Dataset.load_from_df(ratings, reader)


# Train and save the model
def train_and_save_model(data, model_path='model.pkl'):
    """Fit an SVD model on ``data`` and save it to ``model_path``.

    Args:
        data: Surprise dataset, e.g. the value returned by
            ``Dataset.load_from_df``.
        model_path: File the fitted model is written to with ``joblib.dump``.

    Returns:
        The fitted ``SVD`` model.
    """
    # No evaluation happens in this function, so train on every available
    # rating instead of splitting off a test set that is never used.
    trainset = data.build_full_trainset()

    model = SVD()  # Singular Value Decomposition (SVD)
    model.fit(trainset)

    # Save the model to disk
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")
    return model


# Recommend products for a user
def recommend_products(model, user_id, df, num_recommendations=5):
    """Return the highest-rated predictions for products the user has not rated.

    Args:
        model: Object exposing ``predict(user_id, product_id)`` whose result
            has an ``est`` attribute holding the estimated rating.
        user_id: Value compared against the ``UserId`` column of ``df``.
        df: DataFrame with at least ``UserId`` and ``ProductId`` columns.
        num_recommendations: Upper bound on the number of results.

    Returns:
        A list of ``(product_id, estimated_rating)`` tuples sorted by
        estimate, highest first. Ties keep the order in which the products
        first appear in ``df``.
    """
    # Products already rated by the user. A set keeps the membership test
    # below O(1) per product instead of scanning a list every time.
    already_rated = set(df.loc[df['UserId'] == user_id, 'ProductId'])

    # Every distinct product the user has not rated yet, in first-seen order.
    candidates = [
        product for product in df['ProductId'].unique()
        if product not in already_rated
    ]

    # Predict a rating for each candidate.
    predictions = [
        (product, model.predict(user_id, product).est)
        for product in candidates
    ]

    # list.sort is stable, so equal estimates stay in first-seen order.
    predictions.sort(key=lambda pair: pair[1], reverse=True)

    return predictions[:num_recommendations]


# Main function
def main():
    """Repair the reviews CSV, train and save a model, then print picks.

    Steps: write a quote-repaired copy of the reviews file, load it, build
    the Surprise dataset, train and persist the model, and print the top
    recommendations for one hard-coded user together with the first review
    found for each recommended product.
    """
    reviews_file_path = 'Reviews.csv'  # Replace with your reviews dataset file path
    fixed_file_path = 'reviews_fixed.csv'  # Path to save the fixed CSV file
    model_save_path = 'recommendation_model.pkl'  # Path to save the trained model

    # Fix the CSV file
    fix_csv(reviews_file_path, fixed_file_path)

    # Load the fixed dataset
    df_reviews = load_data(fixed_file_path)
    data = prepare_data(df_reviews)

    # Train and save the model
    model = train_and_save_model(data, model_save_path)

    # Generate recommendations for a user
    user_id = 'A3SGXH7AUHU8GW'  # Replace with a valid user ID from your dataset
    recommendations = recommend_products(model, user_id, df_reviews)

    # Print recommendations
    print(f"Top recommendations for user {user_id}:")
    for i, (product_id, predicted_rating) in enumerate(recommendations, 1):
        # First review row for this product supplies the sample text.
        product_info = df_reviews[df_reviews['ProductId'] == product_id].iloc[0]
        print(f"{i}. Product ID: {product_id}, Predicted Rating: {predicted_rating:.2f}")
        print(f"   Summary: {product_info['Summary']}")
        # An empty Text cell is loaded by pandas as a float NaN, which cannot
        # be sliced; str() keeps the preview from raising TypeError.
        print(f"   Text: {str(product_info['Text'])[:100]}...")  # Print first 100 characters of the review
        print()


# Run the pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [17]:
import pandas as pd
import joblib
from surprise import Dataset, Reader


# Load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    Rows that pandas cannot parse are skipped (``on_bad_lines='skip'``)
    rather than raising.
    """
    return pd.read_csv(file_path, on_bad_lines='skip')


# Recommend products for a user using the saved model
def recommend_products(model, user_id, df, num_recommendations=5):
    """Return the highest-rated predictions for products the user has not rated.

    Args:
        model: Object exposing ``predict(user_id, product_id)`` whose result
            has an ``est`` attribute holding the estimated rating.
        user_id: Value compared against the ``UserId`` column of ``df``.
        df: DataFrame with at least ``UserId`` and ``ProductId`` columns.
        num_recommendations: Upper bound on the number of results.

    Returns:
        A list of ``(product_id, estimated_rating)`` tuples sorted by
        estimate, highest first. Ties keep the order in which the products
        first appear in ``df``.
    """
    # Products already rated by the user. A set keeps the membership test
    # below O(1) per product instead of scanning a list every time.
    already_rated = set(df.loc[df['UserId'] == user_id, 'ProductId'])

    # Every distinct product the user has not rated yet, in first-seen order.
    candidates = [
        product for product in df['ProductId'].unique()
        if product not in already_rated
    ]

    # Predict a rating for each candidate.
    predictions = [
        (product, model.predict(user_id, product).est)
        for product in candidates
    ]

    # list.sort is stable, so equal estimates stay in first-seen order.
    predictions.sort(key=lambda pair: pair[1], reverse=True)

    return predictions[:num_recommendations]


# Main function
def main():
    """Load the saved model and print recommendations for one user.

    Reads the reviews CSV, loads the persisted model with joblib, and
    prints the top recommendations for a hard-coded user together with the
    first review found for each recommended product.
    """
    # Paths
    # NOTE(review): this reads 'Reviews.csv' directly; if the model was
    # trained on a repaired copy of the file, the rows may differ - confirm.
    reviews_file_path = 'Reviews.csv'  # Path to the reviews dataset
    model_path = 'recommendation_model.pkl'  # Path to the saved model

    # Load the dataset
    df_reviews = load_data(reviews_file_path)

    # Load the trained model.
    # SECURITY: joblib.load unpickles the file and can execute arbitrary
    # code, so only load model files from a trusted source.
    model = joblib.load(model_path)
    print("Model loaded successfully.")

    # Generate recommendations for a user
    user_id = 'A3SGXH7AUHU8GW'  # Replace with a valid user ID from your dataset
    recommendations = recommend_products(model, user_id, df_reviews)

    # Print recommendations
    print(f"Top recommendations for user {user_id}:")
    for i, (product_id, predicted_rating) in enumerate(recommendations, 1):
        # First review row for this product supplies the sample text.
        product_info = df_reviews[df_reviews['ProductId'] == product_id].iloc[0]
        print(f"{i}. Product ID: {product_id}, Predicted Rating: {predicted_rating:.2f}")
        print(f"   Summary: {product_info['Summary']}")
        # An empty Text cell is loaded by pandas as a float NaN, which cannot
        # be sliced; str() keeps the preview from raising TypeError.
        print(f"   Text: {str(product_info['Text'])[:100]}...")  # Print first 100 characters of the review
        print()


# Run the recommendation step only when executed directly, not on import.
if __name__ == "__main__":
    main()

Model loaded successfully.
Top recommendations for user A3SGXH7AUHU8GW:
1. Product ID: B0032RPLSY, Predicted Rating: 5.00
   Summary: Simply delicious flavor!
   Text: I have been using butter ghee for cooking purposes, and found this product on Amazon.  The color is ...

2. Product ID: B005EL6VOY, Predicted Rating: 5.00
   Summary: Tasty and healthy!!
   Text: My husband and I love Coach's Oats.  We first tried them on recommendation by a friend.  They are ki...

3. Product ID: B0007UQ73W, Predicted Rating: 5.00
   Summary: Staple item in my pantry
   Text: I am a professional chef and work on mega yachts all around the world. This is one of the finest aut...

4. Product ID: B000Q0IMOK, Predicted Rating: 5.00
   Summary: Quality Product...good value
   Text: This Chocolate can be found at Dean & DeLuca at twice the price.<br />The packaging is a plain plast...

5. Product ID: B00401OZ1U, Predicted Rating: 5.00
   Summary: Great pure taste
   Text: No sugary junk here. Pure water and m