In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the raw dataset into `df` (the location is a hard-coded absolute path).
df = pd.read_csv(filepath_or_buffer='E:/project/amazon.csv')

In [None]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

# Data Cleaning and Preprocessing


In [None]:
# Number of missing values in each column.
print(df.isna().sum())

In [None]:

# Schema overview: the column labels first, then the dtype of every column.
print("Column names:", list(df.columns))
print("Data types:\n", df.dtypes)

In [None]:
# Forward-fill missing values in place: each NaN takes the last valid value
# above it in the same column. `DataFrame.ffill` replaces the deprecated
# `fillna(method='ffill')` spelling.
# Only NaNs that precede the first valid value of a column can remain after this.
df.ffill(inplace=True)

In [7]:
# Discard every row that lacks a rating, a product ID or a user ID
# (adjust the column list as necessary).
required_columns = ['rating', 'product_id', 'user_id']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
summary_stats = df.describe()
print("Basic statistics:\n", summary_stats)



In [None]:
# Feature engineering: when a 'date' column exists, parse it and derive
# month and year columns from the parsed values.
if 'date' in df.columns:
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    df['month'] = parsed_dates.dt.month
    df['year'] = parsed_dates.dt.year

# 3. Exploratory Data Analysis



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Five product IDs that occur most often in the dataset
top_products = df['product_id'].value_counts().iloc[:5]

# seaborn returns the Axes it drew on; label that Axes directly
ax = sns.barplot(x=top_products.index, y=top_products.values)
ax.set_title('Top 5 Most Purchased Products')
ax.set_xlabel('Product ID')
ax.set_ylabel('Purchase Count')
plt.show()


In [None]:

# Number of non-null ratings recorded for each product
ratings_per_product = (
    df.groupby('product_id')['rating']
    .count()
    .reset_index(name='rating_count')
)

# Ten products with the highest rating count
top_products = ratings_per_product.sort_values(by='rating_count', ascending=False).iloc[:10]


In [None]:

# Bar chart of the ten most-rated products
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=top_products, x='product_id', y='rating_count')
ax.set_title('Top 10 Amazon Products by Rating Count')
ax.set_xlabel('Product ID')
ax.set_ylabel('Number of Ratings')
plt.xticks(rotation=45)  # product IDs are long; tilt them to avoid overlap
plt.show()

In [None]:
# Step 4: Visualization
import pandas as pd

# Reload the dataset and trim leading/trailing whitespace from every column name.
# NOTE(review): this rebinds `df`, so the cleaning applied by earlier cells is
# discarded, and it reads a relative path whereas the first load uses
# 'E:/project/amazon.csv' — confirm both refer to the same file.
df = pd.read_csv('amazon.csv').rename(columns=str.strip)  # Adjust the file name accordingly

# Show the resulting column labels for verification
print("Columns in DataFrame:", df.columns)

In [10]:

def _to_number(column, strip_pattern=None):
    """
    Convert a column to numeric values, turning unparseable entries into NaN.

    Parameters
    ----------
    column : pandas.Series
        Column to convert; may hold strings or already-numeric values.
    strip_pattern : str, optional
        Regular expression whose matches are removed before parsing
        (e.g. currency symbols, thousands separators, percent signs).

    Returns
    -------
    pandas.Series
        Numeric series; entries that cannot be parsed are NaN.
    """
    # Casting to str first keeps the cell re-runnable: the `.str` accessor
    # raises on a column that an earlier run already converted to numbers.
    text = column.astype(str)
    if strip_pattern is not None:
        text = text.str.replace(strip_pattern, '', regex=True)
    return pd.to_numeric(text.str.strip(), errors='coerce')


# Convert relevant columns to numeric, forcing errors to NaN
df['rating'] = _to_number(df['rating'])
# Thousands separators (e.g. "24,269") would otherwise be coerced to NaN
df['rating_count'] = _to_number(df['rating_count'], ',')
# '\u20b9' is the rupee sign; presumably prices may carry it instead of '$' — TODO confirm
df['discounted_price'] = _to_number(df['discounted_price'], '[\u20b9$,]')
df['actual_price'] = _to_number(df['actual_price'], '[\u20b9$,]')
df['discount_percentage'] = _to_number(df['discount_percentage'], '%')


In [None]:

# Keep only rows where rating, product ID and user ID are all present
key_columns = ['rating', 'product_id', 'user_id']
df.dropna(subset=key_columns, inplace=True)

# Mean rating for each product, returned as a two-column frame
average_ratings = df.groupby('product_id', as_index=False)['rating'].mean()

# Show the aggregated frame
print(average_ratings)




In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Use the white-grid seaborn style for this and later figures
sns.set(style='whitegrid')

# Histogram of ratings (10 bins) with a kernel density estimate overlaid
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=df, x='rating', bins=10, kde=True)
ax.set_title('Distribution of Product Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()

# Average Rating by Product Category


In [11]:
# Mean rating per category, ordered from highest to lowest
average_ratings_by_category = (
    df.groupby('category', as_index=False)['rating']
    .mean()
    .sort_values(by='rating', ascending=False)
)

# The ten categories with the highest mean rating
top_categories = average_ratings_by_category.iloc[:10]

In [None]:

# Horizontal bars: one per category, bar length is the mean rating
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='rating', y='category', data=top_categories, palette='viridis')
ax.set_title('Top 10 Average Ratings by Product Category')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Product Category')
plt.show()


# Top 10 Products by Rating


In [None]:
# Mean rating per product; keep the ten highest
top_rated_products = (
    df.groupby('product_id', as_index=False)['rating']
    .mean()
    .sort_values(by='rating', ascending=False)
    .head(10)
)


In [None]:

# Horizontal bars: one per product, bar length is the mean rating
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='rating', y='product_id', data=top_rated_products, palette='mako')
ax.set_title('Top 10 Products by Average Rating')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Product ID')
plt.show()

# Rating Count Distribution


In [None]:
# Histogram of the rating_count column (30 bins) with a density estimate overlaid
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=df, x='rating_count', bins=30, kde=True)
ax.set_title('Distribution of Rating Counts')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Write the cleaned frame to an Excel workbook (row index omitted) for reporting
df.to_excel(excel_writer='cleaned_amazon_data.xlsx', index=False)
print("Cleaned Amazon data exported to 'cleaned_amazon_data.xlsx'")


# Model Building


# Collaborative Filtering (using Surprise)

In [None]:
pip install surprise

In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

# Seed shared by the split and the model so a re-run reproduces the same RMSE
RANDOM_SEED = 42

# Load the cleaned workbook written by the export step into 'amazon'
amazon = pd.read_excel('cleaned_amazon_data.xlsx')  # Use pd.read_excel for Excel files

# Non-numeric rating entries (e.g. '|') become NaN and those rows are removed
amazon['rating'] = pd.to_numeric(amazon['rating'], errors='coerce')
amazon.dropna(subset=['rating'], inplace=True)

# Prepare data for Surprise: (user, item, rating) triples on a 1-5 scale
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(amazon[['user_id', 'product_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

# SVD model
model = SVD(random_state=RANDOM_SEED)
model.fit(trainset)

# Evaluate model on the held-out 20%
predictions = model.test(testset)
# verbose=False stops rmse() from printing its own "RMSE: ..." line, which
# would duplicate the print below
print("RMSE:", rmse(predictions, verbose=False))

# Content-Based Filtering (using TF-IDF for product descriptions)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the cleaned workbook into 'amazon_data'. The relative path matches where
# the export step writes the file, so this does not depend on the working
# directory being '/content'.
amazon_data = pd.read_excel('cleaned_amazon_data.xlsx')

# Compute the TF-IDF matrix for product descriptions (one row per product);
# missing descriptions are treated as empty text
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(amazon_data['about_product'].fillna(''))

def get_recommendations(product_name, top_n=5):
    """
    Recommend products whose descriptions are most similar to a named product.

    The first row of ``amazon_data`` whose ``product_name`` contains
    ``product_name`` (case-insensitive, literal match) is the query. All other
    rows are ranked by cosine similarity of their rows in ``tfidf_matrix``.

    Parameters
    ----------
    product_name : str
        Text to look for inside the product names.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Names of up to ``top_n`` most similar products, best match first.
        Empty when no product name matches.
    """
    # na=False makes missing names count as non-matching instead of putting
    # NaN into the boolean mask
    name_matches = amazon_data['product_name'].str.contains(
        product_name, case=False, regex=False, na=False
    )
    # Row positions rather than index labels, so they line up with tfidf_matrix rows
    matching_positions = name_matches.to_numpy(dtype=bool).nonzero()[0]

    # Check if any matching products were found
    if len(matching_positions) == 0:
        print(f"No products found matching '{product_name}'.")
        # An empty Series keeps the return type uniform, so callers can chain
        # Series methods such as drop_duplicates()
        return amazon_data['product_name'].iloc[0:0]

    query_position = matching_positions[0]

    # Similarity of the query row against every row (1-D array)
    similarity_scores = cosine_similarity(tfidf_matrix[query_position], tfidf_matrix)[0]

    # Rank best-first and drop the query row by position; it is not guaranteed
    # to rank first when other rows tie with it
    ranked_positions = similarity_scores.argsort()[::-1]
    ranked_positions = ranked_positions[ranked_positions != query_position][:top_n]

    # Return the names of the most similar products
    return amazon_data['product_name'].iloc[ranked_positions]

# Query with a product name that exists in the dataset
product_name_to_search = "Ambrane Unbreakable 60W / 3A Fast Charging 1.5m Braided Type C Cable for Smartphones, Tablets, Laptops & other Type C devices, PD Technology, 480Mbps Data Sync, Quick Charge 3.0 (RCT15A, Black)"

# Fetch the recommendations and collapse repeated product names in one step
recommendations = get_recommendations(product_name_to_search).drop_duplicates()

# Show the deduplicated recommendations
print(recommendations)

# Evaluation


In [None]:
# Shell install of scikit-surprise, the package that provides the `surprise` module.
!pip install scikit-surprise

In [None]:
# Install the dependency, then score the predictions produced by the SVD cell.
!pip install scikit-surprise # Installs the scikit-surprise package which contains the 'surprise' module
from surprise import accuracy # Imports the 'accuracy' module from the 'surprise' package

# Only RMSE is computed here; precision/recall are not calculated.
# NOTE: this rebinds the name `rmse`, which the SVD cell imports as a function
# from surprise.accuracy.
rmse = accuracy.rmse(predictions) # Root mean squared error (RMSE) of `predictions`

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the cleaned workbook into 'amazon_data'. The relative path matches where
# the export step writes the file, so this does not depend on the working
# directory being '/content'.
amazon_data = pd.read_excel('cleaned_amazon_data.xlsx')

# Compute the TF-IDF matrix for product descriptions (one row per product);
# missing descriptions are treated as empty text
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(amazon_data['about_product'].fillna(''))

def get_recommendations(product_name, top_n=5):
    """
    Recommend products whose descriptions are most similar to a named product.

    The first row of ``amazon_data`` whose ``product_name`` contains
    ``product_name`` (case-insensitive, literal match) is the query. All other
    rows are ranked by cosine similarity of their rows in ``tfidf_matrix``.

    Parameters
    ----------
    product_name : str
        Text to look for inside the product names.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Names of up to ``top_n`` most similar products, best match first.
        Empty when no product name matches.
    """
    # na=False makes missing names count as non-matching instead of putting
    # NaN into the boolean mask
    name_matches = amazon_data['product_name'].str.contains(
        product_name, case=False, regex=False, na=False
    )
    # Row positions rather than index labels, so they line up with tfidf_matrix rows
    matching_positions = name_matches.to_numpy(dtype=bool).nonzero()[0]

    # Check if any matching products were found
    if len(matching_positions) == 0:
        print(f"No products found matching '{product_name}'.")
        # An empty Series keeps the return type uniform, so callers can chain
        # Series methods such as drop_duplicates()
        return amazon_data['product_name'].iloc[0:0]

    query_position = matching_positions[0]

    # Similarity of the query row against every row (1-D array)
    similarity_scores = cosine_similarity(tfidf_matrix[query_position], tfidf_matrix)[0]

    # Rank best-first and drop the query row by position; it is not guaranteed
    # to rank first when other rows tie with it
    ranked_positions = similarity_scores.argsort()[::-1]
    ranked_positions = ranked_positions[ranked_positions != query_position][:top_n]

    # Return the names of the most similar products (the original body ended
    # without a return statement, so the function yielded None)
    return amazon_data['product_name'].iloc[ranked_positions]

# Visualization


In [None]:
from surprise import accuracy

# RMSE of the test-set predictions; with verbose on, accuracy.rmse also prints the value
rmse = accuracy.rmse(predictions, verbose=True)


# Reporting


In [None]:
# Write the frame used for recommendations to the final report workbook (row index omitted)
amazon_data.to_excel(excel_writer="Amazon_Project_Report.xlsx", index=False)
