In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Input CSV locations (absolute paths on the author's machine)
transactions_path = r"C:\Users\Keertipriya Bellary\Transactions.csv"
products_path = r"C:\Users\Keertipriya Bellary\Products.csv"
customers_path = r"C:\Users\Keertipriya Bellary\Customers.csv"

# Step 1: Read the three CSVs; the unpacking order on the left mirrors the
# order of the paths in the tuple on the right.
transactions_df, products_df, customers_df = (
    pd.read_csv(csv_path)
    for csv_path in (transactions_path, products_path, customers_path)
)

# Step 2: Data Cleaning
# For each table: fill the one column that has a default value, then drop
# rows that are exact duplicates of an earlier row.
transactions_df = transactions_df.fillna({'TotalValue': 0}).drop_duplicates()
products_df = products_df.fillna({'Category': 'Unknown'}).drop_duplicates()
customers_df = customers_df.fillna({'Region': 'Unknown'}).drop_duplicates()

# Attach product and customer attributes to every transaction. Left joins
# keep transactions whose ProductID / CustomerID has no match; the columns
# those rows are missing are back-filled with 'Unknown' right after.
merged_df = (
    transactions_df
    .merge(products_df, on="ProductID", how="left")
    .merge(customers_df, on="CustomerID", how="left")
    .fillna({
        'ProductName': 'Unknown',
        'Category': 'Unknown',
        'Region': 'Unknown'
    })
)

# Step 3: Feature Engineering
# Each transaction row gets one text snippet: "<Region> <Category> <ProductName>".
merged_df['CombinedInfo'] = merged_df['Region'].str.cat(
    [merged_df['Category'], merged_df['ProductName']],
    sep=" ",
)

# One text "document" per customer: all of that customer's snippets joined
# with single spaces.
customer_profiles = (
    merged_df
    .groupby('CustomerID')['CombinedInfo']
    .agg(' '.join)
    .reset_index()
)

# Turn each customer document into a TF-IDF vector (English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
customer_vectors = vectorizer.fit_transform(customer_profiles['CombinedInfo'])

# Pairwise cosine similarity between every pair of customer vectors
cosine_sim_matrix = cosine_similarity(customer_vectors)

# Label both axes with CustomerID so scores can be looked up by customer
similarity_df = pd.DataFrame(
    cosine_sim_matrix,
    index=customer_profiles['CustomerID'],
    columns=customer_profiles['CustomerID'],
)

# Step 4: Generate Lookalike Recommendations
# Maps each target CustomerID to a list of up to three strings of the form
# "<CustomerID> (<similarity as a percentage, 2 decimals>%)", best match first.
lookalike_recommendations = {}

# Targets are the first 20 CustomerIDs in customer_profiles. The groupby that
# built customer_profiles sorts by key, so this is presumably C0001 to C0020
# as long as each of those customers has at least one transaction --
# NOTE(review): confirm against the data.
for customer_id in customer_profiles['CustomerID'][:20]:
    # Exclude the customer themselves by label rather than by position:
    # skipping "the first sorted entry" is only correct when the customer's
    # own score is strictly the largest. If another customer ties at the top
    # (an identical profile, or floating-point rounding of the self-score),
    # the customer could appear in their own list and a real match would be
    # dropped.
    other_scores = similarity_df[customer_id].drop(labels=customer_id)

    # nlargest returns the scores in descending order and keeps the first
    # occurrence on ties, so the output is deterministic.
    # Convert similarity scores to percentages
    similar_customers = other_scores.nlargest(3) * 100

    recommendations = [
        f"{cust_id} ({similarity:.2f}%)"
        for cust_id, similarity in similar_customers.items()
    ]
    lookalike_recommendations[customer_id] = recommendations

# Step 5: Export Results to Lookalike.csv
# One record per target customer; that customer's recommendation strings are
# collapsed into a single comma-separated cell.
lookalike_list = [
    {'CustomerID': source_id, 'Top3Lookalikes': ', '.join(matches)}
    for source_id, matches in lookalike_recommendations.items()
]

# Tabulate the records and write them without the DataFrame's row index
lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations with similarity percentages saved to Lookalike.csv.")


Lookalike recommendations with similarity percentages saved to Lookalike.csv.
