In [4]:
import pandas as pd

# Load datasets
# All three CSV files are read from one folder; edit DATA_DIR to point elsewhere.
from pathlib import Path

DATA_DIR = Path(r'C:\Users\kranthi kumar goli\Downloads')

customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

# Merge product and customer attributes onto every transaction row.
# how="left" keeps each transaction even if its key is absent from the lookup table.
# validate="many_to_one" makes pandas raise MergeError when a lookup table contains
# a repeated key; without it such a key would silently duplicate transaction rows
# and distort any sums or counts computed from the merged frame.
transactions = transactions.merge(
    products, on="ProductID", how="left", validate="many_to_one"
)
full_data = transactions.merge(
    customers, on="CustomerID", how="left", validate="many_to_one"
)

In [5]:
# Create customer-level features
def _most_common(values):
    """Return the first mode of a Series, or None when the Series has no mode."""
    modes = values.mode()
    if modes.empty:
        return None
    return modes[0]


# Aggregate the merged transactions to one row per CustomerID.
per_customer = full_data.groupby("CustomerID")
customer_features = per_customer.agg(
    total_spending=("TotalValue", "sum"),          # sum of TotalValue
    avg_spending=("TotalValue", "mean"),           # mean of TotalValue
    num_transactions=("TransactionID", "count"),   # non-null TransactionID count
    favorite_category=("Category", _most_common),  # most frequent Category
)
# Turn the CustomerID index back into a regular column.
customer_features = customer_features.reset_index()


In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize numerical features
# Columns that are rescaled here and later fed to the similarity computation.
numerical_features = ["total_spending", "avg_spending", "num_transactions"]

# Fit the scaler on the numeric columns and overwrite them with the scaled values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix
# Pairwise cosine similarity between the rows of the numeric feature columns.
feature_matrix = customer_features[numerical_features]
similarity_matrix = cosine_similarity(feature_matrix)

# Create a DataFrame for the similarity matrix
# Both axes are labelled with CustomerID so entries can be looked up by customer.
customer_ids = customer_features["CustomerID"]
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


In [8]:
# Map each CustomerID to a list of (other CustomerID, similarity score) pairs
# for its three most similar customers, highest score first.
lookalikes = {}

for customer_id in customer_features["CustomerID"]:
    # Remove the customer's own entry by label instead of skipping the first
    # sorted position: another customer can tie with (or, through floating-point
    # rounding, exceed) the self-similarity score, in which case position 0 is
    # not guaranteed to be the customer itself.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .head(3)
    )

    # Store results
    lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))


In [9]:
# Prepare the output DataFrame
# Only customers C0001 .. C0020 are exported. The ID set is built once up front
# rather than being regenerated on every loop iteration.
target_customer_ids = {f"C{str(i).zfill(4)}" for i in range(1, 21)}
output_columns = ["CustomerID", "SimilarCustomerID", "SimilarityScore"]

# One record per (customer, similar customer) pair, in the order stored in lookalikes.
lookalike_output = [
    {"CustomerID": customer_id, "SimilarCustomerID": sim_id, "SimilarityScore": score}
    for customer_id, similar in lookalikes.items()
    if customer_id in target_customer_ids
    for sim_id, score in similar
]

# Explicit columns keep the CSV header even when no target customer is present.
lookalike_df = pd.DataFrame(lookalike_output, columns=output_columns)

# Save to CSV
lookalike_df.to_csv("kiranKumar_goli_Lookalike.csv", index=False)
