In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the three raw tables (transactions, products, customers) from CSV
# files located in the current working directory.
transactions, product, customer = (
    pd.read_csv(csv_name)
    for csv_name in ("Transactions.csv", "Products.csv", "Customers.csv")
)

In [4]:
# Preview the first 10 rows of the product table.
product.head(10) 

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31
5,P006,ActiveWear Rug,Home Decor,121.32
6,P007,SoundWave Cookbook,Books,420.15
7,P008,BookWorld Bluetooth Speaker,Electronics,146.85
8,P009,BookWorld Wall Art,Home Decor,325.01
9,P010,ComfortLiving Smartwatch,Electronics,350.13


In [5]:
# Preview the first 10 rows of the customer table.
customer.head(10)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15
5,C0006,Brittany Palmer,South America,2024-01-07
6,C0007,Paul Graves,Asia,2022-06-18
7,C0008,David Li,North America,2024-01-13
8,C0009,Joy Clark,Europe,2023-08-14
9,C0010,Aaron Cox,Europe,2022-12-15


In [7]:
# Preview the first 10 rows of the transactions table.
transactions.head(10)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68
5,T00442,C0188,P067,2024-12-26 14:40:03,1,300.68,300.68
6,T00490,C0195,P067,2024-11-24 11:49:48,3,902.04,300.68
7,T00536,C0008,P067,2024-09-22 06:13:59,1,300.68,300.68
8,T00564,C0157,P067,2024-12-07 17:57:40,3,902.04,300.68
9,T00631,C0130,P067,2024-05-14 23:14:59,2,601.36,300.68


In [10]:
# Enrich every transaction with its customer and product attributes.
# Both joins use pandas' default inner join, so a transaction is kept only
# when its CustomerID and ProductID are found in the lookup tables.
# NOTE(review): the previews above show a Price column in both transactions
# and product, so the merged frame should carry suffixed Price_x / Price_y
# columns — confirm before using Price downstream.
merged_data = pd.merge(
    pd.merge(transactions, customer, on="CustomerID"),
    product,
    on="ProductID",
)

In [11]:
# Collapse the merged transaction rows into one feature row per customer.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        # Total spending across all of the customer's transactions.
        TotalSpending=("TotalValue", "sum"),
        # Number of distinct products purchased.
        UniqueProducts=("ProductID", "nunique"),
        # Most frequent category; ties resolve to the first value mode() returns.
        MostPurchasedCategory=("Category", lambda cats: cats.mode()[0]),
        # Number of transactions.
        TransactionCount=("TransactionID", "count"),
    )
    .reset_index()
)

In [13]:
# Attach each customer's region and signup date. The left join keeps every
# row already present in customer_features.
customer_features = pd.merge(
    customer_features,
    customer.loc[:, ["CustomerID", "Region", "SignupDate"]],
    how="left",
    on="CustomerID",
)

In [14]:
# Average value of a transaction = total spending / number of transactions.
customer_features["AvgTransactionValue"] = customer_features["TotalSpending"].div(
    customer_features["TransactionCount"]
)

print(customer_features.head())

  CustomerID  TotalSpending  UniqueProducts MostPurchasedCategory  \
0      C0001        3354.52               5           Electronics   
1      C0002        1862.74               4              Clothing   
2      C0003        2725.38               4            Home Decor   
3      C0004        5354.88               8                 Books   
4      C0005        2034.24               3           Electronics   

   TransactionCount         Region  SignupDate  AvgTransactionValue  
0                 5  South America  2022-07-10              670.904  
1                 4           Asia  2022-02-13              465.685  
2                 4  South America  2024-03-07              681.345  
3                 8  South America  2022-10-09              669.360  
4                 3           Asia  2022-08-15              678.080  


In [15]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [16]:
# Numeric columns of customer_features that feed the similarity computation.
numerical_features = [
    "TotalSpending",
    "UniqueProducts",
    "TransactionCount",
    "AvgTransactionValue",
]

In [17]:
# Rescale every numeric feature with a default-configured MinMaxScaler so that
# no single column dominates the similarity computation through its magnitude.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(customer_features.loc[:, numerical_features])

In [18]:
# Pairwise cosine similarity between all customer rows of the scaled matrix.
similarity_matrix = cosine_similarity(X=normalized_data)

In [19]:
# Label both axes of the similarity matrix with customer IDs so that scores
# can be looked up by ID instead of by position.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

In [23]:
def get_top_similar_customers(customer_id, top_n=3, similarity=None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up; must be present in the index of
        the similarity table.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Similarity table whose index and columns are both customer IDs.
        When omitted, the notebook-level ``similarity_df`` is used, which
        keeps existing two-argument calls working unchanged.

    Returns
    -------
    list of tuple
        ``(other_customer_id, score)`` pairs ordered from highest to lowest
        score. The queried customer itself is never included.

    Raises
    ------
    ValueError
        If ``customer_id`` is not in the index of the similarity table.
    """
    if similarity is None:
        # Backward-compatible default: fall back to the global matrix built
        # earlier in the notebook.
        similarity = similarity_df
    if customer_id not in similarity.index:
        raise ValueError(f"CustomerID {customer_id} not found.")
    # Remove the customer's own entry so it cannot be returned as its own
    # best match.
    others = similarity.loc[customer_id].drop(customer_id)
    best = others.nlargest(top_n)
    return list(zip(best.index, best.values))

In [24]:
# Map each of the first 20 customers in customer_features to its three
# closest lookalikes as (customer_id, score) pairs.
lookalike_map = {
    target_id: get_top_similar_customers(target_id, top_n=3)
    for target_id in customer_features["CustomerID"].iloc[:20]
}

In [25]:
# One row per customer; the lookalikes are flattened into a single
# "id:score, id:score, ..." string with scores rounded to two decimals.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map),
    "SimilarCustomers": [
        ", ".join(f"{other_id}:{sim_score:.2f}" for other_id, sim_score in matches)
        for matches in lookalike_map.values()
    ],
})

In [26]:
# Show the first rows of the lookalike table as plain text.
print(lookalike_df.head())

  CustomerID                    SimilarCustomers
0      C0001  C0137:1.00, C0152:1.00, C0056:1.00
1      C0002  C0029:1.00, C0199:1.00, C0031:1.00
2      C0003  C0178:1.00, C0035:1.00, C0133:1.00
3      C0004  C0021:1.00, C0124:1.00, C0090:1.00
4      C0005  C0073:1.00, C0159:1.00, C0112:1.00


In [27]:
# Write the one-row-per-customer lookalike table to disk without the index.
# NOTE: a later cell writes top_3_df to this same "Lookalike.csv" path, so
# this version of the file is replaced when the notebook runs to the end.
lookalike_df.to_csv("Lookalike.csv", index=False)

In [30]:
def get_top_3_recommendations(similarity_df, top_n=3):
    """Build the list of most similar customers for every customer.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Similarity table whose index and columns are both customer IDs.
    top_n : int, default 3
        Number of lookalikes to keep per customer. The default reproduces
        the original fixed top-3 behaviour.

    Returns
    -------
    list of dict
        One entry per row of ``similarity_df``, in index order, shaped as
        ``{"CustomerID": id, "Recommendations": [{"SimilarCustomerID": id,
        "SimilarityScore": score}, ...]}`` with recommendations ordered from
        highest to lowest score. A customer is never recommended to itself.
    """
    recommendations = []
    for customer_id, row in similarity_df.iterrows():
        # Drop the customer's own entry, then keep the top_n best scores.
        # nlargest avoids sorting the whole row and keeps the first
        # occurrence on ties, matching get_top_similar_customers.
        top_matches = row.drop(customer_id).nlargest(top_n)
        recommendations.append({
            "CustomerID": customer_id,
            "Recommendations": [
                {"SimilarCustomerID": similar_id, "SimilarityScore": score}
                for similar_id, score in zip(top_matches.index, top_matches.values)
            ]
        })
    return recommendations

recommendations = get_top_3_recommendations(similarity_df)

# Flatten the nested results for the first 20 customers into one record per
# (customer, similar customer) pair.
recommendations_data = [
    {
        "CustomerID": entry["CustomerID"],
        "SimilarCustomerID": match["SimilarCustomerID"],
        "SimilarityScore": match["SimilarityScore"],
    }
    for entry in recommendations[:20]
    for match in entry["Recommendations"]
]

In [29]:
# Build a long-format table from recommendations_data: one row per
# (CustomerID, SimilarCustomerID) pair with its SimilarityScore.
top_3_df = pd.DataFrame(recommendations_data)

# NOTE: this writes to the same "Lookalike.csv" path that an earlier cell used
# for lookalike_df, so the earlier file contents are overwritten.
top_3_df.to_csv("Lookalike.csv", index=False)

# Show the first rows as plain text.
print(top_3_df.head())

  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0137         0.999993
1      C0001             C0152         0.999987
2      C0001             C0056         0.999578
3      C0002             C0029         0.999710
4      C0002             C0199         0.999393
