In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory holding the three input CSV files. Keeping it in one constant means
# the notebook can be pointed at another location by editing a single line
# instead of three separate path literals.
DATA_DIR = 'C:/Users/91701/Downloads'

# Raw inputs, loaded as-is; no parsing or cleaning happens in this cell.
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')

In [3]:
# Convert the date columns to pandas datetimes, in place on each frame.
for frame, date_column in (
    (transactions, 'TransactionDate'),
    (customers, 'SignupDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [4]:
# Attach product columns to every transaction. The left join keeps transactions
# whose ProductID has no match in the product table (their product columns are null).
transactions_products = transactions.merge(products, how='left', on='ProductID')

# Attach customer columns the same way, keyed on CustomerID.
full_data = transactions_products.merge(customers, how='left', on='CustomerID')

In [5]:
def _most_frequent(values):
    """Return the most frequent non-null value in ``values``, or None if there is none.

    ``value_counts()`` skips nulls, so a group whose values are all missing
    yields an empty count table; calling ``idxmax()`` on it raises ValueError.
    The left joins used to build ``full_data`` can produce such groups, so the
    empty case is handled explicitly instead of aborting the whole aggregation.
    """
    counts = values.value_counts()
    return None if counts.empty else counts.idxmax()


# Collapse the transaction-level table to one row per CustomerID.
customer_features = full_data.groupby('CustomerID').agg({
    'Quantity': 'sum',  # Total quantity purchased
    'TotalValue': 'sum',  # Total spending
    'Category': _most_frequent,  # Most purchased category (None if no category is known)
    'Region': 'first',  # Region of the customer
    'SignupDate': 'first'  # Signup date
}).reset_index()

In [6]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so customers in that level get all-zero dummies for it.
# NOTE(review): this cell rebinds customer_features, so it cannot be re-run
# without re-running the aggregation cell first.
categorical_columns = ['Category', 'Region']
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
    drop_first=True,
)

In [7]:
# Replace the signup timestamp with its calendar year: SignupYear is appended
# as the last column and the original SignupDate column is removed.
customer_features = (
    customer_features
    .assign(SignupYear=lambda frame: frame['SignupDate'].dt.year)
    .drop(columns=['SignupDate'])
)

In [8]:
# Standardise the numeric columns so they are on a comparable scale before
# similarities are computed; the dummy columns are left untouched.
numerical_features = ['Quantity', 'TotalValue', 'SignupYear']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values


In [9]:
# IDs in row order; position i here corresponds to row i of feature_matrix
# and therefore to row/column i of similarity_matrix.
customer_ids = customer_features['CustomerID']

# Request a float array explicitly. Depending on the pandas version the dummy
# columns may be bool, and mixing bool with float columns makes `.values`
# return an object-dtype array that only works through implicit coercion.
feature_matrix = customer_features.drop(columns=['CustomerID']).to_numpy(dtype=float)

# Pairwise cosine similarity between all customer feature vectors.
similarity_matrix = cosine_similarity(feature_matrix)


In [10]:
def get_top_similar(customers, similarity_matrix, top_n=3):
    """Return the ``top_n`` most similar other customers for every customer.

    Parameters
    ----------
    customers : iterable of customer IDs
        IDs in the same order as the rows/columns of ``similarity_matrix``.
        Only iteration order is used, so a pandas Series with any index,
        a list, or an array all work.
    similarity_matrix : 2-D indexable
        ``similarity_matrix[i][j]`` is the similarity between the i-th and
        j-th customer.
    top_n : int, default 3
        Maximum number of lookalikes per customer; values below 1 give
        empty lists.

    Returns
    -------
    dict
        Maps each customer ID to a list of ``(other_customer_id, score)``
        tuples sorted by descending score. A customer never appears in its
        own list.
    """
    # Materialise the IDs once so lookups below are positional. Indexing a
    # Series with an integer is a label lookup and breaks on non-default indexes.
    ids = list(customers)
    limit = max(top_n, 0)

    lookalikes = {}
    for row, customer_id in enumerate(ids):
        # Drop the customer's own entry by position rather than assuming it
        # sorts first: with tied scores (e.g. an identical feature vector)
        # another customer can take the first slot, and slicing off the head
        # would then discard a real match and keep the customer itself.
        others = [
            (col, score)
            for col, score in enumerate(similarity_matrix[row])
            if col != row
        ]
        # sorted() is stable, so ties keep their original column order.
        others.sort(key=lambda pair: pair[1], reverse=True)
        lookalikes[customer_id] = [(ids[col], score) for col, score in others[:limit]]
    return lookalikes

# Top-3 lookalikes for every customer, keyed by customer ID.
lookalike_dict = get_top_similar(
    customers=customer_ids,
    similarity_matrix=similarity_matrix,
    top_n=3,
)

In [11]:
lookalike_list = []
for customer_id in customer_ids[:20]:
    top_similars = lookalike_dict[customer_id]
    lookalike_list.append({
        "CustomerID": customer_id,
        "Lookalikes": [
            {"SimilarCustomerID": sim[0], "SimilarityScore": sim[1]} for sim in top_similars
        ]
    })

In [12]:
# Tabulate the records: one row per customer, with the list of lookalike dicts
# stored as a single object-valued column.
lookalike_df = pd.DataFrame(lookalike_list, columns=["CustomerID", "Lookalikes"])

In [16]:
# Write the table to the working directory without the row index. The
# Lookalikes column is written as the string form of each list of dicts.
output_path = "Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)


In [17]:
# Preview up to the first 20 rows of the lookalike table.
lookalike_df.head(20)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[{'SimilarCustomerID': 'C0184', 'SimilaritySco..."
1,C0002,"[{'SimilarCustomerID': 'C0106', 'SimilaritySco..."
2,C0003,"[{'SimilarCustomerID': 'C0076', 'SimilaritySco..."
3,C0004,"[{'SimilarCustomerID': 'C0165', 'SimilaritySco..."
4,C0005,"[{'SimilarCustomerID': 'C0007', 'SimilaritySco..."
5,C0006,"[{'SimilarCustomerID': 'C0126', 'SimilaritySco..."
6,C0007,"[{'SimilarCustomerID': 'C0005', 'SimilaritySco..."
7,C0008,"[{'SimilarCustomerID': 'C0136', 'SimilaritySco..."
8,C0009,"[{'SimilarCustomerID': 'C0061', 'SimilaritySco..."
9,C0010,"[{'SimilarCustomerID': 'C0062', 'SimilaritySco..."
