# **Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive

# **Data Loading**

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the input CSVs can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the three input tables from the root of the mounted Drive folder.
# NOTE(review): the paths are hard-coded to /content/drive/MyDrive — adjust them if the
# CSV files live in a sub-folder or the notebook is run outside Colab.
customers = pd.read_csv('/content/drive/MyDrive/Customers.csv')
products = pd.read_csv('/content/drive/MyDrive/Products.csv')
transactions = pd.read_csv('/content/drive/MyDrive/Transactions.csv')


# **Lookalike Model**

In [10]:
# Lookalike model: build one feature vector per customer (total quantity, total
# spend, most frequent product category, region), compare all customers with
# cosine similarity, and report the closest matches for the leading customers.

N_TARGET_CUSTOMERS = 20  # how many leading customers get a lookalike list
N_LOOKALIKES = 3         # how many lookalikes are kept per target customer


def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or None if it has none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) > 0 else None


# Merging transactions, products, and customers data.
# Both joins start from `transactions`, so a customer without any transaction
# does not appear in `full_data` (and therefore gets no feature row below).
transactions_products = transactions.merge(products, on="ProductID", how="left")
full_data = transactions_products.merge(customers, on="CustomerID", how="left")

full_data_file = 'full_data.csv'
full_data.to_csv(full_data_file, index=False)

# Aggregate data for each customer (groupby sorts the CustomerID keys by default).
customer_features = full_data.groupby('CustomerID').agg({
    'Quantity': 'sum',
    'TotalValue': 'sum',
    'Category': _most_frequent,
    'Region': 'first',
}).reset_index()

# Saved before scaling, so the CSV holds the raw aggregated values.
customer_features_file = 'customer_features.csv'
customer_features.to_csv(customer_features_file, index=False)

# Standardize numeric features.
# NOTE: this overwrites the numeric columns of `customer_features` with their
# scaled values; from here on the frame no longer holds raw totals.
scaler = StandardScaler()
numeric_features = ['Quantity', 'TotalValue']
customer_features[numeric_features] = scaler.fit_transform(customer_features[numeric_features])

# Encode categorical features
encoder = OneHotEncoder()
categorical_features = ['Category', 'Region']
encoded_cats = encoder.fit_transform(customer_features[categorical_features]).toarray()

# Combining all features into a single feature matrix
feature_matrix = np.hstack([customer_features[numeric_features].values, encoded_cats])

# Compute cosine similarity (row i / column j = similarity of customers i and j)
similarity_matrix = cosine_similarity(feature_matrix)

# Generating lookalike customers for the first N_TARGET_CUSTOMERS customers.
# Row i of `similarity_matrix` corresponds to `customer_ids[i]`, so enumerating
# the leading slice of `customer_ids` keeps ids and matrix rows aligned.
customer_ids = customer_features['CustomerID'].values
top20_customers = customer_ids[:N_TARGET_CUSTOMERS]
lookalike_map = {}

for i, c_id in enumerate(top20_customers):
    # Rank on the raw similarity; rounding first would create artificial ties
    # and could change which customers make the top N.
    ranked = sorted(
        ((idx, score) for idx, score in enumerate(similarity_matrix[i]) if idx != i),
        key=lambda pair: pair[1],
        reverse=True,
    )[:N_LOOKALIKES]
    # Cast to a built-in float so the stringified list reads `0.9935` rather
    # than `np.float64(0.9935)` on NumPy >= 2.0.
    lookalike_customers = [
        (customer_ids[idx], round(float(score), 4)) for idx, score in ranked
    ]
    lookalike_map[c_id] = lookalike_customers

# Save lookalike results: one row per target customer, with the list of
# (lookalike id, score) tuples stored as its string representation.
lookalike = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "Lookalikes": [str(matches) for matches in lookalike_map.values()],
})
lookalike_file = 'Lookalike.csv'
lookalike.to_csv(lookalike_file, index=False)

# Show the full Lookalikes string instead of a truncated cell.
pd.set_option("display.max_colwidth", None)

print("Lookalike Model Results for First 20 Customers:")
lookalike


Lookalike Model Results for First 20 Customers:


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0184', 0.9935), ('C0048', 0.982), ('C0190', 0.9645)]"
1,C0002,"[('C0088', 0.997), ('C0092', 0.9326), ('C0106', 0.898)]"
2,C0003,"[('C0076', 0.9404), ('C0052', 0.9075), ('C0031', 0.8921)]"
3,C0004,"[('C0169', 0.9782), ('C0165', 0.9755), ('C0087', 0.9675)]"
4,C0005,"[('C0186', 0.9976), ('C0140', 0.9915), ('C0146', 0.9875)]"
5,C0006,"[('C0126', 0.9874), ('C0187', 0.9866), ('C0011', 0.9768)]"
6,C0007,"[('C0146', 1.0), ('C0005', 0.9871), ('C0115', 0.9837)]"
7,C0008,"[('C0065', 0.8535), ('C0059', 0.8477), ('C0160', 0.7635)]"
8,C0009,"[('C0198', 1.0), ('C0061', 0.9675), ('C0062', 0.9583)]"
9,C0010,"[('C0062', 0.9396), ('C0111', 0.9244), ('C0103', 0.8649)]"



The table shows the top 3 lookalike customers for each of the first 20 CustomerIDs, along with their cosine similarity scores. These scores indicate how closely each lookalike matches the original customer based on shared characteristics and behaviors: total quantity purchased, total spend, most frequently purchased product category, and region. For example, CustomerID C0001 has the top lookalike C0184 with a similarity score of 0.9935, followed by C0048 (0.982) and C0190 (0.9645). This information can be used to identify similar customer groups for personalized marketing, cross-selling, or retention strategies.

# **Lookalike Model Analysis**
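This model identifies, for each reference customer, the customers whose purchasing behavior and profile most closely match it, using cosine similarity over standardized numeric features and one-hot-encoded categorical features.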

**Key insights:**

**Customer Similarity:** High similarity scores indicate customers with similar purchasing patterns.

**Targeting:** Helps identify potential high-value customers for targeted marketing and promotions.

**Personalization:** Enables more tailored strategies by grouping customers with similar behaviors.

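**Scalability:** Can be applied to large datasets, making it useful for businesses to identify and engage lookalike customers. Note that the full pairwise similarity matrix grows quadratically with the number of customers, so very large customer bases would need a nearest-neighbour search rather than the complete matrix.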