In [3]:
import ast
import re

import numpy as np
import pandas as pd

In [7]:
# Directory holding the three input CSVs. '/content' is the path this notebook
# was run with; change this one constant to run the notebook elsewhere.
DATA_DIR = '/content'

# Raw input tables; later cells read `customers` and `transactions`.
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Create customer profiles: one row per customer that appears in `transactions`,
# holding total spend, total quantity and the number of distinct products bought.
customer_profiles = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductID': 'nunique'
}).reset_index()

# Default (inner) merge: only customers present in both tables keep a profile,
# so customers without any transaction get no lookalikes.
customer_profiles = customer_profiles.merge(customers[['CustomerID', 'Region']], on='CustomerID')

# One-hot encoding
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'])

# Normalize features (every column except the identifier)
scaler = StandardScaler()
features = scaler.fit_transform(customer_profiles.drop('CustomerID', axis=1))

# Cosine similarity between every pair of customer feature vectors
similarity_matrix = cosine_similarity(features)

# Number of lookalikes to keep per customer.
TOP_N = 3

customer_ids = customer_profiles['CustomerID'].to_numpy()

# Exclude each customer from their own ranking explicitly. Relying on "self is
# always rank 0" breaks when another customer has an identical profile or when
# floating-point rounding puts a neighbour marginally ahead: the wrong entry is
# dropped and the customer can show up as their own lookalike.
ranking_scores = similarity_matrix.copy()
np.fill_diagonal(ranking_scores, -np.inf)

# Never ask for more neighbours than there are other customers.
n_neighbours = min(TOP_N, max(len(customer_ids) - 1, 0))

# Get the top similar customers
lookalike_results = {}
for idx, customer_id in enumerate(customer_ids):
    # Stable sort keeps tie-breaking deterministic (lowest row position first).
    similar_indices = np.argsort(-ranking_scores[idx], kind='stable')[:n_neighbours]
    # Store plain Python floats: NumPy >= 2 would otherwise write the scores
    # to the CSV as "np.float64(...)" text.
    lookalike_results[customer_id] = [
        (customer_ids[j], float(similarity_matrix[idx, j]))
        for j in similar_indices
    ]

# Save lookalike results: one row per customer, the `lookalikes` cell holds the
# list of (customer_id, score) pairs.
lookalike_df = pd.DataFrame([
    {'cust_id': key, 'lookalikes': value}
    for key, value in lookalike_results.items()
])
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)


In [9]:
# Read back the file written by the lookalike cell. Use the same relative path
# it was saved under, so this does not depend on the working directory
# happening to be /content.
a = pd.read_csv('FirstName_LastName_Lookalike.csv')

In [10]:
# Plain-text dump of the lookalike table read back from the CSV. After the
# round-trip the `lookalikes` column holds the textual form of each list.
print(a)

    cust_id                                         lookalikes
0     C0001  [('C0107', 0.9967254279047943), ('C0137', 0.99...
1     C0002  [('C0142', 0.9945338541294934), ('C0177', 0.98...
2     C0003  [('C0133', 0.9663811898389756), ('C0190', 0.96...
3     C0004  [('C0113', 0.988351024735706), ('C0102', 0.972...
4     C0005  [('C0186', 0.9985801194257661), ('C0159', 0.99...
..      ...                                                ...
194   C0196  [('C0066', 0.9388086443339202), ('C0170', 0.89...
195   C0197  [('C0073', 0.9975449329947235), ('C0167', 0.99...
196   C0198  [('C0063', 0.9945834523031237), ('C0014', 0.99...
197   C0199  [('C0166', 0.9855536644055365), ('C0197', 0.98...
198   C0200  [('C0022', 0.9763729421821856), ('C0138', 0.96...

[199 rows x 2 columns]


In [13]:
def _parse_lookalikes(value):
    """Turn one `lookalikes` cell into a Python list of (customer_id, score) pairs.

    Cells read from CSV are strings; anything that is not a string is assumed
    to already be the parsed sequence and is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    # NumPy >= 2 writes scalars as e.g. "np.float64(0.99)"; unwrap those so the
    # text is a plain Python literal again.
    unwrapped = re.sub(r"np\.\w+\(([^()]*)\)", r"\1", value)
    # literal_eval only accepts literals, so text from a file cannot execute
    # arbitrary code the way eval() could.
    return ast.literal_eval(unwrapped)


def calculate_accuracy(predictions, ground_truth):
    """Calculates the accuracy of the lookalike model predictions.

    A customer counts as correct when at least one predicted lookalike ID also
    appears in that customer's ground-truth lookalikes (a hit rate, not an
    exact-match score).

    Args:
        predictions: DataFrame with columns `cust_id` and `lookalikes`.
        ground_truth: DataFrame with the same two columns.
            In both frames a `lookalikes` cell is either a list of
            (customer_id, score) pairs or its textual form as read from CSV.

    Returns:
        Fraction of customers present in both frames with at least one shared
        lookalike ID, or 0 (after printing a warning) when the frames share
        no customer IDs.

    Raises:
        ValueError, SyntaxError: if a `lookalikes` string is not a valid
            Python literal.
    """
    # Compare IDs as strings, working on copies so the caller's frames are
    # left untouched.
    predictions = predictions.assign(cust_id=predictions['cust_id'].astype(str))
    ground_truth = ground_truth.assign(cust_id=ground_truth['cust_id'].astype(str))

    # Both frames carry a `lookalikes` column, so the merge suffixes them
    # `_x` (predictions) and `_y` (ground truth).
    merged_df = pd.merge(predictions, ground_truth, on='cust_id', how='inner')

    if merged_df.empty:
        print("Warning: No matching customer IDs found between predictions and ground truth.")
        return 0

    correct_predictions = 0
    for predicted_raw, actual_raw in zip(merged_df['lookalikes_x'], merged_df['lookalikes_y']):
        predicted_lookalikes = {pair[0] for pair in _parse_lookalikes(predicted_raw)}
        actual_lookalikes = {pair[0] for pair in _parse_lookalikes(actual_raw)}
        if predicted_lookalikes & actual_lookalikes:
            correct_predictions += 1

    return correct_predictions / len(merged_df)

In [15]:
# NOTE(review): `ground_truth` is not defined in any cell visible in this
# notebook — confirm where it is loaded; a Restart & Run All may fail here
# with NameError.
# calculate_accuracy counts a customer as correct when at least one predicted
# lookalike ID also appears in the ground-truth list, so the figure printed
# below is a hit rate rather than an exact-match accuracy.
accuracy = calculate_accuracy(a, ground_truth)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 100.00%
