In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the customer table and the transaction table from CSV files in the
# current working directory (customers first, then transactions).
customers, transactions = (
    pd.read_csv(csv_path) for csv_path in ('Customers.csv', 'Transactions.csv')
)

In [3]:
# Per-customer aggregates built from the transaction table.
# Each result is a two-column frame: CustomerID plus the named aggregate.

# Sum of TotalValue over all of a customer's transactions.
total_spending = (
    transactions.groupby('CustomerID')['TotalValue']
    .sum()
    .reset_index()
    .rename(columns={'TotalValue': 'TotalSpending'})
)

# Number of non-null TransactionID entries per customer.
transaction_counts = (
    transactions.groupby('CustomerID')['TransactionID']
    .count()
    .reset_index()
    .rename(columns={'TransactionID': 'TransactionCount'})
)

In [18]:
# Merge datasets
# Left joins keep every row of `customers`, so customers with no transactions
# survive the merge with NaN in the aggregate columns.
customer_data = (
    customers
    .merge(total_spending, on='CustomerID', how='left')
    .merge(transaction_counts, on='CustomerID', how='left')
    # Fill NaN values with 0 for customers with no transactions.
    # The fill is applied to the frame itself: `frame[col].fillna(..., inplace=True)`
    # is chained in-place assignment, which warns on pandas 2.x and silently
    # leaves the frame unchanged when Copy-on-Write is enabled.
    .fillna({'TotalSpending': 0, 'TransactionCount': 0})
)

In [19]:
# Create additional features
# Average value per transaction. Zero (or missing) transaction counts are masked
# to NaN before dividing, so both 0/0 and x/0 become NaN and are then set to 0
# instead of producing NaN/inf. The result is assigned back explicitly rather
# than via `col.fillna(..., inplace=True)`, which does not update the frame
# under pandas Copy-on-Write.
customer_data['AverageTransactionValue'] = (
    customer_data['TotalSpending']
    .div(customer_data['TransactionCount'].where(customer_data['TransactionCount'] > 0))
    .fillna(0)
)

In [20]:
# The three numeric columns that feed the similarity calculation,
# taken for every customer row.
features = customer_data.loc[
    :, ['TotalSpending', 'TransactionCount', 'AverageTransactionValue']
]

In [21]:
# Standardize the features: fit the scaler on the feature frame, then
# transform that same frame with the fitted scaler.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

In [22]:
# Pairwise cosine similarity between every pair of rows of the scaled features.
similarity_matrix = cosine_similarity(features_scaled)

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)

In [23]:
# Function to get top 3 lookalikes for a given customer
def get_top_lookalikes(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label to look up in the similarity matrix.
    n : int, default 3
        Number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Similarity matrix labelled by customer ID on both axes. When omitted,
        the notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by lookalike ID, highest first. The queried
        customer is never part of the result.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity matrix.
    """
    if similarity is None:
        similarity = similarity_df

    scores = similarity[customer_id]
    # Exclude the customer by label rather than by position: with tied scores
    # or floating-point rounding the customer's own entry is not guaranteed to
    # sort first, so slicing off position 0 can drop a real match and keep the
    # customer itself.
    candidates = scores.drop(labels=customer_id, errors='ignore')
    # A stable sort keeps tied scores in their original row order.
    return candidates.sort_values(ascending=False, kind='mergesort').head(n)

In [24]:
# Initialize a dictionary to store the results:
# CustomerID -> list of (LookalikeID, score) tuples, best match first.
lookalike_results = {}

# Get lookalikes for the first 20 rows of customer_data
# (presumably C0001 to C0020 -- this relies on the row order of Customers.csv).
for customer_id in customer_data['CustomerID'].head(20):
    lookalikes = get_top_lookalikes(customer_id)
    # Cast each score to a built-in float. The list is later written to CSV as
    # its text representation, and NumPy scalars would appear there as
    # `np.float64(...)` on NumPy >= 2.
    lookalike_results[customer_id] = [
        (lookalike_id, float(score)) for lookalike_id, score in lookalikes.items()
    ]

In [25]:
# Flatten the results mapping into a list of
# (CustomerID, [(LookalikeID, Score), ...]) tuples, in insertion order.
output_data = [
    (cust_id, matches) for cust_id, matches in lookalike_results.items()
]

In [26]:
# Build a two-column frame from the result tuples: one row per customer,
# with that customer's lookalike list held in a single cell.
lookalike_df = pd.DataFrame(
    data=output_data,
    columns=['CustomerID', 'Lookalikes'],
)

In [27]:
# Write the lookalike table to disk without the row index.
lookalike_df.to_csv(
    path_or_buf='DOKKU_MANASA_Lookalike.csv',
    index=False,
)

In [28]:
# Read the exported file back into a DataFrame. CSV has no list type, so the
# Lookalikes column comes back as text, not as Python lists.
lookalike = pd.read_csv(filepath_or_buffer='DOKKU_MANASA_Lookalike.csv')

In [29]:
# Display up to the first 20 rows of the reloaded table.
lookalike.iloc[:20]

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0137', 0.999217832279607), ('C0152', 0.992..."
1,C0002,"[('C0029', 0.9996304690463752), ('C0199', 0.99..."
2,C0003,"[('C0005', 0.9999316372091099), ('C0178', 0.99..."
3,C0004,"[('C0067', 0.9998110253764196), ('C0021', 0.99..."
4,C0005,"[('C0003', 0.9999316372091099), ('C0073', 0.99..."
5,C0006,"[('C0079', 0.9999839458199934), ('C0117', 0.99..."
6,C0007,"[('C0085', 0.9998021851965131), ('C0140', 0.99..."
7,C0008,"[('C0084', 0.9955933187236518), ('C0194', 0.99..."
8,C0009,"[('C0077', 0.9998301326192339), ('C0032', 0.99..."
9,C0010,"[('C0029', 0.9997659902062711), ('C0025', 0.99..."
