In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import csv

# Features used for the similarity calculation
features = ['TotalSpend', 'AvgOrderValue', 'NumTransactions', 'AvgPrice', 'TotalQuantity']

# Number of lookalikes kept for every customer
N_LOOKALIKES = 3

# Standardize features to have mean=0 and variance=1.
# NOTE: `customer_profiles` must already exist (it is built by the
# data-preparation cell), and its raw feature columns are overwritten here
# with the scaled values.
scaler = StandardScaler()
customer_profiles[features] = scaler.fit_transform(customer_profiles[features])

# (k-NN) model
# The model is queried with its own training rows, so each customer normally
# comes back as one of its own neighbours; request one extra neighbour to
# compensate. The cap keeps the query valid when there are fewer customers
# than N_LOOKALIKES + 1.
n_query_neighbors = min(N_LOOKALIKES + 1, len(customer_profiles))
knn = NearestNeighbors(n_neighbors=n_query_neighbors, algorithm='auto').fit(customer_profiles[features])

# For row i, indices[i] holds the row positions of its nearest neighbours and
# distances[i] the matching distances (NearestNeighbors' default metric,
# Minkowski with p=2, i.e. Euclidean), nearest first.
distances, indices = knn.kneighbors(customer_profiles[features])

# Read the ID column once instead of building a full row via `.iloc` for
# every neighbour inside the loop.
customer_ids = customer_profiles['CustomerID'].to_numpy()

# Dictionary to store lookalikes: customer ID -> [(lookalike ID, distance), ...]
# The stored score is a distance, so a SMALLER value means a closer match.
lookalikes = {}
for i, customer_id in enumerate(customer_ids):
    # Nearest neighbours of this customer, excluding the customer itself
    similar_customers = [
        (customer_ids[idx], float(distance))
        for idx, distance in zip(indices[i], distances[i])
        if customer_ids[idx] != customer_id
    ]
    lookalikes[customer_id] = similar_customers[:N_LOOKALIKES]

# Turn the lookalikes mapping into one record per customer. All lookalikes of
# a customer are rendered into a single '; '-separated string so that they fit
# into one CSV cell.
# NOTE(review): the value labelled "Similarity Score" is the second element of
# each tuple in `lookalikes`; in this notebook that is the k-NN distance, so a
# smaller number means a closer match.
lookalikes_for_csv = [
    {
        'CustomerID': customer_id,
        'Lookalikes': '; '.join(
            f"Customer ID: {neighbour_id} with Similarity Score: {score:.6f}"
            for neighbour_id, score in similar_list
        ),
    }
    for customer_id, similar_list in lookalikes.items()
]

# Materialize the records as a DataFrame and write them out without the index
lookalikes_df = pd.DataFrame(lookalikes_for_csv)
lookalikes_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been created.")

Lookalike.csv has been created.


In [13]:
import pandas as pd
import numpy as np

# Hand-written sample of the lookalikes mapping:
#   customer ID -> list of (lookalike customer ID, score) pairs.
# NOTE(review): this rebinds `lookalikes` and, below, rewrites 'Lookalike.csv'
# with only these sample rows, replacing any mapping/file produced earlier in
# the session. Confirm that this is intended.
lookalikes = {
    'C0001': [
        ('C0137', np.float64(0.2214039003915598)),
        ('C0152', np.float64(0.3657473231895993)),
        ('C0191', np.float64(0.3996859373299195)),
    ],
    'C0002': [
        ('C0029', np.float64(0.0926072364436609)),
        ('C0031', np.float64(0.40654195920725744)),
        ('C0142', np.float64(0.5281648966830289)),
    ],
    # Add more customers as needed
}

# One record per customer; the lookalikes are packed into a compact
# "ID:score" list separated by '; ', with scores rounded to six decimals.
lookalikes_for_csv = [
    {
        'CustomerID': customer_id,
        'Lookalikes': '; '.join(
            f"{neighbour_id}:{float(score):.6f}" for neighbour_id, score in pairs
        ),
    }
    for customer_id, pairs in lookalikes.items()
]

# Build the table, persist it without the index column, and show it
lookalikes_df = pd.DataFrame(lookalikes_for_csv)
lookalikes_df.to_csv('Lookalike.csv', index=False)

print(lookalikes_df)

  CustomerID                                      Lookalikes
0      C0001  C0137:0.221404; C0152:0.365747; C0191:0.399686
1      C0002  C0029:0.092607; C0031:0.406542; C0142:0.528165


In [1]:
import pandas as pd

# Read the three source tables from the working directory
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

source_tables = (customers, products, transactions)

# Preview the first few rows of every table
for table in source_tables:
    print(table.head())

# Report the number of missing values per column in every table
for table in source_tables:
    print(table.isnull().sum())

# Replace missing regions, if any, with the placeholder 'Unknown'
customers = customers.assign(Region=customers['Region'].fillna('Unknown'))

# Attach customer attributes, then product attributes, to every transaction.
# Both merges use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match is dropped from the result.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(transactions_with_customers, products, on='ProductID')

# Show the merged data and its column names
print(merged_data.head())
print(merged_data.columns)  # Check column names



# Build one profile row per customer from the merged transaction data, then
# attach the customer attributes. Named aggregation yields the flat output
# column names directly, so no MultiIndex flattening step is needed.
# 'Price_x' is the suffix pandas gives the left-hand column when both merged
# frames carry a non-key column of the same name -- presumably the price
# recorded on the transaction; verify against Transactions.csv.
customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),
        AvgOrderValue=('TotalValue', 'mean'),
        NumTransactions=('TotalValue', 'count'),
        AvgPrice=('Price_x', 'mean'),
        TotalQuantity=('Quantity', 'sum'),
    )
    .reset_index()
    .merge(customers, on='CustomerID')
)

# Preview the resulting customer profiles
print(customer_profiles.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       