In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
customers = pd.read_csv("/content/drive/MyDrive/Customers.csv")
products = pd.read_csv("/content/drive/MyDrive/Products.csv")
transactions = pd.read_csv("/content/drive/MyDrive/Transactions.csv")

# Merge datasets
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# Create a pivot table for customer-product purchase quantity
customer_product_matrix = pd.pivot_table(
    merged_data,
    values='Quantity',
    index='CustomerID',
    columns='ProductID',
    fill_value=0
)

# Compute cosine similarity between customers
similarity_matrix = cosine_similarity(customer_product_matrix)

# Create a similarity DataFrame
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index
)

# Generate Lookalike recommendations for the first 20 customers
lookalikes = {}
for customer_id in customers['CustomerID'][:20]:  # First 20 customers
    if customer_id in similarity_df.index:
        # Get the top 3 similar customers (excluding the customer themselves)
        top_similar = similarity_df[customer_id].sort_values(ascending=False)[1:4]
        # Create a list of tuples (customer_id, similarity_score)
        lookalikes[customer_id] = list(zip(top_similar.index, top_similar.values))

# Save the results in the specified format
lookalike_df = pd.DataFrame.from_dict(lookalikes, orient='index')

# Save to a CSV file, where each row is a customer with their lookalike list and similarity scores
lookalike_df.to_csv("Lookalike.csv", header=False, index_label="CustomerID")

print("Lookalike recommendations with similarity scores have been saved to Lookalike.csv")


Lookalike recommendations with similarity scores have been saved to Lookalike.csv


In [19]:
import pandas as pd

# Load datasets
customers = pd.read_csv("/content/drive/MyDrive/Customers.csv")
products = pd.read_csv("/content/drive/MyDrive/Products.csv")
transactions = pd.read_csv("/content/drive/MyDrive/Transactions.csv")

# Merge datasets to create a comprehensive view
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# Display the first few rows of the merged data
print(merged_data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [20]:
# Create customer profiles
customer_profiles = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Quantity': 'sum',
    'Region': 'first',  # Assuming Region is the same for a customer
    'ProductID': lambda x: list(set(x))  # Unique products purchased
}).reset_index()

# Rename columns for clarity
customer_profiles.rename(columns={
    'TotalValue': 'TotalSpent',
    'TransactionID': 'TransactionCount',
    'Quantity': 'TotalQuantity',
    'ProductID': 'UniqueProducts'
}, inplace=True)

# Display the customer profiles
print(customer_profiles.head())

  CustomerID  TotalSpent  TransactionCount  TotalQuantity         Region  \
0      C0001     3354.52                 5             12  South America   
1      C0002     1862.74                 4             10           Asia   
2      C0003     2725.38                 4             14  South America   
3      C0004     5354.88                 8             23  South America   
4      C0005     2034.24                 3              7           Asia   

                                     UniqueProducts  
0                    [P083, P029, P022, P096, P054]  
1                          [P071, P019, P095, P004]  
2                          [P035, P025, P002, P006]  
3  [P049, P038, P025, P024, P077, P097, P053, P008]  
4                                [P025, P039, P012]  


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Select relevant features for similarity calculation
features = customer_profiles[['TotalSpent', 'TransactionCount', 'TotalQuantity']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Calculate cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

# Convert similarity matrix to DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles['CustomerID'], columns=customer_profiles['CustomerID'])

In [22]:
# Function to get lookalikes
def get_lookalikes(customer_id, top_n=3):
    # Get the similarity scores for the given customer
    scores = similarity_df[customer_id].sort_values(ascending=False)

    # Exclude the customer themselves and get the top N lookalikes
    lookalikes = scores[scores.index != customer_id].head(top_n)

    return lookalikes

# Generate lookalikes for the first 20 customers
lookalike_results = {}
for customer_id in customer_profiles['CustomerID'][:20]:
    lookalikes = get_lookalikes(customer_id)
    lookalike_results[customer_id] = list(zip(lookalikes.index, lookalikes.values))

# Convert results to DataFrame for saving
lookalike_list = []
for cust_id, lookalikes in lookalike_results.items():
    for lookalike_id, score in lookalikes:
        lookalike_list.append({'CustomerID': cust_id, 'LookalikeID': lookalike_id, 'SimilarityScore': score})

lookalike_df = pd.DataFrame(lookalike_list)

# Display the lookalike results
print(lookalike_df)

   CustomerID LookalikeID  SimilarityScore
0       C0001       C0164         0.997598
1       C0001       C0103         0.995394
2       C0001       C0069         0.986073
3       C0002       C0029         0.999754
4       C0002       C0031         0.998986
5       C0002       C0077         0.994313
6       C0003       C0176         0.902950
7       C0003       C0027         0.875121
8       C0003       C0010         0.832965
9       C0004       C0075         0.997789
10      C0004       C0165         0.994442
11      C0004       C0113         0.993976
12      C0005       C0123         0.999781
13      C0005       C0131         0.999628
14      C0005       C0058         0.999561
15      C0006       C0079         0.999882
16      C0006       C0117         0.989525
17      C0006       C0196         0.945252
18      C0007       C0125         0.998032
19      C0007       C0140         0.997960
20      C0007       C0092         0.997904
21      C0008       C0179         0.998199
22      C00

In [23]:
# Save the lookalike results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

In [24]:
from google.colab import files

# Download the CSV file
files.download('Lookalike.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>