#### Lookalike

In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Load datasets
# The data directory is resolved in one place so the notebook can run on another
# machine: set the LOOKALIKE_DATA_DIR environment variable to point at the folder
# holding the three CSV files. The default is the original local location.
DATA_DIR = Path(os.environ.get('LOOKALIKE_DATA_DIR', '/Users/jaini/Downloads'))

customers = pd.read_csv(DATA_DIR / 'Customers.csv')
products = pd.read_csv(DATA_DIR / 'Products.csv')
transactions = pd.read_csv(DATA_DIR / 'Transactions.csv')

In [3]:
# Attach customer attributes to every transaction row.
# Default (inner) join on the shared CustomerID key.
transactions_customers = transactions.merge(customers, on='CustomerID')

In [4]:
# Show the first five merged rows to sanity-check the join
transactions_customers.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04


In [5]:
# Add product attributes to the transaction/customer rows.
# Both inputs carry a Price column, so pandas suffixes them as Price_x / Price_y.
full_data = transactions_customers.merge(products, on='ProductID')

preview = full_data.head()
print(preview)

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [6]:
# Check for Price_x and Price_y
# The products merge left two price columns: Price_x from the transaction side and
# Price_y from the product side. Guard on their presence so that re-running this
# cell after the rename has already happened does not raise a KeyError.
price_cols = ['Price_x', 'Price_y']
if set(price_cols).issubset(full_data.columns):
    print(full_data[price_cols].head())  # Preview to confirm they are the same

    # The preview only shows five rows, so compare every row before discarding a column.
    # NaN never compares equal to NaN, so rows with missing prices count as mismatches too.
    mismatch_count = int((full_data['Price_x'] != full_data['Price_y']).sum())
    if mismatch_count:
        print(f'Warning: Price_x and Price_y differ in {mismatch_count} rows; keeping Price_x')

    # Drop one and rename the other
    full_data = full_data.drop(columns=['Price_y']).rename(columns={'Price_x': 'Price'})

# Verify the changes
print(full_data.head())

   Price_x  Price_y
0   300.68   300.68
1   300.68   300.68
2   300.68   300.68
3   300.68   300.68
4   300.68   300.68
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price     CustomerName         Region  SignupDate  \
0      300.68  300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68  300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68  300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36  300.68  Travis Campbell  South America  2024-04-11   
4      902.04  300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName    

In [7]:
# Build one feature row per customer.
def _most_frequent(values):
    """Return the first entry of ``values.mode()`` for one customer's rows."""
    return values.mode()[0]


# Output column -> reducer applied to that column within each CustomerID group
aggregation_spec = {
    'TotalValue': 'sum',         # total spending
    'Quantity': 'sum',           # total quantity purchased
    'Price': 'mean',             # average product price
    'Category': _most_frequent,  # most common product category
    'Region': _most_frequent,    # most common region
}
per_customer = full_data.groupby('CustomerID').agg(aggregation_spec)

# One-hot encode the two categorical columns; drop_first removes one level per column
customer_features = pd.get_dummies(
    per_customer.reset_index(),
    columns=['Category', 'Region'],
    drop_first=True,
)

print(customer_features.head())

  CustomerID  TotalValue  Quantity       Price  Category_Clothing  \
0      C0001     3354.52        12  278.334000                  0   
1      C0002     1862.74        10  208.920000                  1   
2      C0003     2725.38        14  195.707500                  0   
3      C0004     5354.88        23  240.636250                  0   
4      C0005     2034.24         7  291.603333                  0   

   Category_Electronics  Category_Home Decor  Region_Europe  \
0                     1                    0              0   
1                     0                    0              0   
2                     0                    1              0   
3                     0                    0              0   
4                     1                    0              0   

   Region_North America  Region_South America  
0                     0                     1  
1                     0                     0  
2                     0                     1  
3             

#### Compute Customer Similarity

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (everything after the leading CustomerID column)
feature_matrix = customer_features.iloc[:, 1:]
scaler = StandardScaler().fit(feature_matrix)
features_scaled = scaler.transform(feature_matrix)

# Show the first five scaled rows
print(features_scaled[:5])

[[-0.06170143 -0.12203296  0.09467022 -0.54056248  1.84992492 -0.51721942
  -0.57928445 -0.54831888  1.54041597]
 [-0.87774353 -0.44800021 -0.90401592  1.84992492 -0.54056248 -0.51721942
  -0.57928445 -0.54831888 -0.6491753 ]
 [-0.40585722  0.20393428 -1.09410928 -0.54056248 -0.54056248  1.93341543
  -0.57928445 -0.54831888  1.54041597]
 [ 1.03254704  1.67078689 -0.44770193 -0.54056248 -0.54056248 -0.51721942
  -0.57928445 -0.54831888  1.54041597]
 [-0.78392861 -0.93695108  0.28558127 -0.54056248  1.84992492 -0.51721942
  -0.57928445 -0.54831888 -0.6491753 ]]


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all customers' scaled feature vectors
similarity_matrix = cosine_similarity(features_scaled)

# Label both axes with CustomerID so scores can be looked up by customer
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.299452  0.152988  0.302983  0.558908  0.454814   
C0002      -0.299452  1.000000 -0.105868 -0.279348  0.026018 -0.331578   
C0003       0.152988 -0.105868  1.000000  0.361307 -0.296941  0.224052   
C0004       0.302983 -0.279348  0.361307  1.000000 -0.456930  0.599778   
C0005       0.558908  0.026018 -0.296941 -0.456930  1.000000 -0.136302   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.543651 -0.422514 -0.334661 -0.391361  ...  0.495132  0.941914   
C0002      -0.131731 -0.278346  0.577223  0.692261  ... -0.114244 -0.169091   
C0003      -0.413286  0.353550 -0.345491 -0.117472  ...  0.414297  0.144901   
C0004      -0.412542  0.067459 -0.649571 -0.268918  ...  0.538855  0.049054   
C0005  

#### Recommend Similar Customers

In [10]:
def recommend_customers(customer_id, top_n=3, similarity=None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to find lookalikes for. It must be present in
        both the columns and the index of the similarity table.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Square similarity table labelled by customer ID on both axes.
        Defaults to the notebook-level ``similarity_df``.

    Returns
    -------
    list of (customer_id, score) tuples
        Sorted by descending similarity, with the customer itself excluded.
        Scores are plain Python floats so that ``str()``/``repr()`` of the
        result does not depend on how the installed NumPy prints its scalars.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity table.
    """
    if similarity is None:
        # Fall back to the table built earlier in the notebook
        similarity = similarity_df

    if customer_id not in similarity.columns:
        raise KeyError(f'Unknown customer ID: {customer_id!r}')

    # Get similarity scores for the given customer
    scores = similarity[customer_id]

    # Exclude the customer itself and sort by similarity
    ranked = scores.drop(customer_id).sort_values(ascending=False).head(top_n)

    # Return as a list of tuples (CustomerID, Similarity Score)
    return [(other_id, float(score)) for other_id, score in ranked.items()]

In [11]:
# Build lookalike lists for the first 20 customers only
target_ids = customer_features['CustomerID'].iloc[:20]
lookalike_recommendations = dict(
    (cid, recommend_customers(cid)) for cid in target_ids
)

# One record per customer: its ID plus the list of (CustomerID, score) tuples
records = [
    {'CustomerID': cid, 'SimilarCustomers': matches}
    for cid, matches in lookalike_recommendations.items()
]
lookalike_df = pd.DataFrame(records)

print(lookalike_df.head())

  CustomerID                                   SimilarCustomers
0      C0001  [(C0181, 0.9806488873652772), (C0120, 0.963992...
1      C0002  [(C0088, 0.9917448788793644), (C0106, 0.943550...
2      C0003  [(C0031, 0.948158131730056), (C0052, 0.9424308...
3      C0004  [(C0165, 0.9663279632163544), (C0169, 0.942305...
4      C0005  [(C0140, 0.9934347203051204), (C0186, 0.988532...


#### Saving output to a CSV

In [12]:
import csv

# Save recommendations to Lookalike.csv
# newline='' hands line-ending control to the csv module; without it, platforms that
# translate '\n' on write (Windows) end up with a blank line after every row.
# The encoding is pinned so the file does not depend on the machine's locale.
with open('Jaini_John_Lookalike.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['cust_id', 'similar_customers'])
    for customer, recommendations in lookalike_recommendations.items():
        # The list is written via str(); cast scores to plain floats so the text is
        # "0.98..." regardless of how the installed NumPy prints its scalar types.
        plain = [(other_id, float(score)) for other_id, score in recommendations]
        writer.writerow([customer, plain])