**Task 2: Lookalike Model**

In [14]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [15]:
# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Inspect the datasets
print("Customers Data:")
print(customers.head())
print("\nProducts Data:")
print(products.head())
print("\nTransactions Data:")
print(transactions.head())


Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127 

In [16]:
# Merge the data to create a unified dataset
merged_data = pd.merge(transactions, customers, on='CustomerID', how='left')
merged_data = pd.merge(merged_data, products, on='ProductID', how='left')

In [17]:
# Check the merged data columns to identify the correct Price column
print("\nMerged Data Columns:")
print(merged_data.columns)


Merged Data Columns:
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [18]:
# Use 'Price_y' as it corresponds to the product price in the Products dataset
merged_data['Price'] = merged_data['Price_y']

In [19]:
# Drop unnecessary columns like 'Price_x', 'Price_y' (from both Transactions and Products)
merged_data = merged_data.drop(['Price_x', 'Price_y'], axis=1)


In [20]:
# Aggregate the data to create customer profiles
customer_features = merged_data.groupby('CustomerID').agg({
    'Price': ['sum', 'mean'],
    'Quantity': 'sum',
    'Category': 'nunique',
})

In [21]:
# Flatten the multi-level column headers
customer_features.columns = ['_'.join(col).strip() for col in customer_features.columns.values]


In [22]:
# Add demographic information (region, signup date)
customer_demo = customers[['CustomerID', 'Region', 'SignupDate']]
customer_features = customer_features.merge(customer_demo, on='CustomerID')

In [23]:
# Calculate the time since signup (in days)
customer_features['SignupDays'] = (pd.to_datetime('today') - pd.to_datetime(customer_features['SignupDate'])).dt.days

In [24]:
# Normalize the features (excluding CustomerID)
features_to_scale = ['Price_sum', 'Quantity_sum', 'Category_nunique', 'SignupDays']
scaler = StandardScaler()
customer_features[features_to_scale] = scaler.fit_transform(customer_features[features_to_scale])


In [25]:
# Check the final customer feature set
print("\nCustomer Features After Normalization:")
print(customer_features.head())


Customer Features After Normalization:
  CustomerID  Price_sum  Price_mean  Quantity_sum  Category_nunique  \
0      C0001   0.033326  278.334000     -0.122033          0.160540   
1      C0002  -0.806919  208.920000     -0.448000         -0.904377   
2      C0003  -0.886789  195.707500      0.203934          0.160540   
3      C0004   0.839461  240.636250      1.670787          0.160540   
4      C0005  -0.747783  291.603333     -0.936951         -0.904377   

          Region  SignupDate  SignupDays  
0  South America  2022-07-10    1.148752  
1           Asia  2022-02-13    1.600431  
2  South America  2024-03-07   -0.713270  
3  South America  2022-10-09    0.869141  
4           Asia  2022-08-15    1.038137  


In [26]:
# Calculate cosine similarity between customers based on their profiles
customer_profile = customer_features[['Price_sum', 'Quantity_sum', 'Category_nunique', 'SignupDays']]

In [27]:
# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(customer_profile)

In [28]:
# Convert the similarity matrix into a DataFrame for easy manipulation
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features.index, columns=customer_features.index)

In [33]:
# Get the top 3 lookalikes for the first 20 customers
lookalikes = {}
for customer_id in customer_features.index[:20]:
    # Sort the similarity scores in descending order, excluding the customer itself
    similar_customers = similarity_df[customer_id].drop(customer_id).sort_values(ascending=False).head(3)

    # Store the top 3 lookalikes and their similarity scores
    lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))


In [35]:
lookalike_data = []

In [36]:
# Iterate through the lookalikes dictionary and create the proper structure
for customer_id, similar_customers in lookalikes.items():
    # Unzip customer IDs and scores from the list of tuples
    lookalike_ids, lookalike_scores = zip(*similar_customers)

    # Create a row for each customer with their top 3 lookalikes and similarity scores
    lookalike_data.append([customer_id] + list(lookalike_ids) + list(lookalike_scores))

In [37]:
# Define the columns for the DataFrame
columns = ['CustomerID', 'Lookalike_1', 'Lookalike_2', 'Lookalike_3', 'Score_1', 'Score_2', 'Score_3']

In [38]:
# Create the DataFrame
lookalike_df = pd.DataFrame(lookalike_data, columns=columns)

In [39]:
# Save the result to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

In [55]:
from google.colab import files

# Download the Lookalike.csv file to your local system
files.download('Lookalike.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [40]:
# Check the first few rows of the result
print("\nTop 3 Lookalikes for First 20 Customers:")
print(lookalike_df.head())


Top 3 Lookalikes for First 20 Customers:
   CustomerID  Lookalike_1  Lookalike_2  Lookalike_3   Score_1   Score_2  \
0           0           78          151           10  0.998197  0.977191   
1           1          175           69           72  0.969665  0.967985   
2           2           29          110           30  0.941011  0.884952   
3           3          164          147          108  0.980833  0.973796   
4           4          130          158           82  0.993211  0.989701   

    Score_3  
0  0.967550  
1  0.966898  
2  0.817041  
3  0.964495  
4  0.987434  
