In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [3]:
# Load datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Merge datasets
# Each transaction should match at most one customer row and one product row.
# validate='many_to_one' makes pandas raise MergeError if CustomerID / ProductID
# is duplicated in the lookup table, instead of silently duplicating transaction
# rows and inflating any per-customer Quantity / TotalValue totals.
# NOTE: both transactions and products carry a 'Price' column, so the merged
# frame holds them as 'Price_x' (transactions) and 'Price_y' (products).
merged = (
    transactions
    .merge(customers, on='CustomerID', validate='many_to_one')
    .merge(products, on='ProductID', validate='many_to_one')
)


In [5]:
# Show the column labels of the transaction and product tables, one Index per line
print(transactions.columns, products.columns, sep='\n')


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')


In [7]:
# Collapse the merged transactions into one feature row per customer
customer_features = (
    merged
    .groupby('CustomerID')
    .agg(
        Region=('Region', 'first'),        # Region taken from the customer's first row
        Quantity=('Quantity', 'sum'),      # Total quantity purchased
        TotalValue=('TotalValue', 'sum'),  # Total transaction value
    )
    .reset_index()
)


In [8]:
# One-hot encode Region; drop_first=True leaves out the first category's indicator column
customer_features = customer_features.pipe(
    pd.get_dummies,
    columns=['Region'],
    drop_first=True,
)

In [9]:
# Scale every feature column; CustomerID is an identifier, not a feature, so it is left out
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns='CustomerID'))

# Pairwise cosine similarity between the customers' scaled feature vectors
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so scores can be looked up by customer
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)


In [10]:
# Function to get top 3 similar customers
def get_top_similar(customers_df, customer_id, top_n=3):
    """Return the `top_n` customers most similar to `customer_id`.

    Args:
        customers_df: Similarity DataFrame indexed by customer ID, with one
            column of scores per customer ID.
        customer_id: Column label of the customer to find lookalikes for.
        top_n: Number of lookalikes to return (default 3).

    Returns:
        List of `(customer_id, score)` tuples ordered from most to least
        similar; each score is a built-in float rounded to 3 decimals.

    Raises:
        KeyError: If `customer_id` is not a column of `customers_df`.
    """
    scores = customers_df[customer_id]
    # Remove the customer by label rather than skipping the first sorted row:
    # when another customer ties at (or, through float round-off, edges past)
    # the self-similarity, the customer is not guaranteed to sort first and
    # would otherwise be reported as its own lookalike.
    others = scores.drop(labels=customer_id, errors='ignore')
    top = others.sort_values(ascending=False).head(max(top_n, 0))
    # float(...) yields plain Python floats, so str()/CSV output reads 0.996
    # instead of np.float64(0.996) under NumPy >= 2.
    return [(cust, round(float(score), 3)) for cust, score in top.items()]

# Generate recommendations for C0001-C0020
# Pick the targets by ID rather than by position: a positional [:20] slice
# silently shifts to other customers if any of C0001-C0020 has no feature row.
target_ids = [f'C{num:04d}' for num in range(1, 21)]
lookalike_map = {}
for customer_id in target_ids:
    if customer_id not in similarity_df.columns:
        continue  # no similarity scores exist for this customer
    # Cast scores to built-in floats so the stringified list written to the CSV
    # reads 0.996 rather than np.float64(0.996) (NumPy >= 2 scalar repr).
    lookalike_map[customer_id] = [
        (cust, float(score))
        for cust, score in get_top_similar(similarity_df, customer_id)
    ]

# Convert to a DataFrame (one row per target customer) and export it
lookalike_df = pd.DataFrame(
    [
        {'CustomerID': cust_id, 'Recommendations': recommendations}
        for cust_id, recommendations in lookalike_map.items()
    ],
    columns=['CustomerID', 'Recommendations'],  # keep the header even if no target was found
)
lookalike_df.to_csv('Manasa_Katika_Lookalike.csv', index=False)


In [12]:
# Function to display top 3 similar customers for a given CustomerID
def display_top_3_similar(similarity_df, customer_id):
    """Print the three customers most similar to `customer_id`.

    Args:
        similarity_df: Similarity DataFrame indexed by customer ID, with one
            column of scores per customer ID.
        customer_id: Column label of the customer to report on.

    Raises:
        KeyError: If `customer_id` is not a column of `similarity_df`.
    """
    # Drop the customer's own row by label before ranking. Skipping the first
    # sorted row instead is only correct when the self-similarity sorts
    # strictly first, which ties or float round-off can break.
    others = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')
    # Sort similarity scores in descending order and keep the best three
    similar_customers = others.sort_values(ascending=False).head(3)
    print(f"Top 3 similar customers to {customer_id}:")
    for cust, score in similar_customers.items():
        print(f"CustomerID: {cust}, Similarity Score: {round(score, 3)}")

# Example: print the three closest lookalikes of customer C0001
display_top_3_similar(similarity_df=similarity_df, customer_id='C0001')


Top 3 similar customers to C0001:
CustomerID: C0107, Similarity Score: 0.996
CustomerID: C0137, Similarity Score: 0.996
CustomerID: C0184, Similarity Score: 0.996
