# Task 2: Lookalike Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Load the customer table and the transaction log from the working directory.
df_customers = pd.read_csv("Customers.csv")
df_transactions = pd.read_csv("Transactions.csv")

# Strip surrounding whitespace from the column names so the lookups below
# ("CustomerID", "TransactionID", amount_column) match exactly.
df_transactions.columns = df_transactions.columns.str.strip()
df_customers.columns = df_customers.columns.str.strip()

amount_column = "TotalValue"

# One row per customer: summed transaction value and number of transactions.
transaction_summary = (
    df_transactions.groupby("CustomerID")
    .agg(
        total_spent=(amount_column, "sum"),
        purchase_count=("TransactionID", "count"),
    )
    .reset_index()
)

# The left join keeps customers that have no transactions; their aggregates
# come back as NaN and are set to 0. Only the two aggregate columns are
# filled: a blanket fillna(0) would also write the integer 0 into any other
# column with missing values (e.g. a string-typed categorical column),
# silently corrupting it.
df_customers = df_customers.merge(
    transaction_summary, on="CustomerID", how="left"
).fillna({"total_spent": 0, "purchase_count": 0})

# --- Numeric features: standardise spend and purchase count. ---
numerical_features = ["total_spent", "purchase_count"]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_customers[numerical_features])

# --- Categorical features: one-hot encode "Segment" when the column exists. ---
categorical_features = ["Segment"] if "Segment" in df_customers.columns else []
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

if categorical_features:
    encoded_features = encoder.fit_transform(df_customers[categorical_features])
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_features),
    )
    # Scaled numeric columns first, one-hot columns after.
    final_data = np.hstack((scaled_features, encoded_features))
else:
    # No categorical column available: the model uses the numeric features only.
    encoded_features = np.array([])
    encoded_df = pd.DataFrame()
    final_data = scaled_features

# Row i of final_data / similarity_matrix corresponds to customer_ids[i].
customer_ids = df_customers["CustomerID"].tolist()

# Pairwise cosine similarity between every pair of customer feature vectors.
similarity_matrix = cosine_similarity(final_data)

# Maps each of the first 20 customer IDs to its 3 most similar other
# customers as a list of (customer_id, similarity_score) tuples, best first.
lookalike_results = {}

for idx, customer_id in enumerate(customer_ids[:20]):
    # Sort descending by similarity. A stable sort on the negated scores keeps
    # tied customers in their original row order, so the output is
    # deterministic from run to run.
    ranked_indices = np.argsort(-similarity_matrix[idx], kind="stable")

    # Drop the customer itself explicitly rather than assuming it sorts first:
    # other customers can tie with it at similarity 1.0, and an all-zero
    # feature vector has self-similarity 0, so position 0 is not guaranteed.
    similar_indices = ranked_indices[ranked_indices != idx][:3]

    # Cast to a built-in float before rounding so the stringified list written
    # to the CSV holds plain numbers, not NumPy scalar reprs.
    similar_customers = [
        (customer_ids[i], round(float(similarity_matrix[idx, i]), 4))
        for i in similar_indices
    ]
    lookalike_results[customer_id] = similar_customers

# Flatten the results into one record per customer; the list of
# (customer_id, score) tuples is stored as its string representation.
lookalike_records = []
for cust_id, similar_list in lookalike_results.items():
    lookalike_records.append(
        {"CustomerID": cust_id, "Jai_Kumar_Lookalikes": str(similar_list)}
    )

lookalike_df = pd.DataFrame(lookalike_records)

# Persist the lookalike table without the DataFrame index column.
lookalike_df.to_csv("Jai_Kumar_Lookalike.csv", index=False)

# Console report: each customer followed by its lookalikes and their scores.
print("Top 3 Jai_Kumar_Lookalike for the first 20 customers:")
first_entries = list(lookalike_results.items())[:20]
for customer, matches in first_entries:
    print(f"Customer: {customer}")
    for match_id, match_score in matches:
        print(f"\tJai_Kumar_Lookalike: {match_id}, Similarity Score: {match_score}")

print("\nJai_Kumar_Lookalike.csv has been successfully generated!")

# Preview the first five rows of the table that was written to disk.
print("\nExample Jai_Kumar_Lookalike.csv content (first 5 customers):")
sample_output = lookalike_df.head(5)
print(sample_output)


Top 3 Jai_Kumar_Lookalike for the first 20 customers:
Customer: C0001
	Jai_Kumar_Lookalike: C0076, Similarity Score: 1.0
	Jai_Kumar_Lookalike: C0152, Similarity Score: 1.0
	Jai_Kumar_Lookalike: C0164, Similarity Score: 1.0
Customer: C0002
	Jai_Kumar_Lookalike: C0029, Similarity Score: 0.9998
	Jai_Kumar_Lookalike: C0199, Similarity Score: 0.9995
	Jai_Kumar_Lookalike: C0010, Similarity Score: 0.9994
Customer: C0003
	Jai_Kumar_Lookalike: C0095, Similarity Score: 1.0
	Jai_Kumar_Lookalike: C0150, Similarity Score: 1.0
	Jai_Kumar_Lookalike: C0144, Similarity Score: 1.0
Customer: C0004
	Jai_Kumar_Lookalike: C0067, Similarity Score: 1.0
	Jai_Kumar_Lookalike: C0021, Similarity Score: 0.9999
	Jai_Kumar_Lookalike: C0075, Similarity Score: 0.9996
Customer: C0005
	Jai_Kumar_Lookalike: C0130, Similarity Score: 1.0
	Jai_Kumar_Lookalike: C0144, Similarity Score: 1.0
	Jai_Kumar_Lookalike: C0150, Similarity Score: 1.0
Customer: C0006
	Jai_Kumar_Lookalike: C0079, Similarity Score: 0.9999
	Jai_Kumar_Looka

### Jai Kumar