In [2]:
import pandas as pd

In [3]:
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")

In [4]:
print(customers_df.head())
print(products_df.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31


In [5]:
# preprocessing

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [7]:
encoder = OneHotEncoder(sparse_output=False)

In [8]:
region_encoded = encoder.fit_transform(customers_df[["Region"]])

In [9]:
region_encoded_df = pd.DataFrame(region_encoded, columns=encoder.get_feature_names_out(["Region"]))
customers_preprocessed = pd.concat([customers_df, region_encoded_df], axis=1)

In [10]:
##convering signup dates to numeric days 
customers_preprocessed["SignupDate"] = pd.to_datetime(customers_preprocessed["SignupDate"])
customers_preprocessed["SignupDate"] = (pd.Timestamp.now() - customers_preprocessed["SignupDate"]).dt.days
customer_features = customers_preprocessed.drop(columns = ["CustomerID","CustomerName","SignupDate","Region"])


In [11]:
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features)

In [12]:
similarity_matrix = cosine_similarity(customer_features_scaled)

In [13]:
first_20_ids = customers_df["CustomerID"][:20].tolist()  ##customer id from 1 to 20

In [14]:
lookalike_map = {}
for i, customer_id in enumerate(first_20_ids):
    similarities = list(enumerate(similarity_matrix[i]))
    sorted_similarities = sorted(
    [(customers_df.iloc[j]["CustomerID"], score) for j, score in similarities if j < len(customers_df) and j !=i],
    key = lambda x : x[1],
    reverse = True
    )
    lookalike_map[customer_id] = sorted_similarities[:3]   ## for top 3 similar customer

In [15]:
## format the result into a dataframe for CSV output
lookalike_data = []
for cust_id, similar_customers in lookalike_map.items():
    for similar_cust, score in similar_customers:
        lookalike_data.append({"CustomerID": cust_id, "LookalikeID": similar_cust, "Score": score})

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_csv_path = "Gokul_Malav_Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index= False)

lookalike_csv_path

'Gokul_Malav_Lookalike.csv'

In [19]:
lookalike_df.sample(5)

Unnamed: 0,CustomerID,LookalikeID,Score
44,C0015,C0016,1.0
56,C0019,C0014,1.0
33,C0012,C0001,1.0
37,C0013,C0003,1.0
45,C0016,C0194,1.0
