In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
def build_lookalike_model(customers_path, products_path, transactions_path):
    # Load datasets
    customers = pd.read_csv(r"Clean_Customers.csv")
    products = pd.read_csv(r"Clean_Products.csv")
    transactions = pd.read_csv(r"Clean_Transactions.csv")

    # Merge datasets
    merged = transactions.merge(customers, on='CustomerID', how='left').merge(products, on='ProductID', how='left')

    # Resolve duplicate Price columns
    if 'Price_x' in merged.columns and 'Price_y' in merged.columns:
        print("Resolving duplicate Price columns.")
        merged['Price'] = merged['Price_y']  # Use Price_y from Products.csv
        merged.drop(columns=['Price_x', 'Price_y'], inplace=True)

    # Ensure Price column exists after resolution
    if 'Price' not in merged.columns:
        raise KeyError("The 'Price' column is missing after merging. Check your datasets.")

    # Feature Engineering
    customer_profiles = merged.groupby('CustomerID').agg({
        'TotalValue': 'sum',  # Total spend
        'Quantity': 'sum',    # Total quantity purchased
        'Price': 'mean',      # Average price of products bought
        'Category': lambda x: ','.join(x.unique()),  # Product preferences
        'Region': 'first',    # Customer region
        'SignupDate': 'first' # Signup date
    }).reset_index()

    # Encode categorical features
    encoder = OneHotEncoder()
    encoded_region = encoder.fit_transform(customer_profiles[['Region']]).toarray()
    region_columns = [f"Region_{cat}" for cat in encoder.categories_[0]]
    encoded_region_df = pd.DataFrame(encoded_region, columns=region_columns)

    # Normalize numerical features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(customer_profiles[['TotalValue', 'Quantity', 'Price']])
    scaled_df = pd.DataFrame(scaled_features, columns=['Scaled_TotalValue', 'Scaled_Quantity', 'Scaled_Price'])

    # Combine all features
    customer_features = pd.concat([customer_profiles[['CustomerID']], scaled_df, encoded_region_df], axis=1)

    return customer_features, scaler, encoder


In [3]:
def recommend_similar(customers_features, scaler, encoder):
    # Take user input
    print("Enter Customer Details:")
    total_value = float(input("Total Value (Sum of purchases): "))
    quantity = int(input("Total Quantity: "))
    price = float(input("Average Price of Products: "))
    region = input("Region: ")

    # Preprocess input data
    input_data_scaled = scaler.transform([[total_value, quantity, price]])
    input_data_encoded = encoder.transform([[region]]).toarray()
    
    # Combine features
    input_features = np.concatenate([input_data_scaled, input_data_encoded], axis=1)

    # Compute similarity
    similarity_scores = cosine_similarity(customers_features.drop('CustomerID', axis=1), input_features)

    # Find top 3 similar customers
    top_indices = np.argsort(similarity_scores[:, 0])[-4:-1][::-1]  # Exclude the input customer
    recommendations = [(customers_features.iloc[i]['CustomerID'], similarity_scores[i, 0]) for i in top_indices]

    return recommendations

In [7]:
def save_lookalike_csv(customers_features, output_path="Lookalike.csv"):
    # Initialize an empty dictionary for storing lookalikes
    lookalike_map = {}

    # Iterate through each customer to find top 3 lookalikes
    for idx in range(len(customers_features)):
        cust_id = customers_features.iloc[idx]['CustomerID']
        input_features = customers_features.drop('CustomerID', axis=1).iloc[idx].values.reshape(1, -1)

        # Compute similarity scores
        similarity_scores = cosine_similarity(customers_features.drop('CustomerID', axis=1), input_features)

        # Find top 3 similar customers (excluding the customer itself)
        top_indices = np.argsort(similarity_scores[:, 0])[-4:-1][::-1]
        top_lookalikes = [
            (customers_features.iloc[i]['CustomerID'], round(similarity_scores[i, 0], 4))
            for i in top_indices
        ]

        # Store the results in the map
        lookalike_map[cust_id] = top_lookalikes

    # Convert the lookalike map into a DataFrame and save it as CSV
    lookalike_data = []
    for cust_id, lookalikes in lookalike_map.items():
        lookalike_data.append({
            "CustomerID": cust_id,
            "Lookalikes": str(lookalikes)  # Convert list to string for saving
        })

    lookalike_df = pd.DataFrame(lookalike_data)

    # Debug print to ensure data is correct
    print("\nSample Lookalikes DataFrame:")
    print(lookalike_df.head())

    # Save the DataFrame to CSV
    try:
        lookalike_df.to_csv(output_path, index=False)
        print(f"Lookalike.csv has been successfully saved to {output_path}.")
    except Exception as e:
        print(f"An error occurred while saving the file: {e}")


In [8]:
def evaluate_model(customers_features):
    # Simulate predictions (for demonstration purposes)
    y_true = customers_features.drop('CustomerID', axis=1).values
    y_pred = y_true + np.random.normal(0, 0.1, y_true.shape)  # Adding noise for evaluation simulation

    # Calculate evaluation metrics
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Print evaluation metrics
    print("\nModel Evaluation Metrics:")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared Score (R²): {r2:.4f}")

In [10]:
if __name__ == "__main__":
    # Paths to CSV files
    customers_path = r"Clean_Customers.csv"
    products_path = r"Clean_Products.csv"
    transactions_path = r"Clean_Transactions.csv"

    # Build the model
    customer_features, scaler, encoder = build_lookalike_model(customers_path, products_path, transactions_path)

    # Recommend similar customers
    recommendations = recommend_similar(customer_features, scaler, encoder)

    # Print recommendations
    print("\nTop 3 Lookalikes:")
    for customer_id, score in recommendations:
        print(f"CustomerID: {customer_id}, Similarity Score: {score:.4f}")

    # Evaluate the model
    evaluate_model(customer_features)

Resolving duplicate Price columns.
Enter Customer Details:


Total Value (Sum of purchases):  5000
Total Quantity:  50
Average Price of Products:  100
Region:  Asia



Top 3 Lookalikes:
CustomerID: C0136, Similarity Score: 0.8960
CustomerID: C0075, Similarity Score: 0.8785
CustomerID: C0084, Similarity Score: 0.8699

Model Evaluation Metrics:
Mean Squared Error (MSE): 0.0105
Mean Absolute Error (MAE): 0.0824
R-squared Score (R²): 0.9637


