In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
sns.set() 
from sklearn.model_selection import cross_val_score

In [3]:
ecommerce_data_path = '/kaggle/input/e-commerece-sales-data-2023-24/E-commerece sales data 2024.csv'
customer_details_path = '/kaggle/input/e-commerece-sales-data-2023-24/customer_details.csv'
product_details_path = '/kaggle/input/e-commerece-sales-data-2023-24/product_details.csv'

ecommerce_data = pd.read_csv(ecommerce_data_path)
customer_details = pd.read_csv(customer_details_path)
product_details = pd.read_csv(product_details_path)
print("E-commerce Sales Data:")
print(ecommerce_data.head())
print("\nCustomer Details:")
print(customer_details.head())
print("\nProduct Details:")
print(product_details.head())


E-commerce Sales Data:
   user id                        product id Interaction type  \
0      1.0  4c69b61db1fc16e7013b43fc926e502d         purchase   
1      2.0  66d49bbed043f5be260fa9f7fbff5957             view   
2      3.0  2c55cae269aebf53838484b0d7dd931a             like   
3      4.0  18018b6bc416dab347b1b7db79994afa             view   
4      5.0  e04b990e95bf73bbe6a3fa09785d7cd0             like   

        Time stamp  Unnamed: 4  
0  10/10/2023 8:00         NaN  
1  11/10/2023 8:00         NaN  
2  12/10/2023 8:00         NaN  
3  13/10/2023 8:00         NaN  
4  14/10/2023 8:00         NaN  

Customer Details:
   Customer ID  Age Gender Item Purchased  Category  Purchase Amount (USD)  \
0            1   55   Male         Blouse  Clothing                     53   
1            2   19   Male        Sweater  Clothing                     64   
2            3   50   Male          Jeans  Clothing                     73   
3            4   21   Male        Sandals  Footwear      

**Data Preprocessing**

In [4]:
# Check for missing values in each dataset
print(ecommerce_data.isnull().sum())
print(customer_details.isnull().sum())
print(product_details.isnull().sum())

# Dropping the 'Unnamed: 4' column from ecommerce_data as it appears to be fully NaN
ecommerce_data = ecommerce_data.drop(columns=['Unnamed: 4'])

# Fill missing values or drop rows/columns based on your decision
# Example: Filling missing 'Age' with median values in customer_details
customer_details['Age'] = customer_details['Age'].fillna(customer_details['Age'].median())


user id              295
product id           295
Interaction type     423
Time stamp           295
Unnamed: 4          3294
dtype: int64
Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64
Uniqe Id                     0
Product Name                 0
Brand Name               10002
Asin                     10002
Category                   830
Upc Ean Code              9968
List Price               10002
Selling Price              107
Quantity                 10002
Model Number              1772
About Product              273
Prod

In [13]:
def convert_price(price):
    if isinstance(price, str):
        # Remove dollar sign and extra spaces
        price = price.replace('$', '').replace(' ', '')

        # Handle ranges by taking the average of the low and high values, even if multiple dashes are present
        if '-' in price:
            parts = price.split('-')
            try:
                # Attempt to parse the first and last parts as the low and high values
                low = float(parts[0])
                high = float(parts[-1])
                return (low + high) / 2
            except ValueError:
                # If conversion fails, default to NaN
                return np.nan
        else:
            try:
                return float(price)
            except ValueError:
                return np.nan
    else:
        # If the input is not a string (likely NaN or already a number), return it directly
        return price

# Apply the function to the 'Selling Price' column
product_details['Selling Price'] = product_details['Selling Price'].apply(convert_price)

# Filling missing 'Selling Price' with the median value, calculating median on non-NaN values
median_price = product_details['Selling Price'].dropna().median()
product_details['Selling Price'] = product_details['Selling Price'].fillna(median_price)

# Verify the changes
print(product_details['Selling Price'].head())


0    237.68
1     99.95
2     34.99
3     28.91
4     17.49
Name: Selling Price, dtype: float64


In [8]:
# Dropping rows with missing 'user id', 'product id', or 'Time stamp'
ecommerce_data.dropna(subset=['user id', 'product id', 'Time stamp'], inplace=True)

# Handling missing 'Interaction type' by labeling them as 'unknown'
ecommerce_data['Interaction type'] = ecommerce_data['Interaction type'].fillna('unknown')


**Merging dataset**

In [15]:
# Merge ecommerce_data with customer_details on user id, adding suffixes to distinguish duplicate column names
merged_data = ecommerce_data.merge(
    customer_details, 
    left_on='user id', 
    right_on='Customer ID', 
    how='left', 
    suffixes=('_ecomm', '_cust')
)

# Print the first few rows to check the merged data
print(merged_data.head())


   user id                        product id Interaction type  \
0      1.0  4c69b61db1fc16e7013b43fc926e502d         purchase   
1      2.0  66d49bbed043f5be260fa9f7fbff5957             view   
2      3.0  2c55cae269aebf53838484b0d7dd931a             like   
3      4.0  18018b6bc416dab347b1b7db79994afa             view   
4      5.0  e04b990e95bf73bbe6a3fa09785d7cd0             like   

        Time stamp  Customer ID_x  Age_x Gender_x Item Purchased_x Category_x  \
0  10/10/2023 8:00            1.0   55.0     Male           Blouse   Clothing   
1  11/10/2023 8:00            2.0   19.0     Male          Sweater   Clothing   
2  12/10/2023 8:00            3.0   50.0     Male            Jeans   Clothing   
3  13/10/2023 8:00            4.0   21.0     Male          Sandals   Footwear   
4  14/10/2023 8:00            5.0   45.0     Male           Blouse   Clothing   

   Purchase Amount (USD)_x  ... Color_cust  Season Review Rating  \
0                     53.0  ...       Gray  Winter    

In [16]:
# Merge the resulting data with product_details on product id
final_data = merged_data.merge(product_details, left_on='product id', right_on='Uniqe Id', how='left')


**Feature engineering**

In [18]:
# Convert 'Time stamp' to datetime format specifying the day-first format
final_data['Time stamp'] = pd.to_datetime(final_data['Time stamp'], dayfirst=True)

# Extract day of the week and hour of the day
final_data['day_of_week'] = final_data['Time stamp'].dt.day_name()
final_data['hour_of_day'] = final_data['Time stamp'].dt.hour

# Display the new columns to confirm changes
print(final_data[['Time stamp', 'day_of_week', 'hour_of_day']].head())


           Time stamp day_of_week  hour_of_day
0 2023-10-10 08:00:00     Tuesday            8
1 2023-10-11 08:00:00   Wednesday            8
2 2023-10-12 08:00:00    Thursday            8
3 2023-10-13 08:00:00      Friday            8
4 2023-10-14 08:00:00    Saturday            8


**Encode Categorical Variables**

In [19]:
# Perform one-hot encoding on 'Interaction type' and 'day_of_week'
final_data_encoded = pd.get_dummies(final_data, columns=['Interaction type', 'day_of_week'])

# Display the first few rows to see the transformed dataset
print(final_data_encoded.head())


   user id                        product id          Time stamp  \
0      1.0  4c69b61db1fc16e7013b43fc926e502d 2023-10-10 08:00:00   
1      2.0  66d49bbed043f5be260fa9f7fbff5957 2023-10-11 08:00:00   
2      3.0  2c55cae269aebf53838484b0d7dd931a 2023-10-12 08:00:00   
3      4.0  18018b6bc416dab347b1b7db79994afa 2023-10-13 08:00:00   
4      5.0  e04b990e95bf73bbe6a3fa09785d7cd0 2023-10-14 08:00:00   

   Customer ID_x  Age_x Gender_x Item Purchased_x Category_x  \
0            1.0   55.0     Male           Blouse   Clothing   
1            2.0   19.0     Male          Sweater   Clothing   
2            3.0   50.0     Male            Jeans   Clothing   
3            4.0   21.0     Male          Sandals   Footwear   
4            5.0   45.0     Male           Blouse   Clothing   

   Purchase Amount (USD)_x     Location_x  ... Interaction type_purchase  \
0                     53.0       Kentucky  ...                      True   
1                     64.0          Maine  ...        

**Create Interaction Features**

In [20]:
# Create a new interaction feature by multiplying 'hour_of_day' and 'Purchase Amount (USD)'
final_data_encoded['hour_x_purchase_amount'] = final_data_encoded['hour_of_day'] * final_data_encoded['Purchase Amount (USD)']

# Display the first few rows to check the new feature
print(final_data_encoded[['hour_of_day', 'Purchase Amount (USD)', 'hour_x_purchase_amount']].head())


   hour_of_day  Purchase Amount (USD)  hour_x_purchase_amount
0            8                     53                     424
1            8                     64                     512
2            8                     73                     584
3            8                     90                     720
4            8                     49                     392


**Splitting Data into Training and Testing Sets**

In [22]:
from sklearn.model_selection import train_test_split

# Set the target as 'Interaction type_purchase' for predicting purchases
X = final_data_encoded.drop('Interaction type_purchase', axis=1)  # Features
y = final_data_encoded['Interaction type_purchase']  # Target variable

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shapes of the resulting splits
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)


Training features shape: (2099, 112)
Testing features shape: (900, 112)
Training labels shape: (2099,)
Testing labels shape: (900,)


**Model Training**

In [24]:
# List all non-numeric columns in the DataFrame
non_numeric_columns = X_train.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)


Non-numeric columns: Index(['product id', 'Gender_x', 'Item Purchased_x', 'Category_x',
       'Location_x', 'Size_x', 'Color_x', 'Season_x', 'Subscription Status_x',
       'Shipping Type_x', 'Discount Applied_x', 'Promo Code Used_x',
       'Payment Method_x', 'Frequency of Purchases_x', 'Uniqe Id_x',
       'Product Name_x', 'Category_y', 'Upc Ean Code_x', 'Selling Price_x',
       'Model Number_x', 'About Product_x', 'Product Specification_x',
       'Technical Details_x', 'Shipping Weight_x', 'Product Dimensions_x',
       'Image_x', 'Variants_x', 'Product Url_x', 'Is Amazon Seller_x',
       'Gender_y', 'Item Purchased_y', 'Category_ecomm', 'Location_y',
       'Size_y', 'Color_ecomm', 'Season_y', 'Subscription Status_y',
       'Shipping Type_y', 'Discount Applied_y', 'Promo Code Used_y',
       'Payment Method_y', 'Frequency of Purchases_y', 'Gender',
       'Item Purchased', 'Category_cust', 'Location', 'Size', 'Color_cust',
       'Season', 'Subscription Status', 'Shipping Ty

In [29]:
# Drop columns where all values are NaN in the training set
X_train_numeric = X_train_numeric.dropna(axis=1, how='all')
X_test_numeric = X_test_numeric.dropna(axis=1, how='all')

# Now, fill any remaining NaNs with the median of each column
X_train_numeric.fillna(X_train_numeric.median(), inplace=True)
X_test_numeric.fillna(X_train_numeric.median(), inplace=True)

# Check again to ensure no NaN values remain
print("NaNs in Training Data after dropping and filling:", X_train_numeric.isnull().sum().sum())
print("NaNs in Testing Data after dropping and filling:", X_test_numeric.isnull().sum().sum())

# Assuming no NaNs, retry training the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_numeric, y_train)
y_pred = model.predict(X_test_numeric)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_numeric.fillna(X_train_numeric.median(), inplace=True)


NaNs in Training Data after dropping and filling: 0
NaNs in Testing Data after dropping and filling: 0
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

       False       1.00      1.00      1.00       633
        True       1.00      1.00      1.00       267

    accuracy                           1.00       900
   macro avg       1.00      1.00      1.00       900
weighted avg       1.00      1.00      1.00       900



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Cross Validation**

In [30]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X_train_numeric, y_train, cv=5)

# Print the accuracy for each fold
print("Accuracy scores for each fold:", cv_scores)

# Print mean and standard deviation of scores
print("Mean cross-validation score: %.2f" % cv_scores.mean())
print("Standard deviation of cross-validation scores: %.2f" % cv_scores.std())


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy scores for each fold: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.00
Standard deviation of cross-validation scores: 0.00


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
from joblib import dump

# Save the model to a file
model_filename = 'logistic_regression_model.joblib'
dump(model, model_filename)

print(f"Model saved to {model_filename}")


Model saved to logistic_regression_model.joblib
