In [87]:
# Importing basic libraries for data manipulation and analysis
import numpy as np                # For numerical operations
import pandas as pd # For data manipulation and analysis

# Importing libraries for data visualization
import matplotlib.pyplot as plt   # For basic plotting
import seaborn as sns             # For advanced and aesthetically pleasing visualizations

# Importing libraries for statistical analysis
import scipy.stats as stats       # For statistical tests and distributions
from statsmodels.tsa.stattools import adfuller  # For time series analysis (stationarity test)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf  # For autocorrelation and partial autocorrelation plots

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Notebook-wide display configuration.
# pandas: no cap on the number of columns rendered; row display limit set to 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# seaborn: apply the "whitegrid" style to subsequent plots.
sns.set(style="whitegrid")

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Warnings
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings to keep the output clean


# Training Models (Regression)

In [88]:
# Load the two pre-processed CSV exports:
#   customer_information_data.csv -> customer_info_df (one row of customer attributes per record)
#   sales_data.csv                -> df               (transaction / purchase-behaviour columns)
# Both files live in the same directory, so the location is written once.
# NOTE(review): this is an absolute, machine-specific path — point DATA_DIR at your own
# copy of the data before running the notebook elsewhere.
DATA_DIR = "/Users/Jamie/OneDrive/Documents/python_ws/project_delta/data/processed_data"

customer_info_df = pd.read_csv(f"{DATA_DIR}/customer_information_data.csv")
df = pd.read_csv(f"{DATA_DIR}/sales_data.csv")

# Preview the customer table (last expression -> rendered as the cell output).
customer_info_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,age,gender,income_bracket,loyalty_program,membership_years,churned,marital_status,number_of_children,education_level,occupation
0,0,1,56,Other,High,False,0,False,Divorced,3,Bachelor's,Self-Employed
1,1,2,69,Female,Medium,False,2,False,Married,2,PhD,Unemployed
2,2,3,46,Female,Low,False,5,False,Married,3,Bachelor's,Self-Employed
3,3,4,32,Female,Low,False,0,False,Divorced,2,Master's,Employed
4,4,5,60,Female,Low,True,7,True,Divorced,2,Bachelor's,Employed


In [89]:
# Attach the customer attributes to the sales rows as extra columns.
# pd.concat(axis=1) pairs rows by index label — it is not a key-based join — so the
# result is only meaningful when both frames share exactly the same index.
# NOTE(review): confirm that row i of both files really describes the same record.
#
# The outer guard makes the cell safe to re-run: once the customer columns are in
# `df`, concatenating again would append a second copy of each of them.
if not set(customer_info_df.columns).issubset(df.columns):
    if not df.index.equals(customer_info_df.index):
        # Without this check, mismatched indexes are silently outer-joined and the
        # unmatched rows are filled with NaN.
        raise ValueError(
            "Cannot concatenate column-wise: sales data has "
            f"{len(df)} rows and customer data has {len(customer_info_df)} rows, "
            "or their indexes differ."
        )
    df = pd.concat([df, customer_info_df], axis=1)

In [90]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0.2,Unnamed: 0,transaction_id,transaction_date,product_id,product_category,quantity,unit_price,discount_applied,sales,total_sales,store_location,total_transactions,avg_transaction_value,avg_purchase_value,purchase_frequency,avg_items_per_transaction,total_items_purchased,total_returned_items,total_returned_value,total_discounts_received,avg_discount_used,max_single_purchase_value,min_single_purchase_value,Unnamed: 0.1,customer_id,age,gender,income_bracket,loyalty_program,membership_years,churned,marital_status,number_of_children,education_level,occupation
0,0,503290,2020-10-11 10:08:52,1480,Electronics,8,49.72,0.5,397.76,563.16,Location A,69,171.83,411.13,Weekly,8.64,367,0,750.4,415.01,0.02,679.25,0.28,0,1,56,Other,High,False,0,False,Divorced,3,Bachelor's,Self-Employed
1,1,347796,2021-12-08 01:07:40,1597,Groceries,7,817.76,0.32,5724.32,7554.57,Location C,8,20.18,268.71,Daily,9.6,475,4,551.6,801.79,0.33,491.56,4.65,1,2,69,Female,Medium,False,2,False,Married,2,PhD,Unemployed
2,2,493688,2020-02-17 09:40:48,5142,Toys,8,270.3,0.35,2162.4,7564.14,Location A,73,55.17,246.79,Weekly,1.55,138,0,629.19,264.31,0.47,938.26,7.3,2,3,46,Female,Low,False,5,False,Married,3,Bachelor's,Self-Employed
3,3,861348,2020-08-13 00:43:14,8447,Toys,2,547.84,0.1,1095.68,8125.92,Location A,20,15.79,178.92,Weekly,1.78,158,3,346.67,192.93,0.41,644.31,7.31,3,4,32,Female,Low,False,0,False,Divorced,2,Master's,Employed
4,4,535835,2021-07-02 11:59:03,6025,Clothing,4,785.29,0.17,3141.16,114.32,Location C,83,240.03,214.06,Yearly,9.38,263,2,979.91,497.26,0.22,162.86,1.92,4,5,60,Female,Low,True,7,True,Divorced,2,Bachelor's,Employed


In [78]:
# Columns that go into the clustering model. The order here is the column order
# of the frame that is later selected with df[features].
features = [
    'product_category',
    'discount_applied',
    'sales',
    'store_location',
    'avg_transaction_value',
    'total_transactions',
    'avg_purchase_value',
    'purchase_frequency',
    'avg_items_per_transaction',
    'total_items_purchased',
    'total_returned_items',
    'total_returned_value',
    'total_discounts_received',
    'avg_discount_used',
    'max_single_purchase_value',
    'min_single_purchase_value',
    'age',
    'gender',
    'loyalty_program',
    'marital_status',
    'number_of_children',
    'occupation',
    'income_bracket',
    'education_level',
    'membership_years',
]

# Split of the same columns by preprocessing treatment.
# Numeric columns (handed to the scaler in the preprocessing step):
numeric_features = [
    'discount_applied',
    'sales',
    'avg_transaction_value',
    'total_transactions',
    'avg_purchase_value',
    'avg_items_per_transaction',
    'total_items_purchased',
    'total_returned_items',
    'total_returned_value',
    'total_discounts_received',
    'avg_discount_used',
    'max_single_purchase_value',
    'min_single_purchase_value',
    'age',
    'number_of_children',
    'membership_years',
]

# Categorical columns (handed to the one-hot encoder in the preprocessing step):
categorical_features = [
    'product_category',
    'store_location',
    'purchase_frequency',
    'gender',
    'loyalty_program',
    'marital_status',
    'occupation',
    'income_bracket',
    'education_level',
]

In [91]:
# Preprocessing pipeline:
#   'num' -> StandardScaler on the numeric columns
#   'cat' -> OneHotEncoder on the categorical columns; drop='first' omits one
#            indicator column per feature
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ]
)

# Keep only the modelling columns. The explicit .copy() makes `df` an independent
# frame, so the later `df['Cluster'] = ...` assignment is an unambiguous write to
# this frame rather than to a selection derived from the joined data.
df = df[features].copy()

# Feature matrix passed to the clustering models.
X = preprocessor.fit_transform(df)

In [84]:
from sklearn.cluster import MiniBatchKMeans

# Determine the optimal number of clusters using the silhouette score.
#
# This cell had been disabled by wrapping it in a string after its run was
# interrupted (KeyboardInterrupt). The score is now estimated on a random
# subsample via `sample_size`, instead of over every row of X, so the cell can
# finish; `random_state` keeps the subsample reproducible.
# NOTE(review): 10000 is a pragmatic choice — raise it if the scores look noisy.
SILHOUETTE_SAMPLE_SIZE = 10000

silhouette_scores = []
K = range(2, 10)
for k in K:
    # Named `candidate` (not `kmeans`) so the final model fitted in a later cell
    # is never overwritten by re-running this search.
    candidate = MiniBatchKMeans(n_clusters=k, random_state=42)
    candidate.fit(X)
    silhouette_scores.append(
        silhouette_score(
            X,
            candidate.labels_,
            sample_size=SILHOUETTE_SAMPLE_SIZE,
            random_state=42,
        )
    )

# k with the highest silhouette score among the candidates tried.
optimal_k = K[np.argmax(silhouette_scores)]

# Plot silhouette scores
plt.plot(K, silhouette_scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. Number of Clusters')
plt.show()

KeyboardInterrupt: 

In [93]:
# Fit K-means on the preprocessed matrix and label every row with its cluster id.
# NOTE(review): the cluster count is fixed at 3 here rather than taken from a
# model-selection step — confirm this is the intended value.
N_CLUSTERS = 3
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

# Cluster sizes (last expression -> rendered as the cell output).
df['Cluster'].value_counts()

Cluster
2    355514
0    329297
1    315189
Name: count, dtype: int64


In [96]:
# Demographic columns next to the assigned cluster.
# Left as the cell's last expression so the notebook renders it as a (truncated)
# table instead of pushing the frame through print() as plain text.
df[['age', 'income_bracket', 'gender', 'occupation', 'marital_status', 'Cluster']]

        age income_bracket  gender     occupation marital_status  Cluster
0        56           High   Other  Self-Employed       Divorced        1
1        69         Medium  Female     Unemployed        Married        0
2        46            Low  Female  Self-Employed        Married        1
3        32            Low  Female       Employed       Divorced        1
4        60            Low  Female       Employed       Divorced        1
...     ...            ...     ...            ...            ...      ...
999995   39            Low  Female  Self-Employed        Married        0
999996   77            Low  Female       Employed         Single        1
999997   67         Medium    Male     Unemployed         Single        1
999998   36            Low    Male       Employed       Divorced        1
999999   71         Medium   Other     Unemployed       Divorced        2

[1000000 rows x 6 columns]


In [98]:
# Mean of numerical features per cluster.
# numeric_only=True is required: `df` still holds string columns (gender,
# occupation, ...), and pandas >= 2.0 raises TypeError when asked to average them
# instead of silently dropping them.
numerical_summary = df.groupby('Cluster').mean(numeric_only=True)
print(numerical_summary)