In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Load the dataset
# The CSV path is relative, so the file must sit in the kernel's working directory.
website_traffic = pd.read_csv("website_traffic.csv")
# Bare last expression: the notebook renders the frame as the cell output.
website_traffic

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) to spot
# missing values and non-numeric columns before computing correlations.
website_traffic.info()

In [None]:
# How many of the strongest correlations with "Conversion Rate" to keep.
# "Conversion Rate" correlates perfectly with itself, so it takes one of the k slots.
k = 6

# Pairwise correlations, computed over the numeric columns only.
correlation_matrix = website_traffic.corr(numeric_only=True)
display(correlation_matrix)

# Rank the rows of the matrix by their "Conversion Rate" entry; the labels of the
# k highest-ranked rows are the column names we are after.
cols = correlation_matrix.nlargest(k, "Conversion Rate").index

print("\nTop correlated columns with 'Conversion Rate':")
cols  # last expression of the cell, rendered as its output

In [None]:
# Heatmap of the correlations between the columns selected in `cols`.
#
# The values are sliced out of `correlation_matrix` rather than recomputed with
# np.corrcoef: the heatmap then shows exactly the numbers `cols` was ranked on,
# and missing values are excluded pairwise by pandas instead of turning entire
# rows/columns of the plot into NaN.
cm = correlation_matrix.loc[cols, cols].to_numpy()

# Create and display the heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    vmax=1.0,
    linewidths=0.01,  # seaborn's documented parameter for the cell-border width
    square=True,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    annot_kws={'size': 12},
    xticklabels=cols.values,
    yticklabels=cols.values,
    ax=ax,
)
ax.set_title('Correlation Heatmap (Sorted by Impact on Conversion Rate)', fontsize=16)
plt.show()

In [None]:
#Question 2

# Feature selection: every column whose correlation with "Conversion Rate" is
# strictly positive is kept as a candidate predictor. The target itself passes
# this filter too (self-correlation is 1) and is removed when X is built.
conversion_corr = correlation_matrix['Conversion Rate']
positive_corr_features = conversion_corr[conversion_corr > 0].index

print("\nFeatures with a positive correlation to 'Conversion Rate':")
print(positive_corr_features)

# Slice of the original data restricted to those columns.
positive_corr_data = website_traffic[positive_corr_features]

# Optional inspection of the slice (left disabled to keep the output short):
# print("\nData for features with a positive correlation to 'Conversion Rate':")
# display(positive_corr_data)

In [None]:
# Question 3: Define X and y for prediction
# X holds the positively correlated columns minus the target, so the model never
# sees the value it is asked to predict; y is the target column on its own.
x = website_traffic[positive_corr_features].drop(columns=['Conversion Rate'])
y = website_traffic['Conversion Rate']

# Show each object under its own heading.
for section_title, section_data in (
    ("\nFeature matrix (X):", x),
    ("\nTarget variable (y):", y),
):
    print(section_title)
    display(section_data)

In [None]:
### Question 4: Data Splitting
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
train_x, val_x, train_y, val_y = train_test_split(
    x, y, test_size=0.2, random_state=40
)

# Display the four pieces of the split, each under its own heading.
for split_title, split_data in (
    ("\nTraining feature matrix (train_x):", train_x),
    ("\nValidation feature matrix (val_x):", val_x),
    ("\nTraining target variable (train_y):", train_y),
    ("\nValidation target variable (val_y):", val_y),
):
    print(split_title)
    display(split_data)

In [None]:
### Question 5: Fit the regression model whose MAE is evaluated in Question 6
# The active model is a LinearRegression. The DecisionTreeRegressor variant is
# kept commented out below as an alternative; swap the two lines to try it.
model = LinearRegression()
# model = DecisionTreeRegressor(random_state= 42)
model.fit(train_x, train_y)  # Train the model on the training split only

In [None]:
### Question 6: Analyse Results
# Score the fitted model on the held-out validation split with Mean Absolute Error.
val_y_pred = model.predict(val_x)
mae = mean_absolute_error(val_y, val_y_pred)

# MAE on the training split, reported alongside the validation MAE so that the
# train/validation comparison mentioned in the interpretation can be checked.
train_mae = mean_absolute_error(train_y, model.predict(train_x))

print("\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Training MAE (for comparison): {train_mae}")

# Interpretation of the MAE.
# Adjacent string literals are concatenated verbatim, so every fragment that is
# followed by another one must end with a space.
interpretation = (
    "The Mean Absolute Error (MAE) of approximately {:.4f} indicates that, on average, the predictions made "
    "by the Linear Regression model for the Conversion Rate are off by about {:.4f} units. Since the Conversion "
    "Rate typically ranges between 0 and 1, this MAE value suggests that the model's predictions are quite accurate "
    "(no vast differences between the training and testing), "
    "with errors being relatively small compared to the range of possible values. This indicates that the Linear "
    "Regression model is effectively capturing the underlying trends in the data, leading to a good performance in "
    "predicting the target variable. Overall, the model demonstrates a strong ability to predict conversion rates with "
    "minimal average error."
).format(mae, mae)

print("\nInterpretation of MAE:")
print(interpretation)

In [None]:
### Question 7-9: Determine the Optimal Number of Clusters
# NOTE(review): KMeans is fitted on the raw, unscaled features in `x`, so columns
# with larger numeric ranges weigh more in the distance computation. Standardizing
# first (StandardScaler is imported at the top of the notebook) would change the
# resulting clusters and the written cluster analysis - confirm before enabling:
# x_scaled = StandardScaler().fit_transform(x)

# Candidate cluster counts, shared by the fitting loop and the plot axis.
cluster_counts = range(1, 11)

# Inertia (within-cluster sum of squares) of the fitted model for each count.
inertia_values = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(x)  # only the inertia is needed here, so no labels are requested
    inertia_values.append(kmeans.inertia_)

# Plot inertia against the number of clusters to look for the "elbow".
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(cluster_counts, inertia_values, marker='o')
elbow_ax.set_xlabel('Number of Clusters', fontsize=14)
elbow_ax.set_ylabel('Inertia (WCSS)', fontsize=14)
elbow_ax.set_title('Elbow Method for Determining Optimal Number of Clusters', fontsize=16)
elbow_ax.set_xticks(cluster_counts)  # one tick per candidate cluster count
elbow_ax.grid(True)
plt.show()

In [None]:
# Fit KMeans with the chosen number of clusters.
# optimal_k is hard-coded; presumably it was read off the elbow plot - revisit it
# if the data or the feature set changes.
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(x)

# Attach the labels to an explicit copy of the feature matrix. The DataFrame
# constructor does not copy DataFrame input by default, so building the new frame
# with pd.DataFrame(x, ...) could leave it sharing data with `x`; adding the
# 'Cluster' column must never alter `x`, which is the KMeans input on a re-run.
clustered_data = x.copy()
clustered_data['Cluster'] = clusters

# Show the features together with their cluster labels.
print("\nDataset with Cluster Labels:")
display(clustered_data)

In [None]:
### Question 9 Cluster Analysis
# Average of every column within each cluster; rows are indexed by cluster label.
print("\nMean values for each cluster:")
cluster_means = clustered_data.groupby(by='Cluster').mean()
cluster_means  # rendered as the cell output

In [None]:
# Question 10
# Outcome metrics to profile per cluster in addition to the clustering features.
additional_features = ['Bounce Rate', 'Conversion Rate']

# Copy each requested column over from the original dataset; a name that is not
# present there is reported and skipped.
for feature in additional_features:
    if feature not in website_traffic.columns:
        print(f"{feature} does not exist in the original dataset.")
        continue
    clustered_data[feature] = website_traffic[feature]

# Per-cluster averages, now covering the added columns as well.
cluster_means = clustered_data.groupby('Cluster').mean()
print("\nUpdated mean values for each cluster with additional features:")
display(cluster_means)

# One panel per averaged column, laid out in a single row.
features = cluster_means.columns
num_features = len(features)

# The figure width grows with the number of panels (3 inches each).
means_fig = plt.figure(figsize=(num_features * 3, 7))

for i, feature in enumerate(features):
    panel = means_fig.add_subplot(1, num_features, i + 1)  # subplot positions are 1-based
    panel.plot(cluster_means.index, cluster_means[feature], marker='o')
    panel.set_title(feature)
    panel.set_xlabel('Cluster')
    panel.set_ylabel('Mean Value')

means_fig.tight_layout()
plt.show()

In [None]:
### Question 11

# The analysis of the clusters reveals distinct patterns in user engagement and website performance.   
# Cluster 4 stands out as the best-performing group, characterized by high session duration, significant time spent on individual pages, a low bounce rate, and the highest conversion rate.   
# This suggests that users are highly engaged with the websites in this cluster, leading to effective conversions.   

# Conversely, Cluster 3 emerges as the worst-performing group, exhibiting the lowest page views, session duration, and conversion rate,
# indicating that users spend minimal time on these sites and are less likely to engage deeply with the content.

# In contrast, Cluster 2 demonstrates strong user retention, reflected in high previous visits, while still maintaining significant page views.   
# This indicates strong user loyalty and retention, suggesting that users find value in returning to these websites.  

# Clusters 0 and 1 exhibit average performance, with Cluster 0 showing a concerningly high bounce rate, indicating potential issues with user engagement or content relevancy.   
# Notably, despite its high bounce rate, Cluster 0 maintains a very high conversion rate and a relatively high time on page.   
# This suggests that while many users leave quickly, those who stay are highly engaged and likely to convert.  

# Cluster 1 shows a balanced performance across most metrics, indicating stable but not outstanding user engagement.   
# This cluster could benefit from targeted improvements to boost specific metrics like conversion rate or session duration.  

# Overall, the findings highlight that Cluster 4 represents high traffic and effective user engagement,   
# while Cluster 3 signals low traffic and poor engagement. The sharp drop in conversion rate for Cluster 3,   
# combined with low session duration and page views, suggests that these websites may have significant issues with user experience or content relevance,   
# guiding potential strategies for improvement across different website performance metrics.  