In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the dataset from the working directory into `Bike`; the clustering,
# regression and classification cells below all select their columns from it.
# NOTE(review): presumably already cleaned, judging by the file name - confirm.
Bike = pd.read_csv('cleaned_df.csv')

In [None]:
# Step 1: Data Preparation
# Columns used as clustering inputs; they are standardised in the next step.
features = ['cost_of_bike', 'rating', 'price']
data = Bike.loc[:, features]

In [None]:
# Step 2: Feature Scaling
# Standardise every column to zero mean / unit variance so that no single
# feature dominates the distances K-Means works with.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

In [None]:
# Step 3: Determine the Number of Clusters (K)
# Elbow method: fit K-Means for K = 1..10 and record the within-cluster sum of
# squares (the model's `inertia_`) of each fit.
wcss = []
for i in range(1, 11):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4 (and emits a FutureWarning in 1.2/1.3), so relying
    # on the default makes the curve depend on the installed version.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

In [None]:
# Plot the elbow curve: WCSS against each candidate number of clusters.
fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss, marker='o')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.show()

In [None]:
# Based on the elbow method, choose an optimal K (number of clusters).
# The value is hard-coded rather than derived from `wcss`, so it has to be
# revisited by hand whenever the data or the elbow curve changes.
optimal_k = 3

In [None]:
# Step 4: Apply K-Means Clustering
# Fit the final model with the chosen K and get one cluster label per row.
# n_init is pinned explicitly because scikit-learn's default changed from 10
# to 'auto' in version 1.4; leaving it implicit makes the cluster assignment
# depend on the installed version (and triggers a FutureWarning in 1.2/1.3).
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_data)

In [None]:
# Step 5: Analyze Cluster Results
# Store each row's cluster assignment next to the original (unscaled) columns.
Bike.loc[:, 'cluster'] = cluster_labels

In [None]:
# Step 6: Visualization using PCA for simplicity (you can choose other features)
# Project the standardised features onto their first two principal components
# so the clusters can be drawn on a 2-D scatter plot. The fitted `pca` object is
# kept so other points can be mapped into the same 2-D space with `pca.transform`.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_data)

In [None]:
# Visualize clusters in 2D space
# The data points are drawn in PCA coordinates, while K-Means learned its
# centroids in the scaled feature space. The centroids therefore have to be
# projected through the same fitted PCA before plotting; using their first two
# raw columns would place the red markers at unrelated positions.
centers_2d = pca.transform(kmeans.cluster_centers_)

plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis', alpha=0.5)
plt.scatter(centers_2d[:, 0], centers_2d[:, 1], s=300, c='red')
plt.title('K-Means Clustering')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

In [None]:
#silhouette_scores = []
#for i in range(2, 11):
    #kmeans = KMeans(n_clusters=i, random_state=42)
    #labels = kmeans.fit_predict(scaled_data)
    #silhouette_scores.append(silhouette_score(scaled_data, labels))

# Plot silhouette scores
#plt.plot(range(2, 11), silhouette_scores, marker='o')
#plt.title('Silhouette Analysis for Optimal K')
#plt.xlabel('Number of Clusters (K)')
#plt.ylabel('Silhouette Score')
#plt.show()

# Regression Analysis: Use regression models to analyze the relationship between various factors (criteria, rating, location) and the price of insurance products. This can help in determining the optimal pricing strategy for Bike ProteKt's products.

In [None]:
# Step 1: Data Preparation
# Predictors and the regression target (price), pulled out of `Bike` together
# so that they can be encoded and split as one frame.
features = ['criteria', 'rating', 'location']
target_variable = 'price'
data = Bike.loc[:, [*features, target_variable]]

In [None]:
# Handle categorical variables (one-hot encoding)
# Encode straight from `Bike` instead of from `data` itself: the original form
# consumed the 'criteria' and 'location' columns of `data`, so running the cell
# a second time raised a KeyError. drop_first=True drops one level per column,
# which becomes the reference category for the regression coefficients.
data = pd.get_dummies(
    Bike[features + [target_variable]],
    columns=['criteria', 'location'],
    drop_first=True,
)

In [None]:
# Step 2: Split the data into training and testing sets
# Separate the target column from the predictors, then hold out 20% of the
# rows for evaluation (fixed seed so the split is reproducible).
y = data[target_variable]
X = data.drop(columns=[target_variable])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Step 3: Train a Linear Regression model
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Step 4: Make predictions on the test set
# `y_pred` holds one predicted price per row of `X_test`; it is compared with
# `y_test` in the evaluation and plotting cells.
y_pred = model.predict(X_test)

In [None]:
# Step 5: Evaluate the model
# MSE is in squared price units; R-squared is the share of price variance
# explained on the held-out rows.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')


- The MSE is 189.22 (RMSE ≈ 13.76, in the same units as price). Lower MSE values are desirable, indicating better model performance. However, the interpretation depends on the scale of the target variable (price).
- R² is 0.0929, indicating that the model explains only 9.29% of the variability in the price.

In [None]:
# Step 6: Interpret the model coefficients
# Pair every predictor column with its fitted coefficient in a two-column table.
coefficients = (
    pd.Series(model.coef_, index=X.columns, name='Coefficient')
    .rename_axis('Feature')
    .reset_index()
)
print(coefficients)

'rating': For each unit increase in the rating, the price increases by approximately 0.52 units.
'criteria_Fahrrad': If the criteria value is Fahrrad (compared to the E-Bike baseline), the predicted price is approximately 7.09 units higher, holding the other features fixed.
'location_bayern': If the location is Bayern (compared to the reference location that was dropped during one-hot encoding), the predicted price is approximately 1.53 units lower.

In [None]:
# Step 7: Visualize the predicted vs. actual prices
# One point per test row: actual price on x, model prediction on y.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual Prices vs. Predicted Prices')
plt.show()

The model's predictive performance is weak: with an R-squared of about 0.09, it explains less than 10% of the variability in price.
The features in the model have a limited ability to explain the variability in prices.
The impact of each feature on the price is relatively small.

## Dynamic Pricing Models: Implement dynamic pricing models that adjust prices based on real-time market conditions, competitor pricing, and customer behavior.

In [None]:
# 'price' is the target variable; the remaining listed columns are the predictors.
# Note: `features` is rebound here from the earlier list of names to a DataFrame.
pricing_columns = ['criteria', 'cost_of_bike', 'assistance', 'segment', 'rating', 'location']
features = Bike.loc[:, pricing_columns]
target = Bike.loc[:, 'price']

In [None]:
# Convert categorical variables to numerical using one-hot encoding.
# (The step numbering in this section starts at 3 in the original notebook.)
# drop_first=True removes one level per encoded column; that level becomes the
# implicit reference category. The resulting `features.columns` is reused later
# to align new rows with the training layout.
features = pd.get_dummies(features, drop_first=True)

In [None]:
# Step 4: Split the data into training and testing sets
# 20% of the rows are held out for testing; random_state=42 makes the split
# reproducible. This rebinds X_train/X_test/y_train/y_test from the earlier
# regression section.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
# Step 5: Train a linear regression model
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Step 6: Make predictions on the test set
# One predicted price per held-out row, evaluated against `y_test` next.
predictions = model.predict(X_test)

In [None]:
# Step 7: Evaluate the model
# Same two metrics as the earlier regression: squared-error size and the share
# of price variance explained on the held-out rows.
r_squared = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print(f"Mean Squared Error: {mse}", f"R-squared: {r_squared}", sep="\n")

In [None]:
# Step 8: Use the trained model for dynamic pricing
# Create a sample 'new_data' record with the same raw features used in training
new_data = {
    'criteria': ['Fahrrad'],
    'cost_of_bike': [10000],
    'assistance': ['Yes'],
    'segment': ['Performance recommendation'],
    'rating': [1.5],
    'location': ['bayern']
}

# One-hot encode WITHOUT drop_first. With a single row every categorical
# column has exactly one level, and drop_first=True would drop that only level,
# so every dummy column would end up zero-filled and the prediction would
# silently ignore the categorical inputs.
#
# reindex() then aligns the row with the training layout in one step:
#   * dummy columns seen in training but absent here are added as 0,
#   * columns not present in training (i.e. the reference level that
#     drop_first removed during training) are discarded,
#   * the column order matches `features.columns`.
# NOTE(review): a category value that never occurred in the training data is
# also discarded here and is therefore treated like the reference level.
new_data_encoded = (
    pd.get_dummies(pd.DataFrame(new_data))
    .reindex(columns=features.columns, fill_value=0)
)


In [None]:
# Step 9: Use the trained model to predict prices for new data
new_predictions = model.predict(new_data_encoded)

# Show the header line followed by the array of predicted prices.
print("Predicted Prices for New Data:", new_predictions, sep="\n")

## Classification Models: Develop classification models to categorize customers into different segments based on their preferences, such as "Fahrrad" or "E-Bike," and other relevant criteria. This can inform targeted marketing strategies for each segment.

In [None]:
# 'criteria' is the classification target; the listed columns are the predictors.
classification_columns = [
    'cost_of_bike', 'assistance', 'segment', 'rating',
    'location', 'product_variant', 'month',
]
features = Bike.loc[:, classification_columns]
target = Bike.loc[:, 'criteria']


In [None]:
# Step 3: Convert categorical variables to numerical using label encoding
# Label encoding assigns an arbitrary integer to every category; it is used
# here for simplicity.
categorical_columns = ['assistance', 'segment', 'location', 'product_variant', 'month']

# Work on an independent copy. `features` was sliced out of `Bike`, and writing
# into that slice through `.loc` triggers SettingWithCopyWarning and leaves the
# encoded columns with their original (object) dtype.
features = features.copy()

# One encoder per column, so every column's category -> integer mapping stays
# available afterwards (a single shared encoder only remembers the last fit).
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    features[column] = encoder.fit_transform(features[column])
    label_encoders[column] = encoder

# Kept for backward compatibility: the original cell left `label_encoder`
# fitted on the last encoded column ('month').
label_encoder = label_encoders['month']


In [None]:
# Step 4: Split the data into training and testing sets
# 20% of the rows are held out for testing; random_state=42 makes the split
# reproducible. This rebinds X_train/X_test/y_train/y_test from the pricing
# section, so the regression splits are no longer available after this cell.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [None]:
# Step 5: Train a classification model (Random Forest Classifier in this example)
# 100 trees with a fixed seed; `fit` returns the estimator, so the calls are chained.
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
classifier


In [None]:
# Step 6: Make predictions on the test set
# One predicted class label per held-out row. This rebinds `predictions`, which
# previously held the regression model's numeric outputs.
predictions = classifier.predict(X_test)

In [None]:
# Step 7: Evaluate the classification model
# Overall accuracy, the confusion matrix of true vs. predicted labels, and the
# per-class precision / recall / F1 report, all on the held-out rows.
class_report = classification_report(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")

Accuracy: it's around 52%, which indicates that the model performs only slightly better than random guessing between the two classes.

Confusion Matrix:

True Positives (TP): 2554 instances of "Fahrrad" correctly predicted.
True Negatives (TN): 2578 instances of "E-Bike" correctly predicted.
False Positives (FP): 2353 instances of "E-Bike" wrongly predicted as "Fahrrad."
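False Negatives (FN): 2381 instances of "Fahrrad" wrongly predicted as "E-Bike."

Precision: Precision is the ratio of correctly predicted positive observations to the total predicted positives. For "E-Bike" and "Fahrrad," precision is around 52% for both, meaning that when the model predicts an "E-Bike" or "Fahrrad," it's correct about 52% of the time.

Recall: Recall is the ratio of correctly predicted positive observations to all observations in the actual class. For both "E-Bike" and "Fahrrad," recall is around 52%, indicating that the model captures about 52% of the instances of each class.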

F1-score: The F1-score is the harmonic mean of precision and recall. It is a good way to show that a classifier has a good value for both precision and recall. In this case, the F1-score is around 52%.