In [None]:
# Importing libraries
import pickle
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and FutureWarning messages — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Read the raw electronics CSV into `df` and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/content/Electronic - Electronic.csv.csv')
df.head(n=5)

Unnamed: 0,Age,Items Purchased,Total Spent,Discount (%),Satisfaction Score,Warranty Extension,Gender,Region,Product Category,Payment Method,Revenue,Store Rating,Loyalty Score,Membership Status,Preferred Visit Time
0,56,1,29.226195,47.07738,1.0,1,Male,South,Accessories,UPI,149.252145,3.660461,3.597133,1.0,Evening
1,69,10,420.142612,7.985739,3.760294,1,Female,South,Accessories,Cash,1485.524222,3.551553,25.764903,1.0,Evening
2,46,4,127.742817,37.225718,1.77124,1,Male,East,Laptop,Credit Card,85.550131,3.922839,7.022399,1.0,Morning
3,32,9,417.722683,8.227732,1.926831,0,Female,East,Tablet,UPI,824.118724,3.860422,7.635412,1.0,Afternoon
4,60,13,608.031366,5.0,3.902927,0,Female,South,Tablet,UPI,2463.590392,3.81282,29.461119,0.0,Morning


## **Handle missing values**

In [None]:
# Work on an independent deep copy so the raw frame `df` stays untouched.
df1 = df.copy(deep=True)

In [None]:
# Count missing values per column in the raw data.
df.isna().sum()

Unnamed: 0,0
Age,0
Items Purchased,0
Total Spent,0
Discount (%),0
Satisfaction Score,0
Warranty Extension,0
Gender,2
Region,0
Product Category,0
Payment Method,4


In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(5000, 15)

In [None]:
# Fill missing categorical values with each column's most frequent value (mode).
# The result is assigned back to df1 instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which pandas deprecates and
# which does not modify df1 at all once Copy-on-Write is enabled.
for column in ["Gender", "Payment Method", "Preferred Visit Time"]:
    df1[column] = df1[column].fillna(df1[column].mode()[0])


In [None]:
# Fill missing store ratings with the column mean.
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and does not update df1 under
# pandas Copy-on-Write.
df1["Store Rating"] = df1["Store Rating"].fillna(df1["Store Rating"].mean())

In [None]:
# Fill missing membership status with the column median.
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and does not update df1 under
# pandas Copy-on-Write.
# NOTE(review): the previewed values are 0.0 / 1.0; if the column is binary the
# median can come out as 0.5 — confirm whether the mode would be a better fill.
df1["Membership Status"] = df1["Membership Status"].fillna(df1["Membership Status"].median())

In [None]:
# Re-count missing values per column after imputation.
df1.isna().sum()

Unnamed: 0,0
Age,0
Items Purchased,0
Total Spent,0
Discount (%),0
Satisfaction Score,0
Warranty Extension,0
Gender,0
Region,0
Product Category,0
Payment Method,0


In [None]:
# Round the score-like columns to whole numbers and store them as integers.
columns_to_convert = ['Satisfaction Score', 'Store Rating', 'Loyalty Score']
rounded_scores = df1[columns_to_convert].round()
df1[columns_to_convert] = rounded_scores.astype(int)

In [None]:
# Preview the frame after the integer conversion.
df1.head(n=5)

Unnamed: 0,Age,Items Purchased,Total Spent,Discount (%),Satisfaction Score,Warranty Extension,Gender,Region,Product Category,Payment Method,Revenue,Store Rating,Loyalty Score,Membership Status,Preferred Visit Time
0,56,1,29.226195,47.07738,1,1,Male,South,Accessories,UPI,149.252145,4,4,1.0,Evening
1,69,10,420.142612,7.985739,4,1,Female,South,Accessories,Cash,1485.524222,4,26,1.0,Evening
2,46,4,127.742817,37.225718,2,1,Male,East,Laptop,Credit Card,85.550131,4,7,1.0,Morning
3,32,9,417.722683,8.227732,2,0,Female,East,Tablet,UPI,824.118724,4,8,1.0,Afternoon
4,60,13,608.031366,5.0,4,0,Female,South,Tablet,UPI,2463.590392,4,29,0.0,Morning


In [None]:
# Inspect the target's cardinality to choose between classification and regression.
target_column = 'Loyalty Score'
target_values = df1[target_column]
total_unique = target_values.nunique()  # number of distinct target values
unique_counts = target_values.value_counts(normalize=True)  # share of rows per value

# Report the findings
print(f"Total Unique Values in '{target_column}': {total_unique}")
print("\nUnique Value Proportions:")
print(unique_counts)

# Heuristic: few distinct values relative to the row count points to classification.
# The 0.05 cut-off is an arbitrary threshold.
unique_ratio = total_unique / len(df1)
verdict = "\nLikely a classification problem." if unique_ratio < 0.05 else "\nLikely a regression problem."
print(verdict)

Total Unique Values in 'Loyalty Score': 40

Unique Value Proportions:
Loyalty Score
7     0.0538
9     0.0516
8     0.0482
6     0.0452
10    0.0444
11    0.0394
12    0.0376
5     0.0368
13    0.0356
4     0.0354
17    0.0308
14    0.0296
25    0.0288
27    0.0288
28    0.0278
15    0.0274
26    0.0272
18    0.0270
29    0.0268
19    0.0258
16    0.0250
24    0.0248
30    0.0248
23    0.0246
22    0.0246
21    0.0244
20    0.0232
31    0.0222
3     0.0214
2     0.0184
32    0.0122
0     0.0112
33    0.0104
1     0.0080
34    0.0066
35    0.0056
36    0.0026
37    0.0012
38    0.0006
39    0.0002
Name: proportion, dtype: float64

Likely a classification problem.


In [None]:
# Bucket the integer 'Loyalty Score' into three ordered tiers: Low, Medium, High.
loyalty_labels = ['Low', 'Medium', 'High']
loyalty_bins = [0, 10, 20, df1['Loyalty Score'].max()]
# Intervals are right-closed (the pd.cut default); include_lowest=True keeps a
# score of exactly 0 inside the first ('Low') bin.
df1['Loyalty Category'] = pd.cut(
    df1['Loyalty Score'],
    bins=loyalty_bins,
    labels=loyalty_labels,
    include_lowest=True,
)

In [None]:
# Preview the frame with the new 'Loyalty Category' column.
df1.head(n=5)

Unnamed: 0,Age,Items Purchased,Total Spent,Discount (%),Satisfaction Score,Warranty Extension,Gender,Region,Product Category,Payment Method,Revenue,Store Rating,Loyalty Score,Membership Status,Preferred Visit Time,Loyalty Category
0,56,1,29.226195,47.07738,1,1,Male,South,Accessories,UPI,149.252145,4,4,1.0,Evening,Low
1,69,10,420.142612,7.985739,4,1,Female,South,Accessories,Cash,1485.524222,4,26,1.0,Evening,High
2,46,4,127.742817,37.225718,2,1,Male,East,Laptop,Credit Card,85.550131,4,7,1.0,Morning,Low
3,32,9,417.722683,8.227732,2,0,Female,East,Tablet,UPI,824.118724,4,8,1.0,Afternoon,Low
4,60,13,608.031366,5.0,4,0,Female,South,Tablet,UPI,2463.590392,4,29,0.0,Morning,High


In [None]:
# Branch off a deep copy for feature engineering; df1 keeps the cleaned data.
df3 = df1.copy(deep=True)

# **Feature engineering**

In [None]:
# Lower-case the text columns so category spellings are consistent.
text_columns = ['Region', 'Product Category', 'Gender', 'Payment Method', 'Preferred Visit Time']
for text_column in text_columns:
    df3[text_column] = df3[text_column].str.lower()

# Distinct values of two of the normalised columns, kept for inspection.
unique_values_region = df3['Region'].unique()
unique_values_product_category = df3['Product Category'].unique()

In [None]:
# 2. Create aggregated features
# NOTE(review): a row with Items Purchased == 0 would produce inf/NaN here — confirm the data has none.
df3['Avg Spending per Purchase'] = df3['Total Spent'] / df3['Items Purchased']
df3['Discounted Revenue'] = df3['Revenue'] * (1 - df3['Discount (%)'] / 100)

# 3. Bin Age into age groups
# pd.cut intervals are right-closed, so the first edge must be 17 (not 18):
# (0, 17] -> 'Under 18' and (17, 30] -> '18-30'. With an edge at 18, an
# 18-year-old was labelled 'Under 18' and left out of '18-30'.
# Ages outside (0, 100] fall in no bin and become NaN.
bins = [0, 17, 30, 50, 65, 100]
labels = ['Under 18', '18-30', '31-50', '51-65', '65+']
df3['Age Group'] = pd.cut(df3['Age'], bins=bins, labels=labels)

# 4. Drop the raw columns that the engineered features replace
columns_to_drop = ['Total Spent', 'Items Purchased', 'Revenue', 'Discount (%)', 'Age']
df3 = df3.drop(columns=columns_to_drop, errors='ignore')

# Display the engineered columns
df3[['Avg Spending per Purchase', 'Discounted Revenue', 'Age Group']].head()


Unnamed: 0,Avg Spending per Purchase,Discounted Revenue,Age Group
0,29.226195,78.988145,51-65
1,42.014261,1366.894138,65+
2,31.935704,53.70348,31-50
3,46.413631,756.312446,31-50
4,46.771644,2340.410872,51-65


In [None]:
# Count missing values per column after feature engineering.
df3.isna().sum()

Unnamed: 0,0
Satisfaction Score,0
Warranty Extension,0
Gender,0
Region,0
Product Category,0
Payment Method,0
Store Rating,0
Loyalty Score,0
Membership Status,0
Preferred Visit Time,0


In [None]:
# Dimensions of the feature-engineered frame as (rows, columns).
df3.shape

(5000, 14)

In [None]:
# Preview the feature-engineered frame.
df3.head(n=5)

Unnamed: 0,Satisfaction Score,Warranty Extension,Gender,Region,Product Category,Payment Method,Store Rating,Loyalty Score,Membership Status,Preferred Visit Time,Loyalty Category,Avg Spending per Purchase,Discounted Revenue,Age Group
0,1,1,male,south,accessories,upi,4,4,1.0,evening,Low,29.226195,78.988145,51-65
1,4,1,female,south,accessories,cash,4,26,1.0,evening,High,42.014261,1366.894138,65+
2,2,1,male,east,laptop,credit card,4,7,1.0,morning,Low,31.935704,53.70348,31-50
3,2,0,female,east,tablet,upi,4,8,1.0,afternoon,Low,46.413631,756.312446,31-50
4,4,0,female,south,tablet,upi,4,29,0.0,morning,High,46.771644,2340.410872,51-65


In [None]:
# Persist the feature-engineered frame.
# This cell runs before df4 is created, so dumping df4 raised NameError on a
# top-to-bottom run; df3 is the frame that exists at this point (df4 is later
# created as a copy of it).
with open('electronics_data.pkl', 'wb') as file:
    pickle.dump(df3, file)


# **Scaling**

In [None]:
# Branch off a deep copy for scaling and encoding; df3 keeps the unscaled features.
df4 = df3.copy(deep=True)

In [None]:
# Preview the frame that is about to be scaled.
df4.head(n=5)

Unnamed: 0,Satisfaction Score,Warranty Extension,Gender,Region,Product Category,Payment Method,Store Rating,Loyalty Score,Membership Status,Preferred Visit Time,Loyalty Category,Avg Spending per Purchase,Discounted Revenue,Age Group
0,1,1,male,south,accessories,upi,4,4,1.0,evening,Low,29.226195,78.988145,51-65
1,4,1,female,south,accessories,cash,4,26,1.0,evening,High,42.014261,1366.894138,65+
2,2,1,male,east,laptop,credit card,4,7,1.0,morning,Low,31.935704,53.70348,31-50
3,2,0,female,east,tablet,upi,4,8,1.0,afternoon,Low,46.413631,756.312446,31-50
4,4,0,female,south,tablet,upi,4,29,0.0,morning,High,46.771644,2340.410872,51-65


In [None]:

from sklearn.preprocessing import RobustScaler

# Columns that are left unscaled.
exclude_columns = ['Warranty Extension', 'Loyalty Score', 'Membership Status','Age Group','Loyalty Category']

# Choose the numeric columns from the unscaled frame (df3). select_dtypes(include='number')
# matches every integer/float width, so a column is not silently skipped on platforms
# where astype(int) produces int32 instead of int64.
# NOTE(review): 'Unnamed: 0' is numeric and therefore scaled as well; it appears to be a
# saved row index — confirm that including it is intended.
numerical_columns = [col for col in df3.select_dtypes(include='number').columns if col not in exclude_columns]

robust_scaler = RobustScaler()

# Fit on df3's raw values and write the result into df4, so re-running this cell
# never scales values that were already scaled.
df4[numerical_columns] = robust_scaler.fit_transform(df3[numerical_columns])


In [None]:
# Persist the fitted RobustScaler so the same scaling can be re-applied later.
with open('robust_scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(robust_scaler, scaler_file)


# **Encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd

# Label-encode the target column.
# The encoder is fitted on df3's untouched string labels (df4 was copied from df3),
# so re-running this cell never fits it on already-encoded integers.
# LabelEncoder assigns codes in sorted order of the class names (see encoder.classes_),
# which is not the same as the order of loyalty_labels.
target_column = df3['Loyalty Category']
encoder = LabelEncoder()
df4['Loyalty Category'] = encoder.fit_transform(target_column)

# Categorical feature columns to one-hot encode
categorical_columns = ['Region', 'Payment Method', 'Product Category',
                       'Preferred Visit Time', 'Gender', 'Age Group']

# Keep only the ones actually present in the DataFrame
existing_categorical_columns = [col for col in categorical_columns if col in df4.columns]

# drop='first' removes one level per column; sparse_output=False returns a dense array
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit-transform and wrap the result in a DataFrame that shares df4's index,
# so the concat below aligns rows by label even if df4's index is not 0..n-1.
encoded_data = one_hot_encoder.fit_transform(df4[existing_categorical_columns])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=one_hot_encoder.get_feature_names_out(existing_categorical_columns),
    index=df4.index,
)

# Replace the original categorical columns with their one-hot encoded versions
df4_encoded = pd.concat([df4.drop(columns=existing_categorical_columns), encoded_df], axis=1)

# Finish with a clean 0..n-1 index
df4_encoded.reset_index(drop=True, inplace=True)



In [None]:
import pickle

# Persist both fitted encoders for later reuse: the one-hot feature encoder
# and the target label encoder.
for pickle_path, fitted_encoder in (
    ('one_hot_encoder.pkl', one_hot_encoder),
    ('label_encoder.pkl', encoder),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_encoder, f)

In [None]:
# Modelling works on its own deep copy of the encoded frame.
df6 = df4_encoded.copy(deep=True)

# **Model selection**

In [None]:
# Feature subset fed to the models: three numeric columns followed by five one-hot indicator columns.
selected_features = [
    'Satisfaction Score',
    'Avg Spending per Purchase',
    'Discounted Revenue',
    'Region_north',
    'Payment Method_credit card',
    'Preferred Visit Time_morning',
    'Gender_male',
    'Age Group_31-50',
]



In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Baseline model: a default-parameter Random Forest on the selected columns of df6.

# Encoded target and feature matrix
y = df6['Loyalty Category']
X = df6[selected_features]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit the classifier on the training split
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict the held-out rows and report overall accuracy
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class breakdown
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 82.30%
Confusion Matrix:
[[307   0  29]
 [  0 304  44]
 [ 29  75 212]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       336
           1       0.80      0.87      0.84       348
           2       0.74      0.67      0.71       316

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter search for the Random Forest using 3-fold CV on the training split.
rf_model = RandomForestClassifier(random_state=42)

# Search space: 2 * 3 * 2 * 2 * 2 = 48 combinations
rf_param_grid = dict(
    n_estimators=[100, 200],      # number of trees
    max_depth=[None, 10, 20],     # maximum depth of each tree
    min_samples_split=[2, 5],     # minimum samples needed to split a node
    min_samples_leaf=[1, 2],      # minimum samples required at a leaf
    bootstrap=[True, False],      # whether trees are built on bootstrap samples
)

grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training data only
grid_search_rf.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best Score for Random Forest: {grid_search_rf.best_score_}")

# Estimator refitted with the best parameters
best_rf = grid_search_rf.best_estimator_


Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Parameters for Random Forest: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score for Random Forest: 0.8252512903338278


# **Cross validation**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Compare three classifiers with 5-fold cross-validated accuracy on the full X / y.
rf_model = RandomForestClassifier(random_state=42)
svc_model = SVC(random_state=42)
log_reg_model = LogisticRegression(random_state=42, max_iter=10000)

# Same fold count and metric for every model
cv_settings = dict(cv=5, scoring='accuracy')
rf_cv_scores = cross_val_score(rf_model, X, y, **cv_settings)
svc_cv_scores = cross_val_score(svc_model, X, y, **cv_settings)
log_reg_cv_scores = cross_val_score(log_reg_model, X, y, **cv_settings)

# Per-fold accuracies
print(f"Random Forest CV Scores: {rf_cv_scores}")
print(f"SVC CV Scores: {svc_cv_scores}")
print(f"Logistic Regression CV Scores: {log_reg_cv_scores}")

# Mean accuracy across the folds
print(f"Random Forest Mean Accuracy: {rf_cv_scores.mean()}")
print(f"SVC Mean Accuracy: {svc_cv_scores.mean()}")
print(f"Logistic Regression Mean Accuracy: {log_reg_cv_scores.mean()}")


Random Forest CV Scores: [0.832 0.813 0.822 0.825 0.836]
SVC CV Scores: [0.831 0.818 0.807 0.825 0.834]
Logistic Regression CV Scores: [0.828 0.811 0.814 0.824 0.841]
Random Forest Mean Accuracy: 0.8256
SVC Mean Accuracy: 0.8229999999999998
Logistic Regression Mean Accuracy: 0.8235999999999999


In [None]:
import pickle

# Persist the fitted baseline Random Forest (rf_classifier) to disk.
# NOTE(review): this saves rf_classifier, not the grid-searched best_rf — confirm that is intended.
with open('random_forest_model.pkl', mode='wb') as model_file:
    pickle.dump(rf_classifier, model_file)

# **Testing**

In [None]:
# Preview the model inputs (selected columns only).
df6.loc[:, selected_features].head(n=5)

Unnamed: 0,Satisfaction Score,Avg Spending per Purchase,Discounted Revenue,Region_north,Payment Method_credit card,Preferred Visit Time_morning,Gender_male,Age Group_31-50
0,-0.333333,-0.17202,-0.252417,0.0,0.0,0.0,1.0,0.0
1,0.666667,0.571703,1.440715,0.0,0.0,0.0,0.0,0.0
2,0.0,-0.014441,-0.285657,0.0,1.0,1.0,1.0,1.0
3,0.0,0.82756,0.63802,0.0,0.0,0.0,0.0,1.0
4,0.666667,0.848381,2.720538,0.0,0.0,1.0,0.0,0.0


0.333333	-0.172020	-0.252417	0.0	0.0	0.0	1.0	0.0

In [None]:
# Predict the class for one hand-entered row.
# The row is wrapped in a DataFrame carrying the training column names, because the
# model was fitted on a DataFrame; an unnamed nested list makes scikit-learn warn
# that the input lacks the feature names seen during fit.
sample_row = pd.DataFrame(
    [[-0.333333, -0.172020, -0.252417, 0.0, 0.0, 0.0, 1.0, 0.0]],
    columns=selected_features,
)
pred = rf_classifier.predict(sample_row)
pred

array([1])

In [None]:
# Bin labels in the order they were passed to pd.cut.
# NOTE(review): this is not necessarily the order of the LabelEncoder's integer codes
# (check encoder.classes_) — confirm before indexing this list with a predicted code.
loyalty_labels

['Low', 'Medium', 'High']

In [None]:
# Translate the predicted class code back to its label with the fitted LabelEncoder.
# Indexing loyalty_labels with the code gives the wrong label: LabelEncoder assigns
# codes in sorted order of the class names (see encoder.classes_), which differs
# from the ['Low', 'Medium', 'High'] order used for the bins.
encoder.inverse_transform(pred)[0]

'Medium'