In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


In [18]:
# Load the dataset
df = pd.read_csv('/kaggle/input/customer-satisfaction-10k/Customer-survey-data.csv')

# Rename columns for ease of use
df.columns = ['Customer', 'Delivery_Experience', 'Food_Quality', 'Delivery_Speed', 'Order_Accurate']

# Display the first few rows of the dataset
df.head()

Unnamed: 0,Customer,Delivery_Experience,Food_Quality,Delivery_Speed,Order_Accurate
0,1,5.0,3.0,4.0,Yes
1,2,3.0,4.0,3.0,Yes
2,3,4.0,5.0,2.0,Yes
3,4,5.0,3.0,4.0,Yes
4,5,2.0,5.0,1.0,Yes


In [19]:
df.shape

(10616, 5)

In [20]:
df.isnull().sum()

Customer                 0
Delivery_Experience    418
Food_Quality           252
Delivery_Speed         239
Order_Accurate         660
dtype: int64

In [21]:
# Define preprocessing steps
numerical_features = ['Delivery_Experience', 'Food_Quality', 'Delivery_Speed']
categorical_features = ['Order_Accurate']

In [22]:
# Preprocessing for numerical data: impute missing values with median and standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [23]:
# Preprocessing for categorical data: impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [24]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [25]:

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

In [26]:
# Simplifying the target variable (assuming satisfaction scores >= 4 as satisfied)
df['Satisfaction'] = df['Delivery_Experience'].apply(lambda x: 1 if x >= 4 else 0 if pd.notnull(x) else np.nan)


In [27]:
# Drop rows with missing target values
df.dropna(subset=['Satisfaction'], inplace=True)

In [28]:
df.head()

Unnamed: 0,Customer,Delivery_Experience,Food_Quality,Delivery_Speed,Order_Accurate,Satisfaction
0,1,5.0,3.0,4.0,Yes,1.0
1,2,3.0,4.0,3.0,Yes,0.0
2,3,4.0,5.0,2.0,Yes,1.0
3,4,5.0,3.0,4.0,Yes,1.0
4,5,2.0,5.0,1.0,Yes,0.0


In [29]:
# Features and target variable
X = df.drop(columns=['Satisfaction', 'Customer'])
y = df['Satisfaction']

In [30]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [31]:
# Train the model
model.fit(X_train, y_train)

In [32]:
# Predict the labels of the test set
y_pred = model.predict(X_test)

In [33]:
# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

In [36]:
# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

1.0

In [35]:
print(f"Confusion Matrix:\n{cm}")
print(f"Accuracy: {accuracy}")

# Print classification report
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[1681    0]
 [   0 1379]]
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1681
         1.0       1.00      1.00      1.00      1379

    accuracy                           1.00      3060
   macro avg       1.00      1.00      1.00      3060
weighted avg       1.00      1.00      1.00      3060

