Imports


In [6]:
# Cell 1 - Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import matplotlib
matplotlib.use('TkAgg')  # Or 'Qt5Agg' if TkAgg fails

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


 Create folder

In [7]:
os.makedirs("plots", exist_ok=True)
os.makedirs("models", exist_ok=True)


Current Directory check

In [8]:
import os
print("📁 Current directory:", os.getcwd())


📁 Current directory: c:\Users\joyos\Downloads\CUSTOMER_SATISFACTION_PREDICTION\plots


Load Dataset

In [12]:
import pandas as pd
data = pd.read_csv("../data/customer_support_tickets.csv")
print("🔹 Data loaded. Shape:", data.shape)


🔹 Data loaded. Shape: (8469, 17)


Drop rows with missing satisfaction ratings

In [13]:
# Cell 4 - Drop rows with missing satisfaction ratings
data = data.dropna(subset=["Customer Satisfaction Rating"])


Encode categorical variables



In [14]:
# Cell 5 - Encode categorical variables
categorical_cols = ['Customer Gender', 'Product Purchased', 'Ticket Type',
                    'Ticket Status', 'Ticket Priority', 'Ticket Channel']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


Segmentation (Console)

In [15]:
# Cell 6 - Segmentation (Console)
print("\n📊 Segmentation by Ticket Type:")
print(data.groupby('Ticket Type').size())

print("\n📊 Segmentation by Satisfaction Rating:")
print(data.groupby('Customer Satisfaction Rating').size())




📊 Segmentation by Ticket Type:
Ticket Type
0    544
1    516
2    533
3    596
4    580
dtype: int64

📊 Segmentation by Satisfaction Rating:
Customer Satisfaction Rating
1.0    553
2.0    549
3.0    580
4.0    543
5.0    544
dtype: int64


 Plot: Satisfaction Distribution

In [16]:
# Cell 7 - Plot: Satisfaction Distribution
sns.set(style="whitegrid")
plt.figure(figsize=(8, 5))
sns.histplot(data['Customer Satisfaction Rating'], bins=5, kde=True, color='skyblue')
plt.title("Customer Satisfaction Distribution")
plt.savefig("plots/satisfaction_distribution.png")
plt.show()


Plot: Ticket Status

In [17]:
# Cell 8 - Plot: Ticket Status
plt.figure(figsize=(6, 6))
data['Ticket Status'].value_counts().plot.pie(autopct='%1.1f%%',
    colors=sns.color_palette('pastel'), startangle=140)
plt.title("Ticket Status Distribution")
plt.ylabel('')
plt.savefig("plots/ticket_status_distribution.png")
plt.show()


Customer Age Distribution

In [18]:
# Cell 9 - Customer Age Distribution
plt.figure(figsize=(8, 5))
sns.histplot(data['Customer Age'], bins=20, kde=True, color='salmon')
plt.title("Customer Age Distribution")
plt.savefig("plots/age_distribution.png")
plt.show()


Gender Distribution

In [19]:
# Cell 10 - Gender Distribution
plt.figure(figsize=(6, 6))
data['Customer Gender'].value_counts().plot.pie(
    autopct='%1.1f%%', colors=sns.color_palette('Set2'), startangle=90)
plt.title("Customer Gender Distribution")
plt.ylabel('')
plt.savefig("plots/gender_distribution.png")
plt.show()


Ticket Channel Distribution

In [20]:
# Cell 11 - Ticket Channel Distribution
plt.figure(figsize=(10, 6))
channel_counts = data['Ticket Channel'].value_counts()
sns.barplot(x=channel_counts.index, y=channel_counts.values, palette='rocket')
plt.title("Ticket Channel Distribution")
plt.xticks(rotation=45)
plt.savefig("plots/ticket_channel_distribution.png")
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=channel_counts.index, y=channel_counts.values, palette='rocket')


Avg Satisfaction by Gender

In [21]:
# Cell 12 - Avg Satisfaction by Gender
plt.figure(figsize=(8, 5))
avg_satisfaction = data.groupby('Customer Gender')['Customer Satisfaction Rating'].mean().reset_index()
sns.barplot(x='Customer Gender', y='Customer Satisfaction Rating', data=avg_satisfaction, palette='muted')
plt.title("Average Satisfaction by Gender")
plt.ylim(1, 5)
plt.savefig("plots/avg_satisfaction_by_gender.png")
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Customer Gender', y='Customer Satisfaction Rating', data=avg_satisfaction, palette='muted')


Top 10 Products Purchased

In [22]:
# Cell 13 - Top 10 Products Purchased
plt.figure(figsize=(10, 6))
top_products = data['Product Purchased'].value_counts().head(10)
sns.barplot(y=top_products.index, x=top_products.values, palette='magma')
plt.title("Top 10 Products Purchased")
plt.savefig("plots/top_products.png")
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_products.index, x=top_products.values, palette='magma')


Ticket Type Distribution

In [23]:
# Cell 14 - Ticket Type Distribution
plt.figure(figsize=(7, 6))
data['Ticket Type'].value_counts().plot.pie(autopct='%1.1f%%',
    colors=sns.color_palette('pastel'))
plt.title("Ticket Type Distribution")
plt.ylabel('')
plt.savefig("plots/ticket_type_distribution.png")
plt.show()




Ticket Priority Distribution

In [25]:
# Cell 15 - Ticket Priority Distribution
plt.figure(figsize=(7, 6))
data['Ticket Priority'].value_counts().plot.pie(autopct='%1.1f%%',
    colors=sns.color_palette('pastel'))
plt.title("Ticket Priority Distribution")
plt.ylabel('')
plt.savefig("plots/ticket_priority_distribution.png")
plt.show()


Age Group Ticket Volume

In [26]:
# Cell 16 - Age Group Ticket Volume
bins = [0, 20, 30, 40, 50, 60, 70, 80, 100]
labels = ['0-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-100']
data['Age Group'] = pd.cut(data['Customer Age'], bins=bins, labels=labels)
age_group_counts = data['Age Group'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
age_group_counts.plot(kind='bar', color='skyblue')
plt.title("Tickets Raised by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Number of Tickets")
plt.grid(axis='y')
plt.savefig("plots/tickets_by_age_group.png")
plt.show()


Ticket Type vs Age (Facet Grid)

In [27]:
# Cell 17 - Ticket Type vs Age (Facet Grid)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
g = sns.FacetGrid(data, col='Ticket Type', col_wrap=3, height=4)
g.map(sns.histplot, 'Customer Age', bins=20, kde=True)
g.set_titles('{col_name}')
g.set_axis_labels('Age', 'No. of Tickets')
g.fig.suptitle("Distribution of Ticket Types by Age", fontsize=14)
plt.subplots_adjust(top=0.85)
g.savefig("plots/facet_ticket_type_by_age.png")
plt.show()


Drop columns before ML

In [28]:
# Cell 18 - Drop columns before ML
data = data.drop(columns=['Customer Name', 'Customer Email', 'Ticket ID',
                          'Ticket Subject', 'Ticket Description', 'Resolution',
                          'Date of Purchase', 'First Response Time',
                          'Time to Resolution', 'Age Group'], errors='ignore')


Train-Test Split

In [29]:
# Cell 19 - Train-Test Split
X = data.drop('Customer Satisfaction Rating', axis=1)
y = data['Customer Satisfaction Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Scale features

In [30]:
# Cell 20 - Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Train Random Forest Model

In [31]:
# Cell 21 - Train Random Forest Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Evaluate Model

In [32]:
# Cell 22 - Evaluate Model
y_pred = model.predict(X_test)
print("\n🎯 Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



🎯 Model Accuracy: 0.19855595667870035
Classification Report:
               precision    recall  f1-score   support

         1.0       0.17      0.17      0.17       168
         2.0       0.17      0.17      0.17       174
         3.0       0.24      0.25      0.25       175
         4.0       0.21      0.20      0.20       162
         5.0       0.20      0.21      0.20       152

    accuracy                           0.20       831
   macro avg       0.20      0.20      0.20       831
weighted avg       0.20      0.20      0.20       831



Feature Importance Plot

In [33]:
# Cell 23 - Feature Importance Plot
feature_importances = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
plt.figure(figsize=(8, 6))
feature_importances.plot(kind='barh', color='skyblue')
plt.title("Top 10 Feature Importances")
plt.tight_layout()
plt.savefig("plots/feature_importance.png")
plt.show()


Save Model

In [34]:
# Cell 24 - Save Model
joblib.dump(model, "models/random_forest_model.pkl")
print("\n✅ Model saved to models/random_forest_model.pkl")
print("📊 All graphs also shown and saved to plots/")



✅ Model saved to models/random_forest_model.pkl
📊 All graphs also shown and saved to plots/
