# Feature engineering .........

In [20]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv(r"C:\Users\IQRA SHAIKH\OneDrive\Documents\GitHub\Customer_Satisfaction_prediction\project\customer_support_tickets.csv")
df.head()

Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [8]:
df.isnull().sum()

Ticket ID                          0
Customer Name                      0
Customer Email                     0
Customer Age                       0
Customer Gender                    0
Product Purchased                  0
Date of Purchase                   0
Ticket Type                        0
Ticket Subject                     0
Ticket Description                 0
Ticket Status                      0
Resolution                      5700
Ticket Priority                    0
Ticket Channel                     0
First Response Time             2819
Time to Resolution              5700
Customer Satisfaction Rating    5700
Response Delay (hrs)            2819
Resolution Time (hrs)           5700
dtype: int64

In [None]:
# Filling missing text fields
df['Resolution'].fillna('Not Provided', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Resolution'].fillna('Not Provided', inplace=True)


In [None]:
# Handling missing values
df['Customer Satisfaction Rating'] = df['Customer Satisfaction Rating'].fillna(0)

In [3]:
# Converting date columns to datetime
df['Date of Purchase'] = pd.to_datetime(df['Date of Purchase'], errors='coerce')
df['First Response Time'] = pd.to_datetime(df['First Response Time'], errors='coerce')
df['Time to Resolution'] = pd.to_datetime(df['Time to Resolution'], errors='coerce')

In [4]:
# Time between purchase and first response
df['Response Delay (hrs)'] = (df['First Response Time'] - df['Date of Purchase']).dt.total_seconds() / 3600

In [5]:
# Time between first response and resolution
df['Resolution Time (hrs)'] = (df['Time to Resolution'] - df['First Response Time']).dt.total_seconds() / 3600

In [11]:
# Filling numerical features with median
df['Response Delay (hrs)'].fillna(df['Response Delay (hrs)'].median(), inplace=True)
df['Resolution Time (hrs)'].fillna(df['Resolution Time (hrs)'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Response Delay (hrs)'].fillna(df['Response Delay (hrs)'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Resolution Time (hrs)'].fillna(df['Resolution Time (hrs)'].median(), inplace=True)


In [12]:
df.isnull().sum()

Ticket ID                          0
Customer Name                      0
Customer Email                     0
Customer Age                       0
Customer Gender                    0
Product Purchased                  0
Date of Purchase                   0
Ticket Type                        0
Ticket Subject                     0
Ticket Description                 0
Ticket Status                      0
Resolution                         0
Ticket Priority                    0
Ticket Channel                     0
First Response Time             2819
Time to Resolution              5700
Customer Satisfaction Rating       0
Response Delay (hrs)               0
Resolution Time (hrs)              0
dtype: int64

In [13]:
# Droping unused datetime columns now that their derived columns are ready
df.drop(['First Response Time', 'Time to Resolution'], axis=1, inplace=True)

In [14]:
df.head(3)

Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,Customer Satisfaction Rating,Response Delay (hrs),Resolution Time (hrs)
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,Not Provided,Critical,Social media,0.0,19236.26,0.166667
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,Not Provided,Critical,Chat,0.0,17776.760556,0.166667
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,3.0,25259.243889,6.85


In [16]:
# Dropping columns not needed for modeling or recommendation
df.drop(columns=[
    'Ticket ID',
    'Customer Name',
    'Customer Email',
    'Ticket Subject',
    'Ticket Description',
    'Date of Purchase',       # Already used to create 'Purchase Month'
], inplace=True)

In [17]:
df.head(3)

Unnamed: 0,Customer Age,Customer Gender,Product Purchased,Ticket Type,Ticket Status,Resolution,Ticket Priority,Ticket Channel,Customer Satisfaction Rating,Response Delay (hrs),Resolution Time (hrs)
0,32,Other,GoPro Hero,Technical issue,Pending Customer Response,Not Provided,Critical,Social media,0.0,19236.26,0.166667
1,42,Female,LG Smart TV,Technical issue,Pending Customer Response,Not Provided,Critical,Chat,0.0,17776.760556,0.166667
2,48,Other,Dell XPS,Technical issue,Closed,Case maybe show recently my computer follow.,Low,Social media,3.0,25259.243889,6.85


In [None]:
#  Creating bins for Age Group
bins = [0, 20, 30, 40, 50, 60, 70, 100]
labels = ['<20', '20-29', '30-39', '40-49', '50-59', '60-69', '70+']
df['Age Group'] = pd.cut(df['Customer Age'], bins=bins, labels=labels)

In [19]:
%pip install scikit-learn

# label encoding categorical features
from sklearn.preprocessing import LabelEncoder
# Encode categoricals
cat_cols = ['Customer Gender', 'Product Purchased', 'Ticket Type', 'Ticket Status',
            'Resolution', 'Ticket Priority', 'Ticket Channel', 'Age Group']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
customer-satisfaction-prediction 0.0.1 requires jupyter, which is not installed.
customer-satisfaction-prediction 0.0.1 requires openpyxl, which is not installed.
customer-satisfaction-prediction 0.0.1 requires plotly, which is not installed.
customer-satisfaction-prediction 0.0.1 requires pmdarima, which is not installed.
customer-satisfaction-prediction 0.0.1 requires statsmodels, which is not installed.
customer-satisfaction-prediction 0.0.1 requires streamlit, which is not installed.
customer-satisfaction-prediction 0.0.1 requires tqdm, which is not installed.
customer-satisfaction-prediction 0.0.1 requires xlrd, which is not installed.
movie-recommender-system 0.0.1 requires numpy==1.26.4, but you have numpy 2.3.1 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To u

Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.0-cp312-cp312-win_amd64.whl (10.7 MB)
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached scipy-1.16.0-cp312-cp312-win_amd64.whl (38.4 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.1 scikit-learn-1.7.0 scipy-1.16.0 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [23]:
df.head(3)

Unnamed: 0,Customer Age,Customer Gender,Product Purchased,Ticket Type,Ticket Status,Resolution,Ticket Priority,Ticket Channel,Customer Satisfaction Rating,Response Delay (hrs),Resolution Time (hrs),Age Group
0,32,2,16,4,2,1530,0,3,0.0,19236.26,0.166667,1
1,42,0,21,4,2,1530,0,0,0.0,17776.760556,0.166667,2
2,48,2,10,4,0,343,2,3,3.0,25259.243889,6.85,2


In [None]:
# Save in the target project folder
df.to_csv(r"C:\Users\IQRA SHAIKH\OneDrive\Documents\GitHub\Customer_Satisfaction_prediction\project\processed_customer_satisfaction.csv", index=False)
print("✅ Saved processed dataset to 'project' folder.")

✅ Saved processed dataset to 'project' folder.


# 🎯 Satisfaction Prediction

In [21]:
# ----------------------------
X = df.drop('Customer Satisfaction Rating', axis=1)
y = df['Customer Satisfaction Rating'].astype(int)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)