INITIALIZATION <img src="doggie.png" style="width:40px;height:40px"/>

In [1]:
#for data table, directories etc
import pandas as pd
import numpy as np
import os

#for text pre-processing
import re

#for NLPTown BERT (text processing)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

#for machine learning (LogisticRegression)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier

#for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

#for saving trained and fine-tuned model
import pickle

#for visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
from PIL import Image
from collections import Counter

#for summary analysis
from contextlib import redirect_stdout




In [2]:
#load the raw feedback dataset (relative to the current working directory) for sentiment analysis
dir_path = os.path.join(os.getcwd(), 'NLP process/dataset_preNLP.csv')
df = pd.read_csv(dir_path)
df


Unnamed: 0,feedback_id,order_id,customer_id,rating,feedback_text,feedback_category,sentiment,feedback_year,feedback_month,feedback_day,day_type,area,customer_segment,orders_time,delivery_status,delivery_time_mins
0,2234710,1961864118,30065862,4,"It was okay, nothing special.",Delivery,Neutral,2024,7,17,Weekday,Allahabad,Regular,08:34,On Time,-5
1,5450964,1549769649,9573071,3,The order was incorrect.,App Experience,Negative,2024,5,28,Weekday,Thrissur,New,13:14,On Time,2
2,482108,9185164487,45477575,3,"It was okay, nothing special.",App Experience,Neutral,2024,9,23,Weekday,Vellore,Inactive,13:07,On Time,4
3,4823104,9644738826,88067569,4,The product met my expectations.,App Experience,Neutral,2023,11,24,Weekday,Gaya,Premium,16:16,On Time,-1
4,3537464,5427684290,83298567,3,Product was damaged during delivery.,Delivery,Negative,2023,11,20,Weekday,Asansol,Premium,05:00,On Time,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,9024060,1669690997,62600289,3,Taste was not as expected.,Product Quality,Negative,2023,12,25,Weekday,Udaipur,Inactive,15:46,On Time,5
4996,4715968,8340761903,53640286,3,"Average experience, could improve.",Product Quality,Neutral,2023,11,27,Weekday,Mathura,Premium,09:18,On Time,-2
4997,9621021,5936301790,87059497,4,"Average experience, could improve.",App Experience,Neutral,2024,6,21,Weekday,Jamshedpur,Regular,19:09,On Time,3
4998,1134095,5710579377,67310893,4,"Average experience, could improve.",Delivery,Neutral,2024,6,6,Weekday,Chennai,New,14:58,On Time,-2


DATA PROCESSING FOR NLP <img src="doggie.png" style="width:40px;height:40px"/>

In [3]:
#text pre-processing
#removing emojis
def rmv_emotes(text):
    """Return ``text`` with every run of emoji / pictograph characters removed."""
    #unicode ranges treated as emoji; joined below into one character class
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Symbols & pictographs
        "\U0001F680-\U0001F6FF",  # Transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # Flags
        "\U00002700-\U000027BF",  # Dingbats
        "\U0001F900-\U0001F9FF",  # Supplemental symbols and pictographs
        "\U00002600-\U000026FF",  # Miscellaneous symbols
    )
    emoji_pattern = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

#remove punctuations, symbols, digits, underscore
def rmv_inTxt(text):
    """Strip digits, underscores and punctuation/symbols from ``text``.

    Characters are deleted (not replaced by a space) and surrounding
    whitespace is trimmed after each pass.
    """
    #pass 1: digits and underscores
    without_digits = re.sub(r'[\d_]', '', text).strip()
    #pass 2: anything that is neither a word character nor whitespace
    without_symbols = re.sub(r'[^\w\s]', '', without_digits)
    return without_symbols.strip()

#get feedback text only
#keep only the feedback text and clean it in two passes: emojis first, then digits/punctuation
text_df = (
    df['feedback_text']
    .apply(rmv_emotes)
    .apply(rmv_inTxt)
    .to_frame(name='Text')
)
text_df

Unnamed: 0,Text
0,It was okay nothing special
1,The order was incorrect
2,It was okay nothing special
3,The product met my expectations
4,Product was damaged during delivery
...,...
4995,Taste was not as expected
4996,Average experience could improve
4997,Average experience could improve
4998,Average experience could improve


In [4]:
#load the NLPTown BERT tokenizer + model and wrap them in a sentiment pipeline
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_model = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

#feed the cleaned texts to the pipeline in slices of batch_size and collect every prediction
batch_size = 512
texts = text_df['Text'].tolist()
results = []
for start in range(0, len(texts), batch_size):
    results.extend(sentiment_model(texts[start:start + batch_size]))

Device set to use cpu


In [5]:
#mapping sentiment on rating
def map_sentiment(star_label):
    """Convert a label such as "4 stars" into ``(star, sentiment)``.

    The leading integer of the label is the star count; 1-2 map to
    "negative", 3 to "neutral" and anything above to "positive".
    """
    #nlp labels look like "1 star", "2 stars", ...
    star = int(star_label.split()[0])
    if star == 3:
        return star, "neutral"
    if star <= 2:
        return star, "negative"
    return star, "positive"

#derive the star rating and sentiment from every predicted label and keep the rounded score
stars, sentiments = zip(*map(map_sentiment, (res['label'] for res in results)))
text_df['rating'] = stars
text_df['sentiment'] = sentiments
text_df['score'] = [round(res['score'], 2) for res in results]

#persist the labelled text to a .csv file in the working directory
dir_path = os.path.join(os.getcwd(), 'analyzed_sentiment_score.csv')
text_df.to_csv(dir_path, index=False)
text_df


Unnamed: 0,Text,rating,sentiment,score
0,It was okay nothing special,3,neutral,0.80
1,The order was incorrect,1,negative,0.44
2,It was okay nothing special,3,neutral,0.80
3,The product met my expectations,5,positive,0.50
4,Product was damaged during delivery,1,negative,0.55
...,...,...,...,...
4995,Taste was not as expected,2,negative,0.44
4996,Average experience could improve,3,neutral,0.60
4997,Average experience could improve,3,neutral,0.60
4998,Average experience could improve,3,neutral,0.60


In [6]:
#reload the sentiment labels written by the BERT step
dir_path = os.path.join(os.getcwd(), 'analyzed_sentiment_score.csv')
sent_df = pd.read_csv(dir_path)

#features are the cleaned text; targets are the encoded sentiment and the star rating
le = LabelEncoder()
sent_df['sentiment_encoded'] = le.fit_transform(sent_df['sentiment'])
X = sent_df['Text']
y = sent_df[['sentiment_encoded', 'rating']]

#80% train / 20% test, seeded for reproducibility and stratified on the sentiment class
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y['sentiment_encoded'],
)

#TF-IDF on unigrams + bigrams capped at 5000 features; fitted on the training split only
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

#one logistic regression per target column
log_rg = LogisticRegression(max_iter=2000)
multi_cls = MultiOutputClassifier(log_rg)
multi_cls.fit(X_train_vec, y_train)

#evaluate every target column separately on the held-out split
log_pred = multi_cls.predict(X_test_vec)
y_test_vals = y_test.values
print('Logistic Regression Results')
for i, col in enumerate(y_test.columns):
    y_true_col, y_pred_col = y_test_vals[:, i], log_pred[:, i]
    acc = accuracy_score(y_true_col, y_pred_col)
    report = classification_report(y_true_col, y_pred_col)
    print(f'Accuracy for {col} : {acc:.3f}')
    print(f'Classification report for {col}:\n{report}\n')





Logistic Regression Results
Accuracy for sentiment_encoded : 1.000
Classification report for sentiment_encoded:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       328
           1       1.00      1.00      1.00       271
           2       1.00      1.00      1.00       401

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000


Accuracy for rating : 1.000
Classification report for rating:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       163
           2       1.00      1.00      1.00       165
           3       1.00      1.00      1.00       271
           4       1.00      1.00      1.00        71
           5       1.00      1.00      1.00       330

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted a

In [7]:
#adding simple tuning to LogisticRegression with GridSearchCV
#hyperparameter grid; the 'estimator__' prefix targets the LogisticRegression wrapped by MultiOutputClassifier
params = {
    'estimator__C': [0.1, 1, 5],
    'estimator__penalty': ['l2'],
    'estimator__solver': ['lbfgs', 'newton-cg'],
    'estimator__max_iter': [1000, 2000]
}
multi_cls = MultiOutputClassifier(LogisticRegression())
grid_lr = GridSearchCV(multi_cls, param_grid=params, cv=3)
grid_lr.fit(X_train_vec, y_train)

#identify best parameters and score for LogisticRegression
#every reported line is also collected so the same report can be written to disk
report_file = []
print('Best parameters: ', grid_lr.best_params_)
report_file.append(f'Best parameters: {grid_lr.best_params_}\n')
print('Best cross-validation score: ', grid_lr.best_score_)
report_file.append(f'Best cross-validation score: {grid_lr.best_score_}\n')

#evaluation after tuning, one target column at a time
grid_lr_pred = grid_lr.best_estimator_.predict(X_test_vec)
print('\nLogisticRegression Result After Tuning')
for i, col in enumerate(y_test.columns):
    #accuracy has to be computed here from the tuned model's predictions;
    #reusing an `acc` left over from an earlier cell would report the untuned model
    acc = accuracy_score(y_test[col], grid_lr_pred[:, i])
    acc_line = f'Accuracy for {col} : {acc:.3f}'
    print(acc_line)
    cr = classification_report(y_test[col], grid_lr_pred[:, i])
    print(cr)
    report_file.append(f'{acc_line}\n')
    report_file.append(f'{cr}\n')

#save report to .txt file
with open('model_classification_report.txt', 'w', encoding='utf-8') as file:
    file.writelines(report_file)




Best parameters:  {'estimator__C': 0.1, 'estimator__max_iter': 1000, 'estimator__penalty': 'l2', 'estimator__solver': 'lbfgs'}
Best cross-validation score:  1.0

LogisticRegression Result After Tuning
Accuracy for sentiment_encoded : 1.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       328
           1       1.00      1.00      1.00       271
           2       1.00      1.00      1.00       401

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

Accuracy for rating : 1.000
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       163
           2       1.00      1.00      1.00       165
           3       1.00      1.00      1.00       271
           4       1.00      1.00      1.00        71
           5       1.00      1.00      1.00       330

    accuracy             

In [8]:
#predict on the full dataset with the tuned model
dir_path = os.path.join(os.getcwd(), 'analyzed_sentiment_score.csv')
sent_df = pd.read_csv(dir_path)

#use the cleaned 'Text' column: it is the text the vectorizer was fitted on, so
#inference goes through the same pre-processing as training (the raw feedback_text
#still contains the digits/underscores that were removed before fitting)
predictions = grid_lr.best_estimator_.predict(vectorizer.transform(sent_df['Text']))
pred_df = pd.DataFrame(predictions, columns=['predicted_sentiment', 'predicted_rating'])

#decode the sentiment class back to its label and combine with the cleaned text dataset
pred_df['predicted_sentiment'] = le.inverse_transform(pred_df['predicted_sentiment'])
sent_df[['predicted_sentiment', 'predicted_rating']] = pred_df

#save to .csv file
dir_path = os.path.join(os.getcwd(), 'prediction_sentiment.csv')
sent_df.to_csv(dir_path, index=False)

#save trained and fine-tuned model
#NOTE(review): only the grid-search object is pickled; using it elsewhere also needs the
#fitted `vectorizer` and label encoder `le`, which are not saved here
dir_path = os.path.join(os.getcwd(), 'sentiment_rating_classifier.pkl')
with open(dir_path, 'wb') as file:
    pickle.dump(grid_lr, file)

Sentiment-Feedback Analysis on Blinkit data using Fine-tuned Model <img src="doggie.png" style="width:40px;height:40px"/>

In [9]:
#apply the predicted sentiments, ratings and cleaned text to the original dataset
pred_df = pd.read_csv(os.path.join(os.getcwd(), 'prediction_sentiment.csv'))
df = pd.read_csv(os.path.join(os.getcwd(), 'NLP process/dataset_preNLP.csv'))

#overwrite each original column with its predicted / cleaned counterpart
column_sources = (
    ('sentiment', 'predicted_sentiment'),
    ('rating', 'predicted_rating'),
    ('feedback_text', 'Text'),
)
for target_col, source_col in column_sources:
    df[target_col] = pred_df[source_col]

#store the dataset used by the analysis cells
dir_path = os.path.join(os.getcwd(), 'dataset_for_analysis.csv')
df.to_csv(dir_path, index=False)
df


Unnamed: 0,feedback_id,order_id,customer_id,rating,feedback_text,feedback_category,sentiment,feedback_year,feedback_month,feedback_day,day_type,area,customer_segment,orders_time,delivery_status,delivery_time_mins
0,2234710,1961864118,30065862,3,It was okay nothing special,Delivery,neutral,2024,7,17,Weekday,Allahabad,Regular,08:34,On Time,-5
1,5450964,1549769649,9573071,1,The order was incorrect,App Experience,negative,2024,5,28,Weekday,Thrissur,New,13:14,On Time,2
2,482108,9185164487,45477575,3,It was okay nothing special,App Experience,neutral,2024,9,23,Weekday,Vellore,Inactive,13:07,On Time,4
3,4823104,9644738826,88067569,5,The product met my expectations,App Experience,positive,2023,11,24,Weekday,Gaya,Premium,16:16,On Time,-1
4,3537464,5427684290,83298567,1,Product was damaged during delivery,Delivery,negative,2023,11,20,Weekday,Asansol,Premium,05:00,On Time,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,9024060,1669690997,62600289,2,Taste was not as expected,Product Quality,negative,2023,12,25,Weekday,Udaipur,Inactive,15:46,On Time,5
4996,4715968,8340761903,53640286,3,Average experience could improve,Product Quality,neutral,2023,11,27,Weekday,Mathura,Premium,09:18,On Time,-2
4997,9621021,5936301790,87059497,3,Average experience could improve,App Experience,neutral,2024,6,21,Weekday,Jamshedpur,Regular,19:09,On Time,3
4998,1134095,5710579377,67310893,3,Average experience could improve,Delivery,neutral,2024,6,6,Weekday,Chennai,New,14:58,On Time,-2


In [10]:
#correlation analysis to find relationships between features
#drop identifiers and the raw time string; the order hour is kept as a numeric feature
df_sub = df.drop(columns=['feedback_id','order_id','customer_id','orders_time'])
df_sub['hour'] = pd.to_datetime(df['orders_time'],format='%H:%M').dt.hour

#encode every text column with its own throwaway encoder so the notebook-level `le`
#(fitted on the sentiment labels and used for inverse_transform) is not overwritten
for col in df_sub.select_dtypes(include=['object']).columns:
    df_sub[col] = LabelEncoder().fit_transform(df_sub[col])

corr = df_sub.corr()
fig = px.imshow(corr,text_auto=True,color_continuous_scale='RdBu_r',aspect='auto',title='Correlation Analysis')
fig.update_xaxes(tickangle=45)

#highlight sentiment: outline its column and its row in the heatmap
highlight_col = ['sentiment']
n = len(corr)
shapes = []
for col in highlight_col:
    if col not in corr.columns:
        continue
    idx = corr.columns.get_loc(col)
    outline = dict(line=dict(color="yellow", width=3), fillcolor="rgba(0,0,0,0)")
    # vertical band around the column
    shapes.append(dict(type="rect",
                       x0=idx-0.5, x1=idx+0.5,
                       y0=-0.5, y1=n-0.5,
                       **outline))
    # horizontal band around the row
    shapes.append(dict(type="rect",
                       x0=-0.5, x1=n-0.5,
                       y0=idx-0.5, y1=idx+0.5,
                       **outline))
fig.update_layout(shapes=shapes,title_font_weight='bold')
fig.show()


In [15]:
#overall sentiment distribution analysis
def customer_analysis(df):
    """Show a grouped bar chart of feedback counts per customer segment and sentiment."""
    #count feedback ids for every (customer segment, sentiment) pair
    segment_counts = (
        df[['feedback_id','customer_segment','sentiment']]
        .rename(columns={'feedback_id':'id','customer_segment':'customer'})
        .groupby(['customer','sentiment'])['id']
        .count()
        .reset_index(name='count')
    )

    #grouped bars, one colour per sentiment, each bar labelled with its count
    axis_labels = {'customer':'Customer Type','count':'No.of Feedback','sentiment':'Sentiment'}
    fig = px.bar(
        segment_counts,
        x='customer',
        y='count',
        color='sentiment',
        text='count',
        title='Sentiment by Customer Type',
        barmode='group',
        labels=axis_labels,
    )
    fig.update_traces(textposition='outside',textfont_size=11)
    fig.update_layout(title_x=0.5, title_font_weight='bold')
    fig.show()

def category_analysis(df):
    """Show a grouped bar chart of feedback counts per feedback category and sentiment."""
    #count feedback ids for every (feedback category, sentiment) pair
    category_counts = (
        df[['feedback_id','feedback_category','sentiment']]
        .rename(columns={'feedback_id':'id','feedback_category':'category'})
        .groupby(['category','sentiment'])['id']
        .count()
        .reset_index(name='count')
    )

    #grouped bars, one colour per sentiment, each bar labelled with its count
    axis_labels = {'category':'Feedback Category','count':'No.of Feedback','sentiment':'Sentiment'}
    fig = px.bar(
        category_counts,
        x='category',
        y='count',
        color='sentiment',
        text='count',
        title='Sentiment by Feedback Category',
        barmode='group',
        labels=axis_labels,
    )
    fig.update_traces(textposition='outside',textfont_size=11)
    fig.update_layout(title_x=0.5, title_font_weight='bold')
    fig.show()

def count_analysis(df):
    """Show the overall sentiment distribution as a bar chart and a pie chart side by side."""
    #number of feedback ids per sentiment
    sentiment_counts = (
        df[['feedback_id','sentiment']]
        .rename(columns={'feedback_id':'id'})
        .groupby(['sentiment'])['id']
        .count()
        .reset_index(name='count')
    )

    #the two source figures: bar (left panel) and pie (right panel)
    bar_fig = px.bar(sentiment_counts,x='sentiment',y='count',color='sentiment',color_discrete_sequence=px.colors.qualitative.Plotly)
    pie_fig = px.pie(sentiment_counts,values='count',names='sentiment')

    #copy the traces of both figures into one 1x2 subplot grid
    fig = make_subplots(rows=1,cols=2,subplot_titles=('Sentiment Distribution (Bar)','Sentiment Distribution (Pie)'),
                        specs=[[{'type':'bar'},{'type':'pie'}]])
    for source_fig, panel_col in ((bar_fig, 1), (pie_fig, 2)):
        for trace in source_fig.data:
            fig.add_trace(trace,row=1,col=panel_col)

    #subplot titles are stored as layout annotations; make them bold
    for subplot_title in fig['layout']['annotations']:
        subplot_title['font']['weight'] = 'bold'

    fig.update_xaxes(title_text="Sentiment", row=1, col=1)
    fig.update_yaxes(title_text="No. of Feedback", row=1, col=1)
    fig.update_traces(textinfo='label + percent',row=1,col=2)
    fig.update_layout(height=500,width=1000,showlegend=False)
    fig.show()

def delivery_analysis(df):
    """Relate sentiment to delivery status (sunburst) and to delivery minutes (line chart)."""
    delivery_df = df[['feedback_id','sentiment','delivery_status','delivery_time_mins']].rename(
        columns={'feedback_id':'id','delivery_status':'status','delivery_time_mins':'delay_time'})

    #feedback count per (delivery status, sentiment)
    status_counts = delivery_df.groupby(['status','sentiment'])['id'].count().reset_index(name='count')

    #feedback count per (delivery minutes, sentiment), ordered by minutes
    delay_counts = (
        delivery_df.groupby(['delay_time','sentiment'])['id']
        .count()
        .reset_index(name='count')
        .sort_values(by='delay_time',ascending=True)
    )

    #source figures: sunburst for delivery status, line chart for delivery minutes
    sunburst_fig = px.sunburst(status_counts,path=['status','sentiment'],values='count',color='status', color_discrete_sequence=px.colors.qualitative.Plotly)
    sunburst_fig.update_traces(textinfo='label+percent entry')
    line_fig = px.line(delay_counts,x='delay_time',y='count',color='sentiment',markers=True)

    #stack both figures vertically in one 2x1 subplot grid
    fig = make_subplots(rows=2,cols=1,subplot_titles=('Sentiment Distribution on Delivery Status','Sentiment Distribution on Delivery Minutes'),
                        specs=[[{'type':'sunburst'}],[{'type':'xy'}]])
    for source_fig, panel_row in ((sunburst_fig, 1), (line_fig, 2)):
        for trace in source_fig.data:
            fig.add_trace(trace,row=panel_row,col=1)

    #subplot titles are stored as layout annotations; make them bold
    for subplot_title in fig['layout']['annotations']:
        subplot_title['font']['weight'] = 'bold'

    fig.update_xaxes(title_text='Delay (mins)',row=2,col=1)
    fig.update_yaxes(title_text='No.of Feedback',row=2,col=1)
    legend_style = dict(title="Sentiment",x=0.5,y=-0.15,orientation='h',xanchor='center',yanchor='bottom')
    fig.update_layout(height=800,width=1000,legend=legend_style)

    fig.show()

#run the sentiment distribution analyses on the notebook-level df;
#only the delivery view is enabled - uncomment a call below to render that figure too
#customer_analysis(df)
#category_analysis(df)
#count_analysis(df)
delivery_analysis(df)


In [19]:
#time-based analysis
def year_analysis(df):
    """Show a grouped bar chart of feedback counts per year and sentiment.

    Reads the 'feedback_id', 'sentiment' and 'feedback_year' columns of ``df``
    and displays the figure; nothing is returned.
    """
    #count feedback ids for every (year, sentiment) pair
    year_counts = (
        df[['feedback_id','sentiment','feedback_year']]
        .rename(columns={'feedback_id':'id','feedback_year':'year'})
        .groupby(['year','sentiment'])['id']
        .count()
        .reset_index(name='count')
    )

    #plotting graph
    fig = px.bar(year_counts,x='year',y='count',color='sentiment',text='count',title='Sentiment Trend by Year', barmode='group',
                 labels={'year':'Year','count':'No.of Feedback','sentiment':'Sentiment'})

    fig.update_traces(textposition='outside',textfont_size=11)
    #one tick per year actually present in the data instead of a fixed (2023, 2024) pair
    fig.update_xaxes(tickvals=sorted(year_counts['year'].unique().tolist()))
    #leave headroom above the tallest bar so its outside label stays visible
    fig.update_yaxes(range=[None,int(year_counts['count'].max())+200])
    fig.update_layout(title_x=0.5, title_font_weight='bold')
    fig.show()

def month_analysis(df):
    """Show the monthly feedback count per sentiment as a line chart.

    Reads the 'feedback_id', 'sentiment' and 'feedback_month' columns of ``df``.
    For every sentiment the month(s) with the highest and lowest count are
    annotated on the chart. The figure is displayed; nothing is returned.
    """
    #get data month and sentiment
    temp_df = pd.DataFrame()
    temp_df[['id','sentiment','month']] = df[['feedback_id','sentiment','feedback_month']]

    #map month number to month name (a month outside 1-12 raises KeyError)
    monthMap = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'June',7:'July',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
    temp_df['monthName'] = temp_df['month'].map(lambda x:monthMap[x])

    #group month and sentiment
    temp_df = temp_df.groupby(['month','monthName','sentiment'])['id'].count().reset_index(name='count')
    #sort_values returns a new frame, so the result must be assigned;
    #a stable sort keeps the per-month sentiment order produced by the groupby
    temp_df = temp_df.sort_values(by='month',ascending=True,kind='stable')

    #month names in chronological order, used to pin the x-axis category order
    month_order = list(dict.fromkeys(temp_df['monthName']))

    #plotting graph
    fig = px.line(temp_df,x='monthName',y='count',color='sentiment',markers=True, title='Sentiment Trend by Month',
                  labels={'monthName':'Month','count':'No.of Feedback','sentiment':'Sentiment'},
                  category_orders={'monthName':month_order})
    fig.update_layout(title_x=0.5, title_font_weight='bold')

    #annotate min and max for each sentiment
    for sentiment in temp_df['sentiment'].unique():
        df_sent = temp_df[temp_df['sentiment'] == sentiment]
        #(label, extreme value, text colour, vertical arrow offset)
        extremes = (
            ('Max', df_sent['count'].max(), 'green', -15),
            ('Min', df_sent['count'].min(), 'red', 45),
        )
        for label, value, colour, y_offset in extremes:
            #ties are all annotated
            for _, row in df_sent[df_sent['count'] == value].iterrows():
                fig.add_annotation(
                    x=row['monthName'],
                    y=row['count'],
                    text=f"{label}: {row['count']}", font=dict(color=colour, size=10.5),
                    showarrow=True, arrowhead=2, ax=0, ay=y_offset
                )
    fig.show()

def day_analysis(df):
    """Show sentiment by weekday/weekend (sunburst) and by day of week (grouped bars).

    Reads 'feedback_id', 'sentiment', 'day_type', 'feedback_year',
    'feedback_month' and 'feedback_day' from ``df``. The figure is displayed;
    nothing is returned.
    """
    #get data day,month,year and sentiment
    temp_df = pd.DataFrame()
    temp_df[['id','sentiment','day_type','year','month','day']] = df[['feedback_id','sentiment','day_type','feedback_year', 'feedback_month', 'feedback_day']]
    temp_df['date'] =  pd.to_datetime(temp_df[['year', 'month', 'day']])
    temp_df['dayName'] = temp_df['date'].dt.day_name()
    #drop returns a new frame, so the result must be assigned for the columns to go away
    temp_df = temp_df.drop(columns=['date','day','month'])

    #group day_type, day and sentiment; dayNo orders the days Monday..Sunday
    dayMap = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}
    temp_df['dayNo'] = temp_df['dayName'].map(lambda x:dayMap[x])
    temp_df = temp_df.groupby(['day_type','dayNo','dayName','sentiment'])['id'].count().reset_index(name='count')
    temp_df = temp_df.sort_values(by='dayNo',ascending=True)

    #plotting graph for day_type and sentiment
    fig = make_subplots(rows=2,cols=1, subplot_titles=('Sentiment Distribution in Weekday/Weekend','Sentiment Trend by Day'), specs=[[{'type':'sunburst'}],[{'type':'bar'}]])
    fig1 = px.sunburst(temp_df,path=['day_type','dayName','sentiment'],values='count',color='dayName', color_discrete_sequence=px.colors.qualitative.Plotly)
    fig1.update_traces(textinfo='label+percent entry')

    #plotting graph for day and sentiment
    fig2 = px.bar(temp_df,x='dayName',y='count',color='sentiment',barmode='group')

    #adding to one figure
    for trace in fig1.data:
        fig.add_trace(trace,row=1,col=1)
    for trace in fig2.data:
        fig.add_trace(trace,row=2,col=1)

    #subplot titles are stored as layout annotations; make them bold
    for annotation in fig['layout']['annotations']:
        annotation['font']['weight'] = 'bold'

    fig.update_xaxes(title_text="Day", row=2, col=1)
    fig.update_yaxes(title_text="No. of Feedback", row=2, col=1)
    #headroom of 100 above the tallest bar
    fig.update_yaxes(range=[None,max(temp_df['count']+100)],row=2,col=1)
    fig.update_layout(height=800,width=1000,legend=dict(title="Sentiment",x=0.5,y=-0.15,orientation='h',xanchor='center',yanchor='bottom'))
    fig.show()
    
def hour_analysis(df):
    """Show sentiment by part of day (sunburst) and by order hour (line chart)."""
    #extract the order hour and classify it into a part of the day
    hourly_df = df[['feedback_id','sentiment','orders_time']].rename(
        columns={'feedback_id':'id','orders_time':'order_time'})
    hourly_df['hour'] = pd.to_datetime(hourly_df['order_time'], format='%H:%M').dt.hour
    hourly_df['hr_type'] = hourly_df['hour'].apply(get_hrType)

    #feedback count per (part of day, hour, sentiment), ordered by hour
    hourly_counts = (
        hourly_df.groupby(['hr_type','hour','sentiment'])['id']
        .count()
        .reset_index(name='count')
        .sort_values(by='hour',ascending=True)
    )

    #source figures: sunburst for part of day, line chart for the hourly trend
    sunburst_fig = px.sunburst(hourly_counts,path=['hr_type','sentiment'],values='count',color='hr_type', color_discrete_sequence=px.colors.qualitative.Plotly)
    sunburst_fig.update_traces(textinfo='label+percent entry')
    line_fig = px.line(hourly_counts,x='hour',y='count',color='sentiment',markers=True)

    #stack both figures vertically in one 2x1 subplot grid
    fig = make_subplots(rows=2,cols=1, subplot_titles=('Sentiment Distribution in Morning/Afternoon/Evening/Night','Sentiment Trend by Hour'),
                         specs=[[{'type':'sunburst'}],[{'type':'xy'}]])
    for source_fig, panel_row in ((sunburst_fig, 1), (line_fig, 2)):
        for trace in source_fig.data:
            fig.add_trace(trace,row=panel_row,col=1)

    #subplot titles are stored as layout annotations; make them bold
    for subplot_title in fig['layout']['annotations']:
        subplot_title['font']['weight'] = 'bold'

    fig.update_xaxes(title_text="Hour", row=2, col=1)
    fig.update_xaxes(range=[0,24],dtick=2,row=2,col=1)
    fig.update_yaxes(title_text="No. of Feedback", row=2, col=1)
    legend_style = dict(title="Sentiment",x=0.5,y=-0.15,orientation='h',xanchor='center',yanchor='bottom')
    fig.update_layout(height=800,width=1000,legend=legend_style)
    fig.show()

    
def get_hrType(hour):
    """Classify an hour of the day as 'Morning', 'Afternoon', 'Evening' or 'Night'.

    6-11 is Morning, 12-15 Afternoon, 16-19 Evening; every other value is Night.
    """
    #half-open [start, end) windows checked in order
    day_parts = (
        (6, 12, 'Morning'),
        (12, 16, 'Afternoon'),
        (16, 20, 'Evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return 'Night'
    
#run the time-based analyses on the notebook-level df;
#only the hourly view is enabled - uncomment a call below to render that figure too
#year_analysis(df)
#month_analysis(df)
#day_analysis(df)
hour_analysis(df)

In [20]:
#geographical analysis
def geo_analysis(df):
    """Show sentiment counts for the five areas with the most feedback.

    Reads the 'feedback_id', 'sentiment' and 'area' columns of ``df``. The top
    panel is a grouped bar chart, the bottom panel a scatter of the same
    counts. The figure is displayed; nothing is returned.
    """
    #feedback count per (area, sentiment)
    temp_df = pd.DataFrame()
    temp_df[['id','sentiment','area']] = df[['feedback_id','sentiment','area']]
    temp_df = temp_df.groupby(['area','sentiment'])['id'].count().reset_index(name='count')

    #keep only the five areas with the largest total feedback count
    top_areas = temp_df.groupby('area')['count'].sum().nlargest(5).index
    temp_df_top = temp_df[temp_df['area'].isin(top_areas)]

    #graph for top5 areas/sentiment
    fig1 = px.bar(temp_df_top,x='area',y='count',color='sentiment', text='count', barmode='group',
                 labels={'area':'Area','sentiment':'Sentiment','count':'No.of Feedback'})

    fig2 = px.scatter(temp_df_top,y='area',x='count',color='sentiment',symbol='sentiment')

    fig1.update_traces(textposition='outside',textfont_size=11)
    fig2.update_traces(marker_size=10)

    #section for both graphs; each panel gets its own legend
    fig = make_subplots(rows=2, cols=1,subplot_titles = ('Top 5 Areas with Most Feedback', 'Sentiment Distribution in Top 5 Areas'),
                              specs=[[{'type':'bar'}],[{'type':'scatter'}]])
    for trace in fig1.data:
        trace.legend = "legend1"
        fig.add_trace(trace,row=1,col=1)
    for trace in fig2.data:
        trace.legend = "legend2"
        fig.add_trace(trace,row=2,col=1)

    #subplot titles are stored as layout annotations; make them bold
    for annotation in fig['layout']['annotations']:
        annotation['font']['weight'] = 'bold'

    lowest_count = min(temp_df_top['count'])
    highest_count = max(temp_df_top['count'])
    #leave headroom above the tallest bar: the count labels are drawn outside the bars
    #and would be cut off if the axis stopped exactly at the maximum
    fig.update_yaxes(range=[None,highest_count*1.15],row=1,col=1)
    fig.update_xaxes(range=[lowest_count-2,highest_count+2], dtick=2, row=2, col=1)
    fig.update_layout(height=800, width=1000,legend1=dict(title='Sentiment',x=1.15,y=1,xanchor="right",yanchor="top"),
                      legend2=dict(title='Sentiment',x=1.15,y=0,xanchor="right",yanchor="bottom"),
                      xaxis_title='Area',yaxis_title='No.of Feedback',xaxis2_title='No.of Feedback',yaxis2_title='Area')
    fig.show()



#render the geographical sentiment figure for the notebook-level df
geo_analysis(df)

In [24]:
#word analysis
def wordcloud_analysis(df):
    """Build one word cloud per sentiment and show the three images side by side.

    Reads the 'feedback_text' and 'sentiment' columns of ``df``. Each word
    cloud is written by ``wordcloud_generator`` to
    ``<cwd>/graph_image/<sentiment>_wordcloud.png`` and then loaded back for
    display. Nothing is returned.
    """
    #lower-cased feedback text with its sentiment
    temp_df = pd.DataFrame()
    temp_df[['text','sentiment']] = df[['feedback_text','sentiment']]
    temp_df['text'] = temp_df['text'].str.lower()

    #make sure the output folder exists before any image is written to it
    dir_path = os.path.join(os.getcwd(),'graph_image')
    os.makedirs(dir_path, exist_ok=True)

    #one panel per sentiment, in the same order as the subplot titles
    fig = make_subplots(rows=1,cols=3,subplot_titles=('Positive Wordcloud','Negative Wordcloud','Neutral Wordcloud'))
    for panel_col, sentiment in enumerate(('positive','negative','neutral'), start=1):
        #all feedback of this sentiment joined into a single string
        sentiment_text = " ".join(temp_df[temp_df['sentiment']==sentiment]['text'])
        image_path = os.path.join(dir_path, f'{sentiment}_wordcloud.png')

        #generate the word cloud image, then load it back for plotting
        wordcloud_generator(sentiment_text, image_path)
        fig.add_trace(go.Image(z=Image.open(image_path)), row=1, col=panel_col)

    #subplot titles are stored as layout annotations; restyle and lift them slightly
    for annotation in fig['layout']['annotations']:
        annotation['font']['weight'] = 'bold'
        annotation['y'] = 1.05
        annotation['font']['family'] = 'Bahnschrift'
        annotation['font']['size'] = 20

    fig.update_layout(showlegend=False,)
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    fig.show()

#wordcloud creation function
def wordcloud_generator(text,file_path):
    """Render ``text`` as a 300x300 word cloud on a white background and save it to ``file_path``."""
    cloud = WordCloud(width=300, height=300, background_color='white')
    cloud.generate(text)
    cloud.to_file(file_path)



def com_word_analysis(df):
    """Show the 20 most frequent words (longer than 5 characters) across all feedback.

    Reads the 'feedback_text' and 'sentiment' columns of ``df`` and displays a
    horizontal bar chart coloured by relative frequency. Nothing is returned.

    Raises:
        ValueError: if no word longer than 5 characters is found.
    """
    #get feedback text and sentiment
    temp_df = pd.DataFrame()
    temp_df[['text','sentiment']] = df[['feedback_text','sentiment']]
    temp_df['text'] = temp_df['text'].str.lower()
    temp_df['text_list'] = temp_df['text'].apply(lambda x:str(x).split())

    #finding 20 most common words used; most_common already returns them by descending frequency
    all_word = [word for words in temp_df['text_list'] for word in words if len(word)>5]
    top_word = Counter(all_word).most_common(20)
    if not top_word:
        raise ValueError('No words longer than 5 characters found in feedback_text')

    #extract word and frequency count
    words = [word for word,_ in top_word]
    freq = [count for _,count in top_word]

    #color mapping: scale frequencies to 0..1; when every word has the same
    #frequency the spread is 0, so use one flat colour instead of dividing by zero
    colorscale = [(0,"#dcc0e7"),(0.3,"#b979d4"),(0.6,"#8e2eb8"),(1,"#70069d")]
    min_freq = min(freq)
    spread = max(freq) - min_freq
    color_values = [(f - min_freq) / spread if spread else 1.0 for f in freq]

    #plotting horizontal bar graph, most frequent word at the top
    fig = px.bar(x=freq, y=words, color=color_values, color_continuous_scale=colorscale, orientation='h')
    fig.update_yaxes(autorange='reversed')
    fig.update_layout(title='Top 20 Most Common Words', yaxis_title="Word", xaxis_title="Frequency", coloraxis_showscale=False, title_font_weight='bold')
    fig.show()

def pos_word_analysis(df):
    #get feedback text and sentiment
    temp_df = pd.DataFrame()
    temp_df[['text','sentiment']] = df[['feedback_text','sentiment']]
    temp_df['text'] = temp_df['text'].str.lower()
    temp_df['text_list'] = temp_df['text'].apply(lambda x:str(x).split())
    
    #finding 10 most positive words used
    all_word = [word for words in temp_df[temp_df['sentiment']=='positive']['text_list'] for word in words if len(word)>5] 
    top_word = Counter(all_word).most_common(10)
    top_word = sorted(top_word,key=lambda x:x[1],reverse=True)

    #extract word and frequency count 
    words = [word for word,freq in top_word]
    freq = [freq for word,freq in top_word]

    #color mapping
    colorscale = [(0,"#bed8dc"),(0.3,"#88c6dd"),(0.6,"#2e93b8"),(1,"#067f9d")]
    min_freq = min(freq)
    max_freq = max(freq)
    color_values = [(f - min_freq) / (max_freq - min_freq) for f in freq]

    #plotting horizontal bar graph
    fig = px.bar(x=freq, y=words, color=color_values, color_continuous_scale=colorscale, orientation='h')
    fig.update_yaxes(autorange='reversed')
    fig.update_layout(title='Top 10 Most Common Positive Words', yaxis_title="Word", xaxis_title="Frequency", coloraxis_showscale=False, title_font_weight='bold')
    fig.show()

def neg_word_analysis(df):
    #get feedback text and sentiment
    temp_df = pd.DataFrame()
    temp_df[['text','sentiment']] = df[['feedback_text','sentiment']]
    temp_df['text'] = temp_df['text'].str.lower()
    temp_df['text_list'] = temp_df['text'].apply(lambda x:str(x).split())
    
    #finding 10 most negative words used
    all_word = [word for words in temp_df[temp_df['sentiment']=='negative']['text_list'] for word in words if len(word)>5] 
    top_word = Counter(all_word).most_common(10)
    top_word = sorted(top_word,key=lambda x:x[1],reverse=True)

    #extract word and frequency count 
    words = [word for word,freq in top_word]
    freq = [freq for word,freq in top_word]

    #color mapping
    colorscale = [(0,"#dcbed6"),(0.3,"#dd88cb"),(0.6,"#b82ea8"),(1,"#9d067c")]
    min_freq = min(freq)
    max_freq = max(freq)
    color_values = [(f - min_freq) / (max_freq - min_freq) for f in freq]

    #plotting horizontal bar graph
    fig = px.bar(x=freq, y=words, color=color_values, color_continuous_scale=colorscale, orientation='h')
    fig.update_yaxes(autorange='reversed')
    fig.update_layout(title='Top 10 Most Common Negative Words', yaxis_title="Word", xaxis_title="Frequency", coloraxis_showscale=False, title_font_weight='bold')
    fig.show()

def neu_word_analysis(df):
    #get feedback text and sentiment
    temp_df = pd.DataFrame()
    temp_df[['text','sentiment']] = df[['feedback_text','sentiment']]
    temp_df['text'] = temp_df['text'].str.lower()
    temp_df['text_list'] = temp_df['text'].apply(lambda x:str(x).split())
    
    #finding 10 most neutral words used
    all_word = [word for words in temp_df[temp_df['sentiment']=='neutral']['text_list'] for word in words if len(word)>5] 
    top_word = Counter(all_word).most_common(10)
    top_word = sorted(top_word,key=lambda x:x[1],reverse=True)

    #extract word and frequency count 
    words = [word for word,freq in top_word]
    freq = [freq for word,freq in top_word]

    #color mapping
    colorscale = [(0,"#d3ab8c"),(0.3,"#cc985d"),(0.6,"#b86b27"),(1,"#9f4e08")]
    min_freq = min(freq)
    max_freq = max(freq)
    color_values = [(f - min_freq) / (max_freq - min_freq) for f in freq]

    #plotting horizontal bar graph
    fig = px.bar(x=freq, y=words, color=color_values, color_continuous_scale=colorscale, orientation='h')
    fig.update_yaxes(autorange='reversed')
    fig.update_layout(title='Top 10 Most Common Neutral Words', yaxis_title="Word", xaxis_title="Frequency", coloraxis_showscale=False, title_font_weight='bold')
    fig.show()

#run the analysis on df; only the neutral-word chart is active,
#uncomment any of the other calls to render that chart as well
#wordcloud_analysis(df)
#com_word_analysis(df)
#pos_word_analysis(df)
#neg_word_analysis(df)
neu_word_analysis(df)




Summary Analysis <img src="doggie.png" style="width:40px;height:40px"/>

In [25]:
#destination text file for the written summary
summ_file = "feedback_analysis_summary.txt"

#read the dataset used for the analysis
df = pd.read_csv('dataset_for_analysis.csv')

#assemble one datetime column from the separate year/month/day columns
df['date'] = pd.to_datetime(
    df[['feedback_year', 'feedback_month', 'feedback_day']].rename(
        columns={'feedback_year': 'year', 'feedback_month': 'month', 'feedback_day': 'day'}
    )
)

def _print_percentages(dist):
    """Print one tab-indented 'label: percentage' line per entry of dist."""
    for label, share in dist.items():
        print(f"\t{label}: {share:.2f}%")

def _print_counts(dist, total):
    """Print one 'label: count (share of total)' line per entry of dist."""
    for label, n_rows in dist.items():
        print(f"   {label}: {n_rows:,} feedback ({n_rows/total*100:.2f}%)")

#everything printed inside the inner block goes to the summary file, not the notebook
with open(summ_file, "w", encoding="utf-8") as file:
    with redirect_stdout(file):
        #summary statistics
        print("\n   -----📈Summary Statistics-----\n")
        print(f"   Total Feedback: {len(df):,}")
        first_day = df['date'].min().date()
        last_day = df['date'].max().date()
        print(f"   Date Range: {first_day} to {last_day}")

        #sentiment distribution (percent of rows per sentiment)
        print("\n   -----Sentiment Distribution-----\n")
        sent_dist = df['sentiment'].value_counts(normalize=True)*100
        _print_percentages(sent_dist)

        #rating distribution (percent of rows per rating)
        print("\n   -----Rating Distribution-----\n")
        rating_dist = df['rating'].value_counts(normalize=True)*100
        _print_percentages(rating_dist)

        #feedback category distribution
        print("\n   -----Feedback Category Distribution-----\n")
        cat_dist = df['feedback_category'].value_counts()
        _print_counts(cat_dist, len(df))

        #customer segment distribution
        print("\n   -----Customer Category Distribution-----\n")
        custom_dist = df['customer_segment'].value_counts()
        _print_counts(custom_dist, len(df))

        #five areas with the most feedback rows
        print("\n   -----Top 5 Areas by Volume-----\n")
        top_area = df['area'].value_counts().head(5)
        for area_name, n_rows in top_area.items():
            print(f"\t{area_name}: {n_rows:,} feedback")

        #delivery status distribution plus the mean of delivery_time_mins
        print("\n   -----Delivery Status Distribution-----\n")
        deli_dist = df['delivery_status'].value_counts()
        _print_counts(deli_dist, len(df))
        print(f"   Average Delay Duration: {df['delivery_time_mins'].mean():.0f}mins")
        print("\n\t✨ Analysis Complete! ✨")

#stdout is restored here, so this confirmation shows up in the notebook output
print(f"\n✅ Summary analysis saved to {summ_file}")




✅ Summary analysis saved to feedback_analysis_summary.txt
