In [1]:
pip install pandas matplotlib seaborn fpdf scikit-learn joblib

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25ldone
[?25h  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=73c02b72fd9c875c5afe5c3897e647e68a11daf5b2906aea3ca7932d6b454ef8
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
import joblib
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
from wordcloud import WordCloud
from collections import Counter
import io
import tempfile
import os


In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Preparation

In [4]:
def load_data(file_path):
    return pd.read_csv(file_path)

In [5]:
df = load_data('/kaggle/input/data-customer-feedback/iphone.csv')
df.head()

Unnamed: 0,productAsin,country,date,isVerified,ratingScore,reviewTitle,reviewDescription,reviewUrl,reviewedIn,variant,variantAsin
0,B09G9BL5CP,India,11-08-2024,True,4,No charger,"Every thing is good about iPhones, there's not...",https://www.amazon.in/gp/customer-reviews/R345...,Reviewed in India on 11 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
1,B09G9BL5CP,India,16-08-2024,True,5,iPhone 13 256GB,"It look so fabulous, I am android user switche...",https://www.amazon.in/gp/customer-reviews/R2HJ...,Reviewed in India on 16 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
2,B09G9BL5CP,India,14-05-2024,True,4,Flip camera option nill,I tried to flip camera while recording but no ...,https://www.amazon.in/gp/customer-reviews/R3Y7...,Reviewed in India on 14 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
3,B09G9BL5CP,India,24-06-2024,True,5,Product,100% genuine,https://www.amazon.in/gp/customer-reviews/R1P9...,Reviewed in India on 24 June 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
4,B09G9BL5CP,India,18-05-2024,True,5,Good product,Happy to get the iPhone 13 in Amazon offer,https://www.amazon.in/gp/customer-reviews/R1XI...,Reviewed in India on 18 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98


In [6]:
def preprocess_text(text):
    if pd.isna(text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [7]:
def get_sentiment(rating):
    if pd.isna(rating):
        return 'unknown'
    rating = float(rating)
    if rating <= 2:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    else:
        return 'positive'

In [8]:
def process_with_spacy(text):
    if pd.isna(text):
        return [], []
    if not isinstance(text, str):
        text = str(text)
    doc = nlp(text)
    entities = [ent.label_ for ent in doc.ents]
    pos_tags = [token.pos_ for token in doc]
    return entities, pos_tags

In [9]:
def prepare_data(df):
    df['processed_feedback'] = df['reviewDescription'].apply(preprocess_text)
    df['sentiment'] = df['ratingScore'].apply(get_sentiment)
    df['entities'], df['pos_tags'] = zip(*df['reviewDescription'].apply(process_with_spacy))
    return df

In [10]:
df = prepare_data(df)
df.head()

Unnamed: 0,productAsin,country,date,isVerified,ratingScore,reviewTitle,reviewDescription,reviewUrl,reviewedIn,variant,variantAsin,processed_feedback,sentiment,entities,pos_tags
0,B09G9BL5CP,India,11-08-2024,True,4,No charger,"Every thing is good about iPhones, there's not...",https://www.amazon.in/gp/customer-reviews/R345...,Reviewed in India on 11 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,every thing good iphones theres nothing compar...,positive,"[ORG, NORP, DATE, ORG, ORG, ORG, ORG]","[DET, NOUN, AUX, ADJ, ADP, NOUN, PUNCT, PRON, ..."
1,B09G9BL5CP,India,16-08-2024,True,5,iPhone 13 256GB,"It look so fabulous, I am android user switche...",https://www.amazon.in/gp/customer-reviews/R2HJ...,Reviewed in India on 16 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,look fabulous android user switched apple perf...,positive,[ORG],"[PRON, VERB, ADV, ADJ, PUNCT, PRON, AUX, ADJ, ..."
2,B09G9BL5CP,India,14-05-2024,True,4,Flip camera option nill,I tried to flip camera while recording but no ...,https://www.amazon.in/gp/customer-reviews/R3Y7...,Reviewed in India on 14 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,tried flip camera recording facility added pau...,positive,[ORDINAL],"[PRON, VERB, PART, VERB, NOUN, SCONJ, VERB, CC..."
3,B09G9BL5CP,India,24-06-2024,True,5,Product,100% genuine,https://www.amazon.in/gp/customer-reviews/R1P9...,Reviewed in India on 24 June 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,100 genuine,positive,[PERCENT],"[NUM, NOUN, ADJ]"
4,B09G9BL5CP,India,18-05-2024,True,5,Good product,Happy to get the iPhone 13 in Amazon offer,https://www.amazon.in/gp/customer-reviews/R1XI...,Reviewed in India on 18 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,happy get iphone 13 amazon offer,positive,"[CARDINAL, ORG]","[ADJ, PART, VERB, DET, PROPN, NUM, ADP, PROPN,..."


In [11]:
df.to_csv('/kaggle/working/processed_feedback.csv', index=False)

# Model Building

In [12]:
def prepare_features(df):
    # Replace NaN values with an empty string
    df['processed_feedback'] = df['processed_feedback'].fillna('')
    
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(df['processed_feedback'])
    y = df['sentiment']
    return X, y, vectorizer

In [13]:
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
def train_models(X_train, y_train):
    models = {
        'Logistic Regression': LogisticRegression(),
        'Naive Bayes': MultinomialNB(),
        'SVM': SVC(),
        'Random Forest': RandomForestClassifier()
    }
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        print(f"{name} model trained.")
    
    return models

In [15]:
def save_models(models, vectorizer):
    for name, model in models.items():
        joblib.dump(model, f'/kaggle/working/{name.lower().replace(" ", "_")}_model.joblib')
    joblib.dump(vectorizer, '/kaggle/working/tfidf_vectorizer.joblib')


In [16]:
df = load_data('/kaggle/working/processed_feedback.csv')
df.head()

Unnamed: 0,productAsin,country,date,isVerified,ratingScore,reviewTitle,reviewDescription,reviewUrl,reviewedIn,variant,variantAsin,processed_feedback,sentiment,entities,pos_tags
0,B09G9BL5CP,India,11-08-2024,True,4,No charger,"Every thing is good about iPhones, there's not...",https://www.amazon.in/gp/customer-reviews/R345...,Reviewed in India on 11 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,every thing good iphones theres nothing compar...,positive,"['ORG', 'NORP', 'DATE', 'ORG', 'ORG', 'ORG', '...","['DET', 'NOUN', 'AUX', 'ADJ', 'ADP', 'NOUN', '..."
1,B09G9BL5CP,India,16-08-2024,True,5,iPhone 13 256GB,"It look so fabulous, I am android user switche...",https://www.amazon.in/gp/customer-reviews/R2HJ...,Reviewed in India on 16 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,look fabulous android user switched apple perf...,positive,['ORG'],"['PRON', 'VERB', 'ADV', 'ADJ', 'PUNCT', 'PRON'..."
2,B09G9BL5CP,India,14-05-2024,True,4,Flip camera option nill,I tried to flip camera while recording but no ...,https://www.amazon.in/gp/customer-reviews/R3Y7...,Reviewed in India on 14 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,tried flip camera recording facility added pau...,positive,['ORDINAL'],"['PRON', 'VERB', 'PART', 'VERB', 'NOUN', 'SCON..."
3,B09G9BL5CP,India,24-06-2024,True,5,Product,100% genuine,https://www.amazon.in/gp/customer-reviews/R1P9...,Reviewed in India on 24 June 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,100 genuine,positive,['PERCENT'],"['NUM', 'NOUN', 'ADJ']"
4,B09G9BL5CP,India,18-05-2024,True,5,Good product,Happy to get the iPhone 13 in Amazon offer,https://www.amazon.in/gp/customer-reviews/R1XI...,Reviewed in India on 18 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98,happy get iphone 13 amazon offer,positive,"['CARDINAL', 'ORG']","['ADJ', 'PART', 'VERB', 'DET', 'PROPN', 'NUM',..."


In [17]:
X, y, vectorizer = prepare_features(df)
X_train, X_test, y_train, y_test = split_data(X, y)
models = train_models(X_train, y_train)
save_models(models, vectorizer)

Logistic Regression model trained.
Naive Bayes model trained.
SVM model trained.
Random Forest model trained.


# Model Evaluation

In [18]:
def load_models_and_vectorizer():
    models = {
        'Logistic Regression': joblib.load('/kaggle/working/logistic_regression_model.joblib'),
        'Naive Bayes': joblib.load('/kaggle/working/naive_bayes_model.joblib'),
        'SVM': joblib.load('/kaggle/working/svm_model.joblib'),
        'Random Forest': joblib.load('/kaggle/working/random_forest_model.joblib')
    }
    vectorizer = joblib.load('/kaggle/working/tfidf_vectorizer.joblib')
    return models, vectorizer

In [19]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)
    return accuracy, precision, recall, f1, cm

In [20]:
def plot_confusion_matrix(cm, model_name):
    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.savefig(f'/kaggle/working/{model_name.lower().replace(" ", "_")}_cm.png')
    plt.close()

In [21]:
    models, vectorizer = load_models_and_vectorizer()
    
    X = vectorizer.transform(df['processed_feedback'])
    y = df['sentiment']
    
    for name, model in models.items():
        accuracy, precision, recall, f1, cm = evaluate_model(model, X, y)
        print(f"\nModel: {name}")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1-score: {f1}")
        plot_confusion_matrix(cm, name)
    
    print("Model evaluation completed.")


Model: Logistic Regression
Accuracy: 0.8680600914435009
Precision: 0.8803991996001094
Recall: 0.8680600914435009
F1-score: 0.8402253975171595


  _warn_prf(average, modifier, msg_start, len(result))



Model: Naive Bayes
Accuracy: 0.7975179621162639
Precision: 0.7571488658327531
Recall: 0.7975179621162639
F1-score: 0.7509452164838546

Model: SVM
Accuracy: 0.9288047028086218
Precision: 0.9331013081682361
Recall: 0.9288047028086218
F1-score: 0.922910486917031

Model: Random Forest
Accuracy: 0.9542782495101241
Precision: 0.9552440588962109
Recall: 0.9542782495101241
F1-score: 0.9532653940815476
Model evaluation completed.


# Sentiment Analysis Report

In [22]:
# Load data and models
df = pd.read_csv('/kaggle/working/processed_feedback.csv')
df['processed_feedback'] = df['processed_feedback'].fillna('')  # Handle NaN values
models = {
    'Logistic Regression': joblib.load('/kaggle/working/logistic_regression_model.joblib'),
    'Naive Bayes': joblib.load('/kaggle/working/naive_bayes_model.joblib'),
    'SVM': joblib.load('/kaggle/working/svm_model.joblib'),
    'Random Forest': joblib.load('/kaggle/working/random_forest_model.joblib')
}
vectorizer = joblib.load('/kaggle/working/tfidf_vectorizer.joblib')

# Prepare data
X = vectorizer.transform(df['processed_feedback'])
y = df['sentiment']

In [23]:
# Evaluate models
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, average='weighted')
    cm = confusion_matrix(y, y_pred)
    return accuracy, precision, recall, f1, cm

results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X, y)


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Function to save BytesIO image to a temporary file
def save_bytesio_to_file(bytesio):
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
    temp_filename = temp_file.name
    temp_file.write(bytesio.getvalue())
    temp_file.close()
    return temp_filename

In [25]:
# Create visualizations
def plot_sentiment_distribution():
    plt.figure(figsize=(10, 6))
    sentiment_counts = df['sentiment'].value_counts()
    colors = ['#ff9999', '#66b3ff', '#99ff99']
    plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=colors, startangle=90)
    plt.title('Distribution of Sentiment Categories')
    img_buffer = io.BytesIO()
    plt.savefig(img_buffer, format='png')
    img_buffer.seek(0)
    plt.close()
    return img_buffer

In [26]:
def plot_confusion_matrices():
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    for (name, result), ax in zip(results.items(), axes.ravel()):
        sns.heatmap(result[4], annot=True, fmt='d', ax=ax, cmap='YlGnBu')
        ax.set_title(f'Confusion Matrix - {name}')
    plt.tight_layout()
    img_buffer = io.BytesIO()
    plt.savefig(img_buffer, format='png')
    img_buffer.seek(0)
    plt.close()
    return img_buffer

In [27]:
def plot_model_comparison():
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    model_names = list(results.keys())
    data = np.array([[result[i] for i in range(4)] for result in results.values()])
    
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(metrics))
    width = 0.2
    
    for i, model in enumerate(model_names):
        ax.bar(x + i*width, data[i], width, label=model)
    
    ax.set_ylabel('Score')
    ax.set_title('Model Performance Comparison')
    ax.set_xticks(x + width * 1.5)
    ax.set_xticklabels(metrics)
    ax.legend(loc='lower right')
    
    img_buffer = io.BytesIO()
    plt.savefig(img_buffer, format='png')
    img_buffer.seek(0)
    plt.close()
    return img_buffer

In [28]:
def generate_wordcloud():
    text = ' '.join(df['processed_feedback'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Customer Feedback')
    
    img_buffer = io.BytesIO()
    plt.savefig(img_buffer, format='png')
    img_buffer.seek(0)
    plt.close()
    return img_buffer

In [29]:
def plot_feedback_length_distribution():
    df['feedback_length'] = df['processed_feedback'].apply(len)
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='feedback_length', hue='sentiment', kde=True, palette='Set2')
    plt.title('Distribution of Feedback Length by Sentiment')
    plt.xlabel('Feedback Length (characters)')
    plt.ylabel('Count')
    
    img_buffer = io.BytesIO()
    plt.savefig(img_buffer, format='png')
    img_buffer.seek(0)
    plt.close()
    return img_buffer

In [30]:
# Generate visualizations
sentiment_dist_img = plot_sentiment_distribution()
confusion_matrices_img = plot_confusion_matrices()
model_comparison_img = plot_model_comparison()
wordcloud_img = generate_wordcloud()
feedback_length_img = plot_feedback_length_distribution()

  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)


In [31]:
# Save BytesIO images to temporary files
sentiment_dist_file = save_bytesio_to_file(sentiment_dist_img)
confusion_matrices_file = save_bytesio_to_file(confusion_matrices_img)
model_comparison_file = save_bytesio_to_file(model_comparison_img)
wordcloud_file = save_bytesio_to_file(wordcloud_img)
feedback_length_file = save_bytesio_to_file(feedback_length_img)

In [32]:
# Generate report
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Sentiment Analysis Report', 0, 1, 'C')
    
    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

pdf = PDF()
pdf.add_page()

In [33]:
# 1. Executive Summary
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, '1. Executive Summary', 0, 1)
pdf.set_font('Arial', '', 12)
pdf.multi_cell(0, 10, 'This report presents the results of our sentiment analysis project on customer feedback. We analyzed X customer reviews and classified them into positive, neutral, and negative sentiments using various machine learning models.')


[]

In [34]:
# 2. Data Overview
pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, '2. Data Overview', 0, 1)
pdf.set_font('Arial', '', 12)
pdf.cell(0, 10, f'Total number of feedback entries: {len(df)}', 0, 1)
pdf.cell(0, 10, 'Distribution of sentiment categories:', 0, 1)
pdf.image(sentiment_dist_file, x=10, y=pdf.get_y()+10, w=180)

In [35]:
# 3. Methodology
pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, '3. Methodology', 0, 1)
pdf.set_font('Arial', '', 12)
pdf.multi_cell(0, 10, 'We used the following machine learning models for sentiment analysis: Logistic Regression, Naive Bayes, Support Vector Machine (SVM), and Random Forest. The text data was preprocessed and vectorized using TF-IDF.')


[]

In [36]:
# 4. Model Performance Metrics
pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, '4. Model Performance Metrics', 0, 1)
pdf.image(model_comparison_file, x=10, y=pdf.get_y()+10, w=180)

pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, '4. Model Performance Metrics', 0, 1)
pdf.set_font('Arial', '', 12)
data = [['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']]
for name, result in results.items():
    data.append([name, f'{result[0]:.4f}', f'{result[1]:.4f}', f'{result[2]:.4f}', f'{result[3]:.4f}'])

col_width = pdf.w / 5
row_height = 10
for row in data:
    for item in row:
        pdf.cell(col_width, row_height, str(item), border=1)
    pdf.ln(row_height)

In [37]:
# 5. Confusion Matrices
pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, '5. Confusion Matrices', 0, 1)
pdf.image(confusion_matrices_file, x=10, y=pdf.get_y()+10, w=180)

In [38]:
# 6. Key Insights
pdf.add_page()
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, '6. Key Insights', 0, 1)
pdf.set_font('Arial', '', 12)
pdf.multi_cell(0, 10, 'Based on our analysis:\n- The majority of customer feedback is positive.\n- Negative feedback often mentions specific product features.\n- Neutral feedback tends to be more factual and less emotional.')
pdf.image(wordcloud_file, x=10, y=pdf.get_y()+10, w=180)
pdf.image(feedback_length_file, x=10, y=pdf.get_y()+110, w=180)

In [39]:
# Save the PDF
pdf.output('sentiment_analysis_report.pdf')

# Clean up temporary files
os.unlink(sentiment_dist_file)
os.unlink(confusion_matrices_file)
os.unlink(model_comparison_file)
os.unlink(wordcloud_file)
os.unlink(feedback_length_file)