In [1]:
import pandas as pd

# Load the review workbook and keep only the rows whose 'text' cell is filled.
df = pd.read_excel("bikedekho_ev_reviews.xlsx").dropna(subset=['text'])


In [2]:
import re
import string

import emoji
import nltk
from nltk.corpus import stopwords

# English stop words as a set, so the membership test in clean_review is O(1).
# The corpus is not shipped with the nltk package: where it has never been
# fetched, stopwords.words() raises LookupError. Download it once and retry
# rather than failing the whole notebook run on a fresh environment.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

def clean_review(text, keep_words=frozenset({"no", "not", "nor"})):
    """Normalise one raw review into lowercase, ASCII-only, space-separated tokens.

    Steps, in order: best-effort repair of UTF-8 text that was decoded as
    Latin-1, emoji removal, HTML tag removal, replacement of each remaining
    run of non-ASCII characters with a space, lowercasing, deletion of ASCII
    punctuation, and stop-word removal.

    Args:
        text: Raw review. Any non-string value (e.g. NaN) yields "".
        keep_words: Lowercase tokens that are kept even when they appear in
            ``stop_words``. Defaults to the negators "no", "not" and "nor":
            they are in NLTK's English stop-word list, and dropping them turns
            "not good" into "good", which inverts the score computed from the
            cleaned text. Pass an empty collection to apply the stop-word list
            without exceptions.

    Returns:
        The cleaned review as one string; "" when no token survives.
    """
    if not isinstance(text, str):
        return ""

    # Best-effort mojibake repair. The round trip can only fail with a codec
    # error (UnicodeEncodeError / UnicodeDecodeError, both UnicodeError), in
    # which case the text is kept unchanged. Nothing else is swallowed.
    try:
        text = text.encode('latin1').decode('utf-8')
    except UnicodeError:
        pass

    # Remove emojis using the emoji library
    text = emoji.replace_emoji(text, replace='')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Replace every run of non-ASCII characters with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Lowercase, then delete ASCII punctuation
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words, except the ones explicitly protected by keep_words
    words = [
        word for word in text.split()
        if word in keep_words or word not in stop_words
    ]

    return ' '.join(words)

In [6]:
# Sentiment analysis using VADER

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# One analyzer instance, created once here and reused by every call to
# get_sentiment_score.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_score(cleaned_review):
    """Return the 'compound' polarity score of one cleaned review.

    The review is passed to ``analyzer.polarity_scores``; only the
    'compound' entry of the resulting mapping is returned.
    """
    polarity = analyzer.polarity_scores(cleaned_review)
    return polarity['compound']

def classify_sentiment(score, threshold=0.05):
    """Map a compound sentiment score to "Positive", "Negative" or "Neutral".

    Args:
        score: Compound score to classify.
        threshold: Non-negative half-width of the neutral band. A score at or
            above ``threshold`` is "Positive", a score at or below
            ``-threshold`` is "Negative", anything strictly in between is
            "Neutral". Defaults to 0.05.

    Returns:
        "Positive", "Negative" or "Neutral". A NaN score compares false
        against both bounds and is therefore reported as "Neutral".

    Raises:
        ValueError: If ``threshold`` is negative, which would make the
            positive and negative ranges overlap.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"


In [7]:
# Build the derived columns one after another; every step reads the column
# written by the step before it, so the order of the entries matters.
for target_col, source_col, transform in (
    ('cleaned_review', 'text', clean_review),
    ('sentiment_score', 'cleaned_review', get_sentiment_score),
    ('sentiment_label', 'sentiment_score', classify_sentiment),
):
    df[target_col] = df[source_col].apply(transform)


In [8]:
# Write the scored reviews to CSV as UTF-8, without the DataFrame index column.
df.to_csv(
    "bikedekho_reviews_with_sentiment_cleaned.csv",
    encoding='utf-8',
    index=False,
)

In [10]:
# Write the same table to an Excel workbook, again without the index column.
df.to_excel(
    "bikedekho_reviews_with_sentiment_cleaned.xlsx",
    index=False,
)