In [None]:
# Import Libraries
!pip install vaderSentiment
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK corpora needed by the stopword filter and the lemmatizer
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Read the review dataset (adjust the path if the CSV lives elsewhere)
df = pd.read_csv('/content/ama.csv')

# Preview the first rows of the freshly loaded frame
print("Initial Data:", df.head(), sep="\n")

#------------------------------------------------------------------


#if 'reviewText' not in df.columns:
   # if 'Review Text' in df.columns:
    #    df = df.rename(columns={'Review Text': 'review_content'})  # Rename column to 'reviewText'
    #else:
     #   raise KeyError("Neither 'reviewText' nor 'Review Text' column found in the DataFrame.")


# Data Cleaning Function
def clean_text(text):
    """Strip URLs, @mentions, '#' signs and non-letters from ``text``, then lowercase it.

    The substitutions run in the order listed below; every match is replaced
    by the empty string, and whitespace is left untouched.
    """
    substitutions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'\@\w+|\#', 0),                            # mentions and the hashtag symbol
        (r'[^A-Za-z\s]', 0),                         # anything that is not a letter or whitespace
    )
    for pattern, flags in substitutions:
        text = re.sub(pattern, '', text, flags=flags)
    return text.lower()

# Build the cleaned text column; every value is stringified before cleaning
df['cleaned_review'] = df['review_content'].map(lambda raw: clean_text(str(raw)))

# Shared NLP resources used by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing Function
def preprocess_text(text):
    """Split ``text`` on whitespace, drop stopwords, lemmatize, and re-join with spaces.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.
    """
    kept = (
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )
    return ' '.join(kept)

# Apply preprocessing: derive 'processed_review' from the cleaned text
df['processed_review'] = df['cleaned_review'].apply(preprocess_text)

# Handle missing values
# Report per-column null counts before dropping anything.
print("\nMissing Values Before:")
print(df.isnull().sum())

# Keep only rows that have both a review text and a rating count.
# NOTE(review): 'cleaned_review' was built with str(x), so a missing review was
# already turned into the text "nan" there; such rows are removed here because
# 'review_content' itself is still null.
df = df.dropna(subset=['review_content', 'rating_count'])

# Report the null counts again to confirm the effect of the drop.
print("\nMissing Values After:")
print(df.isnull().sum())

# Single VADER analyzer instance shared by every scoring call
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['compound']

# Apply sentiment analysis to the ORIGINAL review text.
# VADER is a lexicon/rule-based scorer that uses negations ("not", "no"),
# intensifiers ("very"), capitalization and punctuation. The processed text has
# had stopwords (which include those negations and intensifiers) removed and was
# lowercased and stripped of punctuation, so scoring it would, for example,
# rate "not good" as positive. 'processed_review' is still available for
# token-level analysis.
df['sentiment_score'] = df['review_content'].astype(str).apply(get_sentiment_score)

# Categorize sentiments
def categorize_sentiment(score):
    """Map a compound score to 'positive', 'negative' or 'neutral'.

    Scores of 0.05 and above are positive, scores of -0.05 and below are
    negative, and everything else is neutral.
    """
    if score <= -0.05:
        return 'negative'
    if score >= 0.05:
        return 'positive'
    return 'neutral'

df['sentiment'] = df['sentiment_score'].map(categorize_sentiment)

# Show how many reviews landed in each sentiment category
print("\nSentiment Distribution:", df['sentiment'].value_counts(), sep="\n")

# Convert reviewTime to datetime.
# read_csv loads dates as plain strings and the `.dt` accessor on the next line
# only works on datetime-like columns, so the column must be parsed first.
# errors='coerce' turns unparseable values into NaT; groupby skips NaT keys, so
# those rows are simply left out of the monthly aggregates.
# NOTE(review): assumes reviewTime is formatted like "05 21, 2014" — confirm against the CSV.
df['reviewTime'] = pd.to_datetime(df['reviewTime'], format='%m %d, %Y', errors='coerce')
df['month'] = df['reviewTime'].dt.to_period('M')

# Aggregate data by month: mean sentiment score and number of reviews
monthly_reviews = df.groupby('month').agg({
    'sentiment_score': 'mean',
    'sentiment': 'count'
}).rename(columns={'sentiment': 'review_count'})

# Generate synthetic sales data (seeded so the noise term is reproducible).
# NOTE(review): 'sales' is computed from sentiment_score itself, so the
# correlation reported below is built in by construction and says nothing about
# real sales.
np.random.seed(42)
monthly_reviews['sales'] = monthly_reviews['review_count'] * 10 + (monthly_reviews['sentiment_score'] * 100) + np.random.randint(-100, 100, size=monthly_reviews.shape[0])

# Display monthly aggregated data
print("\nMonthly Aggregated Data:")
print(monthly_reviews.head())

# Calculate correlation between the monthly mean sentiment and the synthetic sales
correlation = monthly_reviews['sentiment_score'].corr(monthly_reviews['sales'])
print(f"\nCorrelation between sentiment score and sales: {correlation:.2f}")

# Visualization
# Each section opens its own figure with plt.figure(), draws on it, and
# finishes with plt.show().

# a. Sentiment Distribution
# Bar chart of how many rows of `df` carry each sentiment label.
plt.figure(figsize=(8,6))
sns.countplot(x='sentiment', data=df, palette='viridis')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# b. Average Monthly Sentiment Score
# Line chart of the per-month mean sentiment score from `monthly_reviews`.
plt.figure(figsize=(12,6))
monthly_sentiment = monthly_reviews['sentiment_score']
monthly_sentiment.plot(kind='line', marker='o')
plt.title('Average Monthly Sentiment Score')
plt.xlabel('Month')
plt.ylabel('Average Sentiment Score')
plt.grid(True)
plt.show()

# c. Monthly Sales
# Line chart of the 'sales' column of `monthly_reviews`.
plt.figure(figsize=(12,6))
monthly_sales = monthly_reviews['sales']
monthly_sales.plot(kind='line', marker='o', color='orange')
plt.title('Monthly Sales')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.grid(True)
plt.show()

# d. Sentiment Score vs. Sales
# One point per month; the month index is converted to strings and used as hue.
plt.figure(figsize=(10,6))
sns.scatterplot(x='sentiment_score', y='sales', data=monthly_reviews, hue=monthly_reviews.index.astype(str))
plt.title('Sentiment Score vs. Sales')
plt.xlabel('Average Sentiment Score')
plt.ylabel('Sales')
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title='Month', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# e. Correlation Heatmap
# Annotated correlation matrix of the two monthly columns.
plt.figure(figsize=(6,4))
sns.heatmap(monthly_reviews[['sentiment_score', 'sales']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Reload the raw CSV into `df`; this replaces whatever `df` held before, so
# columns that earlier cells added to it are no longer present afterwards.
df = pd.read_csv('/content/ama.csv')
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/ama.csv'
data = pd.read_csv(file_path)

# Display the first few rows and basic info about the data
print("Data Sample:")
print(data.head())
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
data.info()


In [None]:
# Drop columns that aren't useful for sentiment analysis.
# NOTE(review): 'column_to_drop' looks like a template placeholder; with
# errors='ignore' the call does nothing when no such column exists — list the
# real column names here.
data = data.drop(columns=['column_to_drop'], errors='ignore')

# Handle missing values (remove rows with NaN in the review text column)
data = data.dropna(subset=['review_content'])  # 'review_content' is the review text column used throughout


In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import nltk

# Download the tokenizer data used by word_tokenize.
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both to work with whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

# Preprocess text data
from functools import lru_cache


@lru_cache(maxsize=1)
def _stopwords_and_stemmer():
    """Build the English stopword set and the Porter stemmer once and reuse them.

    preprocess_text is applied to every row, so rebuilding these per call
    would repeat the same work for each review.
    """
    return frozenset(stopwords.words('english')), PorterStemmer()


def preprocess_text(text):
    """Lowercase, strip non-letters, tokenize, drop stopwords and stem ``text``.

    Args:
        text: Raw review text. Non-string values (e.g. NaN from a missing
            review) are treated as empty text.

    Returns:
        The stemmed, stopword-free tokens joined by single spaces; '' for
        non-string input.
    """
    # A missing review arrives as a float NaN, which has no .lower()
    if not isinstance(text, str):
        return ''
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize words
    words = word_tokenize(text)
    # Remove stopwords and apply stemming
    stop_words, stemmer = _stopwords_and_stemmer()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

# Apply preprocessing to the review text column ('review_content')
data['processed_review'] = data['review_content'].apply(preprocess_text)  # result is stored in a new 'processed_review' column


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download VADER lexicon for sentiment analysis
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Calculate sentiment scores for each review from the ORIGINAL text.
# VADER matches whole words against its lexicon and uses negations,
# intensifiers, capitalization and punctuation. 'processed_review' is stemmed
# (e.g. "amazing" -> "amaz") and has stopwords such as "not" removed, so scoring
# it loses most of that signal. Missing reviews are scored as empty text.
data['sentiment_score'] = data['review_content'].fillna('').astype(str).apply(lambda x: sid.polarity_scores(x)['compound'])

# Classify sentiment as Positive, Neutral, or Negative.
# The cut-offs are inclusive (>= 0.05 / <= -0.05), matching VADER's documented
# thresholds and the categorize_sentiment helper defined earlier in this notebook.
data['sentiment'] = data['sentiment_score'].apply(lambda score: 'Positive' if score >= 0.05 else ('Negative' if score <= -0.05 else 'Neutral'))


In [None]:
# Coerce 'rating' to a numeric dtype; values that cannot be parsed become NaN
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')

# Drop rows with NaN values in 'rating' column
data = data.dropna(subset=['rating'])

# Pairwise correlation matrix of the sentiment score and the numeric rating
correlation = data[['sentiment_score', 'rating']].corr()  # 2x2 DataFrame, not a single number
print("\nCorrelation between Sentiment Score and Rating:")
print(correlation)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Sentiment distribution
# Bar chart of how many rows of `data` carry each sentiment label.
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='sentiment')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Frequency')
plt.show()

# Scatter plot for sentiment score vs rating
# One point per review, coloured by its sentiment label.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='sentiment_score', y='rating', hue='sentiment')
plt.title('Sentiment Score vs Rating')
plt.xlabel('Sentiment Score')
plt.ylabel('Rating')
plt.show()


In [None]:
#final



import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset
data = pd.read_csv('/content/ama.csv')


# Display sample data and structure
print("Data Sample:")
print(data.head())
print("\nData Info:")
# DataFrame.info() prints its own report and returns None; print()-ing the
# return value would add a stray "None" line.
data.info()

# Data Cleaning - Drop rows with missing values in essential columns.
# 'review_content' must be included: preprocess_text calls text.lower(), which
# fails on the float NaN that a missing review is loaded as.
print(data.isna().sum())
data = data.dropna(subset=['review_content', 'rating_count'])

# Step 3: Text Preprocessing
# Fetch every NLTK resource this cell uses so it runs on a fresh kernel:
# stopwords for filtering, the VADER lexicon for scoring, and the punkt
# tokenizer data for word_tokenize ('punkt_tab' is what newer NLTK releases load).
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('punkt_tab')

from functools import lru_cache


@lru_cache(maxsize=1)
def _stopwords_and_stemmer():
    """Build the English stopword set and the Porter stemmer once and reuse them.

    preprocess_text is applied to every row, so rebuilding these per call
    would repeat the same work for each review.
    """
    return frozenset(stopwords.words('english')), PorterStemmer()


def preprocess_text(text):
    """Lowercase, strip non-letters, tokenize, drop stopwords and stem ``text``.

    Args:
        text: Raw review text. Non-string values (e.g. NaN from a missing
            review) are treated as empty text.

    Returns:
        The stemmed, stopword-free tokens joined by single spaces; '' for
        non-string input.
    """
    # A missing review arrives as a float NaN, which has no .lower()
    if not isinstance(text, str):
        return ''
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize words
    words = word_tokenize(text)
    # Remove stopwords and apply stemming
    stop_words, stemmer = _stopwords_and_stemmer()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

# Apply preprocessing to the review text column ('review_content')
data['processed_review'] = data['review_content'].apply(preprocess_text)  # result is stored in a new 'processed_review' column

# Step 4: Sentiment Analysis using VADER
sid = SentimentIntensityAnalyzer()
# Score the ORIGINAL review text. VADER matches whole words against its lexicon
# and uses negations, intensifiers, capitalization and punctuation;
# 'processed_review' is stemmed (e.g. "amazing" -> "amaz") and has stopwords
# such as "not" removed, so scoring it loses most of that signal.
# Missing reviews are scored as empty text.
data['sentiment_score'] = data['review_content'].fillna('').astype(str).apply(lambda x: sid.polarity_scores(x)['compound'])

# Classify sentiment based on the compound score.
# The cut-offs are inclusive (>= 0.05 / <= -0.05), matching VADER's documented
# thresholds and the categorize_sentiment helper defined earlier in this notebook.
data['sentiment'] = data['sentiment_score'].apply(lambda score: 'Positive' if score >= 0.05 else ('Negative' if score <= -0.05 else 'Neutral'))

# Step 5: Sales Insights
# Correlation between sentiment score and rating
# Coerce 'rating' to numeric first; unparseable values become NaN and those rows are dropped.
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
data = data.dropna(subset=['rating'])



# Pairwise correlation matrix of the two columns
correlation = data[['sentiment_score', 'rating']].corr()  # 2x2 DataFrame, not a single number
print("\nCorrelation between Sentiment Score and Rating:")
print(correlation)

# Average rating and sentiment score per sentiment category
# Named aggregation: one output row per sentiment label, with the mean rating,
# the mean sentiment score and the number of reviews in that category.
sentiment_analysis = data.groupby('sentiment').agg(
    avg_rating=('rating', 'mean'),
    avg_sentiment_score=('sentiment_score', 'mean'),
    count=('sentiment', 'count')
).reset_index()
print("\nSentiment Analysis Summary:")
print(sentiment_analysis)

# Step 6: Visualization
# Each chart opens its own figure with plt.figure() and ends with plt.show().

# Sentiment distribution
# Bar chart of how many rows of `data` carry each sentiment label.
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='sentiment', palette='viridis')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Frequency')
plt.show()

# Scatter plot for sentiment score vs rating
# One point per review, coloured by its sentiment label.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='sentiment_score', y='rating', hue='sentiment', palette='coolwarm')
plt.title('Sentiment Score vs Rating')
plt.xlabel('Sentiment Score')
plt.ylabel('Rating')
plt.show()

# Average rating by sentiment category
# Uses the per-category summary frame `sentiment_analysis`, not the raw rows.
plt.figure(figsize=(10, 6))
sns.barplot(data=sentiment_analysis, x='sentiment', y='avg_rating', palette='viridis')
plt.title('Average Rating by Sentiment Category')
plt.xlabel('Sentiment')
plt.ylabel('Average Rating')
plt.show()

# Step 7: Insights and Recommendations

# Positive Insights: Highlight features or attributes mentioned in positive reviews
positive_reviews = data[data['sentiment'] == 'Positive']['review_content']
# Display sample positive reviews (for more detail, perform keyword analysis on these reviews).
# Series.sample raises ValueError when asked for more rows than exist, so the
# sample size is capped at the number of reviews in the category.
print("\nSample Positive Reviews:")
print(positive_reviews.sample(min(5, len(positive_reviews)), random_state=1))

# Negative Insights: Identify common complaints in negative reviews
negative_reviews = data[data['sentiment'] == 'Negative']['review_content']
print("\nSample Negative Reviews:")
print(negative_reviews.sample(min(5, len(negative_reviews)), random_state=1))

# Save analyzed data to CSV for further exploration if needed
output_path = '/content/analyzed_amazon_reviews.csv'
data.to_csv(output_path, index=False)
print(f"\nCleaned and analyzed data saved to {output_path}")



In [None]:
# Tally the ten most frequent tokens across positively labelled reviews
positive_reviews = data.loc[data['sentiment'] == 'Positive']
top_features = (
    positive_reviews['processed_review']
    .str.split()
    .explode()
    .value_counts()
    .head(10)
)

# Show the resulting token counts
print("Top Features in Positive Reviews:", top_features, sep="\n")


In [None]:
# Tally the ten most frequent tokens across negatively labelled reviews
negative_reviews = data.loc[data['sentiment'] == 'Negative']
complaints = (
    negative_reviews['processed_review']
    .str.split()
    .explode()
    .value_counts()
    .head(10)
)

# Show the resulting token counts
print("Top Complaints in Negative Reviews:", complaints, sep="\n")


In [None]:
# Segment positive and negative sentiment customers
# Each segment is the subset of rows of `data` with the given sentiment label.
positive_customers = data[data['sentiment'] == 'Positive']
negative_customers = data[data['sentiment'] == 'Negative']

# Preview the first five rows of each segment (product id, review text, score)
# as a starting point for marketing campaigns or follow-up actions.
print("\nSample Positive Customers:")
print(positive_customers[['product_id', 'review_content', 'sentiment_score']].head(5))

print("\nSample Negative Customers:")
print(negative_customers[['product_id', 'review_content', 'sentiment_score']].head(5))


In [None]:
import pandas as pd
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns

# Keywords whose frequency is compared across the sentiment groups
keywords = ['price', 'quality', 'service']

# Per-review keyword counter
def count_keywords_in_review(text, keywords):
    """Return ``{keyword: occurrences}`` for the lowercased, tokenized ``text``.

    Only exact token matches are counted; a keyword that never appears maps to 0.
    """
    token_tally = Counter(word_tokenize(text.lower()))
    return {keyword: token_tally[keyword] for keyword in keywords}

# Clean reviews by dropping missing data and ensuring valid content.
# .copy() gives each segment its own frame so the column assignments below do
# not write into a slice of `data` (avoids SettingWithCopyWarning).
positive_reviews = positive_reviews.dropna(subset=['processed_review']).copy()
negative_reviews = negative_reviews.dropna(subset=['processed_review']).copy()

# Apply the keyword counting function to positive and negative reviews.
# Counting is done on the original review text: 'processed_review' is
# Porter-stemmed ("quality" -> "qualiti", "service" -> "servic"), so exact
# matches for those keywords would never be found there.
positive_reviews['keyword_counts'] = positive_reviews['review_content'].astype(str).apply(count_keywords_in_review, keywords=keywords)
negative_reviews['keyword_counts'] = negative_reviews['review_content'].astype(str).apply(count_keywords_in_review, keywords=keywords)

# Summarize the total occurrences of each keyword in positive and negative reviews
positive_keyword_counts = pd.DataFrame(positive_reviews['keyword_counts'].tolist(), columns=keywords)
negative_keyword_counts = pd.DataFrame(negative_reviews['keyword_counts'].tolist(), columns=keywords)

# Total counts for positive and negative reviews
positive_totals = positive_keyword_counts.sum()
negative_totals = negative_keyword_counts.sum()

# Combine positive and negative keyword counts for visualization
keyword_data = pd.DataFrame({
    'Positive': positive_totals,
    'Negative': negative_totals
})

# Add the Neutral sentiment category for more insight; counted the same way as
# the other two segments, on a private copy of the neutral rows.
neutral_reviews = data[data['sentiment'] == 'Neutral'].copy()
neutral_reviews['keyword_counts'] = neutral_reviews['review_content'].astype(str).apply(count_keywords_in_review, keywords=keywords)
neutral_keyword_counts = pd.DataFrame(neutral_reviews['keyword_counts'].tolist(), columns=keywords)
neutral_totals = neutral_keyword_counts.sum()

# Add Neutral data to the keyword_data
keyword_data['Neutral'] = neutral_totals

# Plot the keyword frequencies in positive, negative, and neutral reviews.
# DataFrame.plot opens a figure of its own unless it is handed an Axes, so a
# separate plt.figure() call would leave an empty extra figure; create the Axes
# explicitly and draw on it instead.
fig, ax = plt.subplots(figsize=(10, 6))
keyword_data.plot(kind='bar', color=['green', 'red', 'gray'], width=0.8, ax=ax)
plt.title('Keyword Frequency Analysis in Sentiment Categories')
plt.xlabel('Keyword')
plt.ylabel('Frequency')
plt.xticks(rotation=0)
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()  # Adjust layout to prevent clipping of labels
plt.show()

# Display keyword frequency counts for each sentiment
print("\nTotal Keyword Counts in Positive Reviews:")
print(positive_totals)

print("\nTotal Keyword Counts in Negative Reviews:")
print(negative_totals)

print("\nTotal Keyword Counts in Neutral Reviews:")
print(neutral_totals)

# Optional helper: pull out the sentences that mention any keyword, for context
def find_keyword_sentences(text, keywords):
    """Return the sentences of ``text`` whose lowercased form contains any keyword.

    Matching is by substring, and sentences are returned in their original
    casing and order.
    """
    matches = []
    for sentence in nltk.sent_tokenize(text):
        lowered = sentence.lower()
        if any(keyword in lowered for keyword in keywords):
            matches.append(sentence)
    return matches

# Example: Find and display some keyword-related sentences from positive and negative reviews
# Runs on the original 'review_content' text; each cell of the new column holds a list of sentences.
positive_reviews['keyword_sentences'] = positive_reviews['review_content'].apply(lambda x: find_keyword_sentences(x, keywords))
negative_reviews['keyword_sentences'] = negative_reviews['review_content'].apply(lambda x: find_keyword_sentences(x, keywords))

# Display some sample sentences containing the keywords (first five rows of each segment)
print("\nSample Sentences with Keywords in Positive Reviews:")
print(positive_reviews['keyword_sentences'].head())

print("\nSample Sentences with Keywords in Negative Reviews:")
print(negative_reviews['keyword_sentences'].head())
