In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('Reviews.csv')

# Keep only the review text and its score, and drop rows missing either one.
# Chaining .dropna() and .copy() (instead of dropna(inplace=True) on a column
# selection of `data`) gives `reviews` its own independent DataFrame, so later
# column assignments do not raise SettingWithCopyWarning and `data` stays intact.
reviews = data[['Text', 'Score']].dropna().copy()

# Function to clean text
# Matches every character that is neither an ASCII letter nor whitespace
# (digits, punctuation, symbols, non-ASCII characters).
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily-built NLTK resources shared by every preprocess_text() call.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        # Build both objects before storing either, so a failure (for example a
        # missing NLTK corpus) leaves the cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _TEXT_TOOLS.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: replace non-letter characters with spaces, lowercase, split on
    whitespace, drop English stop words, lemmatize each remaining token.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing remains.
    """
    stop_words, lemmatizer = _get_text_tools()
    # Substitute a space rather than the empty string so that words separated
    # only by punctuation or digits ("cats!Dogs") are not fused into one token.
    tokens = _NON_LETTER_RE.sub(' ', text).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Apply preprocessing and label sentiment (1 = positive, 0 = negative).
# .assign() returns a new DataFrame rather than writing columns into the
# existing frame in place, which pandas flags with SettingWithCopyWarning when
# that frame is a slice of another one. It also keeps this step safe to re-run.
# NOTE(review): a Score of exactly 3 lands in the negative class under `> 3` — confirm that is intended.
reviews = reviews.assign(
    Cleaned_Text=reviews['Text'].apply(preprocess_text),
    # Vectorized comparison instead of a per-row lambda; yields the same 0/1 integers.
    Sentiment=(reviews['Score'] > 3).astype('int64'),
)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# Word cloud built from the cleaned text of every review labelled positive (Sentiment == 1)
is_positive = reviews['Sentiment'] == 1
positive_reviews = ' '.join(reviews.loc[is_positive, 'Cleaned_Text'])

cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(positive_reviews)

# Show the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud - Positive Reviews")
plt.show()

# Bigram analysis: count two-word phrases and keep the 10 most frequent overall
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=10)
bigrams = vectorizer.fit_transform(reviews['Cleaned_Text'])

# Column sums of the document-term matrix are the corpus-wide bigram counts.
bigram_counts = bigrams.sum(axis=0).tolist()[0]

# Feature names come back in vocabulary order, not by frequency, so sort by
# count. Ascending order puts the most frequent bigram at the top of a
# horizontal bar chart, because barh draws its first entry at the bottom.
bigram_freq = dict(
    sorted(
        zip(vectorizer.get_feature_names_out(), bigram_counts),
        key=lambda pair: pair[1],
    )
)
plt.barh(list(bigram_freq.keys()), list(bigram_freq.values()))
plt.xlabel("Occurrences")
plt.title("Top 10 Bigrams")
plt.show()

# Sentiment distribution
# value_counts() orders its rows by frequency, while the colors below are
# applied by position, so each color would attach to whichever class happens
# to be larger. Reindexing to [0, 1] pins the bar order (and shows an empty
# class as a zero-height bar).
# NOTE(review): assumes red is meant for negative (0) and green for positive (1) — confirm.
sentiment_counts = reviews['Sentiment'].value_counts().reindex([0, 1], fill_value=0)
sentiment_counts.plot(kind='bar', color=['red', 'green'])
plt.xlabel("Sentiment (0 = negative, 1 = positive)")
plt.ylabel("Number of reviews")
plt.title("Sentiment Distribution")
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the cleaned text and labels BEFORE vectorizing, so the TF-IDF vocabulary
# and IDF weights are learned from the training reviews only. Fitting on the
# full corpus first would leak test-set statistics into the features.
y = reviews['Sentiment']
text_train, text_test, y_train, y_test = train_test_split(
    reviews['Cleaned_Text'], y, test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features
X_train = tfidf.fit_transform(text_train)  # learn vocabulary/IDF on train only
X_test = tfidf.transform(text_test)  # reuse the train-fitted vocabulary/IDF

# Full-corpus matrix kept under its original name in case a later cell uses `X`.
# It is built with the train-fitted vectorizer; do not refit `tfidf` on it.
X = tfidf.transform(reviews['Cleaned_Text'])