## **Import Libraries**

In [5]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK corpora used later: the stop-word list and the WordNet data
# (plus its multilingual companion) needed by the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Optional: apply a shared seaborn look to every figure in the notebook
sns.set(style='whitegrid', palette='muted')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## **Load the Datasets**

In [2]:
python -m pip install wordcloud

SyntaxError: invalid syntax (3995702791.py, line 1)

In [None]:
def _preview(frame, caption):
    """Print the frame's shape after `caption` and display its first rows."""
    print(caption, frame.shape)
    display(frame.head())


# Cornell Movie Reviews
cornell_df = pd.read_csv("cornellmovie_reviews.csv")
_preview(cornell_df, "Cornell dataset shape:")

# Rotten Tomatoes critic reviews
rotten_df = pd.read_csv("Rotten_tomatoes_critic_reviews.csv")
_preview(rotten_df, "Rotten Tomatoes dataset shape:")

## Basic Cleaning (Handle Missing Data)

In [None]:
# Name the text and label columns positionally: first column -> 'review',
# last column -> 'label'. Other columns are kept untouched.
# NOTE(review): this assumes each CSV stores the review text in its first
# column and the sentiment label in its last one — adapt the mapping if your
# CSV is laid out differently.
cornell_df = cornell_df.rename(columns={cornell_df.columns[0]: 'review', cornell_df.columns[-1]: 'label'})
rotten_df = rotten_df.rename(columns={rotten_df.columns[0]: 'review', rotten_df.columns[-1]: 'label'})

# Drop only the rows that are missing the review text or the label. A bare
# dropna() would also throw away usable reviews that merely lack a value in
# some unrelated column. Reassigning (instead of inplace=True) keeps the cell
# safe to re-run.
cornell_df = cornell_df.dropna(subset=['review', 'label'])
rotten_df = rotten_df.dropna(subset=['review', 'label'])

print("Cornell columns:", cornell_df.columns.tolist())
print("Rotten columns:", rotten_df.columns.tolist())

## **Define a Text Cleaning Function**

In [8]:
# Shared text-cleaning resources: a WordNet lemmatizer and the English
# stop-word list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None, lemma=None):
    """Normalize a raw review into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip URLs, strip @mentions and #hashtags, replace every
    character that is not a-z or whitespace with a space, split on whitespace,
    drop stop words, lemmatize each remaining token.

    Args:
        text: The raw review. Non-string values are converted with ``str()``.
        stopword_set: Optional collection of tokens to drop. Defaults to the
            module-level ``stop_words``.
        lemma: Optional object exposing ``lemmatize(word)``. Defaults to the
            module-level ``lemmatizer``.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # Fall back to the notebook-level resources only when no override is given.
    active_stopwords = stop_words if stopword_set is None else stopword_set
    active_lemma = lemmatizer if lemma is None else lemma

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', ' ', text)
    text = re.sub(r'@\w+|#\w+', ' ', text)
    # Substitute a space rather than deleting: deleting fuses words that are
    # separated only by punctuation ("well-made" -> "wellmade") and turns
    # contractions into tokens the stop-word list no longer matches.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = [active_lemma.lemmatize(w) for w in text.split() if w not in active_stopwords]
    return ' '.join(tokens)

## **Apply Cleaning**

In [9]:
# Run the cleaner over the raw review text of each dataset, Cornell first.
for frame in (cornell_df, rotten_df):
    frame['clean_text'] = frame['review'].apply(clean_text)

## **Add Useful Features**

In [10]:
# Derive simple size features from the cleaned text of each dataset, using
# pandas' vectorized string accessors instead of per-row Python callbacks.
for df in [cornell_df, rotten_df]:
    # Number of characters in the cleaned review.
    df['text_length'] = df['clean_text'].str.len()
    # Number of whitespace-separated tokens in the cleaned review.
    df['word_count'] = df['clean_text'].str.split().str.len()

## **Basic EDA (Exploratory Data Analysis)**

### **Distribution of Sentiments**

In [None]:
# Side-by-side label counts, one panel per dataset.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
label_panels = (
    (axes[0], cornell_df, "Cornell Reviews Label Distribution"),
    (axes[1], rotten_df, "Rotten Tomatoes Label Distribution"),
)
for panel_ax, frame, panel_title in label_panels:
    sns.countplot(x='label', data=frame, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

### **Word Count Distribution**

In [None]:
# One box plot per dataset: word count of the cleaned review, grouped by label.
for frame, heading in (
    (cornell_df, "Cornell Dataset — Word Count by Sentiment"),
    (rotten_df, "Rotten Tomatoes Dataset — Word Count by Sentiment"),
):
    plt.figure(figsize=(8,5))
    sns.boxplot(x='label', y='word_count', data=frame)
    plt.title(heading)
    plt.show()

### **Top Frequent Words**

In [None]:
from collections import Counter

def plot_top_words(df, title, top_n=20):
    """Draw a horizontal bar chart of the most frequent tokens in ``df['clean_text']``.

    Args:
        df: DataFrame whose ``clean_text`` column holds whitespace-separated
            token strings.
        title: Title placed on the chart.
        top_n: Number of most frequent tokens to show. Defaults to 20, the
            value that was previously fixed.
    """
    # Count row by row instead of first joining the whole corpus into a single
    # huge string; the resulting counts and tie order are the same.
    word_freq = Counter()
    for text in df['clean_text']:
        word_freq.update(text.split())

    common_words = pd.DataFrame(word_freq.most_common(top_n), columns=['word', 'count'])

    _, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x='count', y='word', data=common_words, ax=ax)
    ax.set_title(title)
    plt.show()

# Chart the most frequent tokens for each dataset, Cornell first.
for frame, chart_title in (
    (cornell_df, "Most Frequent Words — Cornell Dataset"),
    (rotten_df, "Most Frequent Words — Rotten Tomatoes Dataset"),
):
    plot_top_words(frame, chart_title)

### **Save the Cleaned Data (optional)**

In [None]:
# Write each processed frame to its own CSV, leaving out the index column.
for frame, out_path in (
    (cornell_df, "cleaned_cornell.csv"),
    (rotten_df, "cleaned_rotten.csv"),
):
    frame.to_csv(out_path, index=False)

print("✅ Cleaned datasets saved successfully.")