In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS


In [None]:
# Read the review dataset from CSV and preview its first rows
df = pd.read_csv("balanced_dataset.csv")
df.head(n=5)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Count the missing values in every column
df.isnull().sum()

In [None]:

nlp = spacy.load("en_core_web_sm")

# Regex patterns; clean_basic applies them in this order, replacing each match with a space
url_pattern = r"https?://\S+|www\.\S+"
html_pattern = r"<.*?>"
emoji_pattern = r"[\U00010000-\U0010ffff]|[\u263a-\U0001f645]"
special_pattern = r"[^a-zA-Z0-9\s]"

def clean_basic(text):
    """Normalize one review into a space-separated string of lemmas.

    Steps: lowercase, blank out URLs, HTML tags, emoji and any character that
    is not an ASCII letter, digit or whitespace, then run the spaCy pipeline
    and keep the lemma of every token that is not a stop word, not pure
    whitespace, and longer than one character.

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (for example NaN from a missing
        CSV cell) are treated as an empty review.

    Returns
    -------
    str
        The cleaned, lemmatized text; "" when nothing survives cleaning.
    """
    # Missing cells arrive as float NaN, which has no .lower(); treat as empty.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    for pattern in (url_pattern, html_pattern, emoji_pattern, special_pattern):
        text = re.sub(pattern, " ", text)

    doc = nlp(text)
    # The substitutions above leave runs of spaces, which the tokenizer can emit
    # as whitespace-only tokens; drop them explicitly so they never reach the output.
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_space
        and token.text not in STOP_WORDS
        and len(token.text) > 1
    ]

    return " ".join(tokens)


In [None]:
df["cleaned_text"] = df["Text"].apply(clean_basic)

# The plotting cells further down read a `review_length` column that no visible
# cell creates. If the CSV did not already supply it, derive it here as the word
# count of the cleaned text (the same quantity the word-count filter uses).
# NOTE(review): confirm the length was meant to be measured on cleaned text
# rather than on the raw `Text` column.
if "review_length" not in df.columns:
    df["review_length"] = df["cleaned_text"].str.split().str.len()


In [None]:
# Display the frame, which now carries the cleaned_text column
df

In [None]:
# Word-count filter for reviews
def filter_review(text, min_words=3, max_words=250):
    """Return True when `text` has between `min_words` and `max_words`
    whitespace-separated words (both bounds inclusive), otherwise False."""
    word_count = len(text.split())
    return min_words <= word_count <= max_words


In [None]:
# Keep only the rows whose cleaned text passes the word-count filter
df_filtered = df.loc[df["cleaned_text"].apply(filter_review)]


In [None]:
# Display the reviews that passed the word-count filter
df_filtered

In [None]:
# Visualization: bar chart of the number of filtered reviews per rating
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_filtered, x="Score", ax=ax)
ax.set_title("Review Count per Rating")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Histogram of review length, coloured by rating
# (reads the `review_length` column of df_filtered)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df_filtered, x="review_length", hue="Score", bins=50, ax=ax)
ax.set_title("Review Length Distribution by Rating")
ax.set_xlabel("Length of Review (words)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Side-by-side histograms of review length before and after the word-count filter
panels = (
    (df, "Before Filtering", "skyblue"),
    (df_filtered, "After Filtering", "salmon"),
)

for position, (frame, panel_title, bar_color) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(frame['review_length'], bins=50, color=bar_color, edgecolor='black')
    plt.title(panel_title)
    plt.xlabel("Review Length")
    plt.ylabel("Frequency")

plt.tight_layout()
plt.show()


In [None]:
# Display 3–10 sample reviews per rating
def show_samples(df, n=5):
    """Print up to `n` randomly sampled cleaned reviews for every rating in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Score" column (the rating) and a "cleaned_text" column.
    n : int, default 5
        Number of reviews to show per rating. When a rating has fewer than
        `n` rows, all of its rows are shown instead of raising.

    Returns
    -------
    None
        Output is printed; sampling uses random_state=42 so it is repeatable.
    """
    # Iterate over the ratings of the frame that was passed in, not a global.
    for rating in sorted(df["Score"].unique()):
        rating_rows = df[df["Score"] == rating]
        # DataFrame.sample raises if asked for more rows than exist.
        shown = min(n, len(rating_rows))

        print("\n" + "="*40)
        print(f"Rating: {rating} | Showing {shown} sample reviews")
        print("="*40)

        samples = rating_rows.sample(shown, random_state=42)

        # `i` is the row's index label in `df`, not a running counter.
        for i, row in samples.iterrows():
            print(f"\nReview {i}:")
            print(row["cleaned_text"])


In [None]:
# Print 10 sample cleaned reviews per rating from the filtered reviews
show_samples(df_filtered, n=10)


In [None]:
# Number of filtered reviews per rating
df_filtered['Score'].value_counts()


In [None]:
# Downsample to an equal number of reviews per rating (4000 each, fixed seed)
# NOTE(review): the later feature/target cell reads df_filtered, not df_balanced —
# confirm whether the balanced frame was meant to be used for modelling.
df_balanced = df_filtered.groupby('Score').sample(n=4000, random_state=42)
df_balanced

In [None]:
# Check the per-rating counts after balancing
df_balanced['Score'].value_counts()

In [None]:
# Select the model input (x) and the label (y).
# The original trailing "(f)" / "(t)" markers were parsed as calls on the Series
# with undefined names; they are kept here as comments instead.
# NOTE(review): this reads the raw `Text` column of df_filtered, while
# `cleaned_text` and df_balanced are prepared above but not used here —
# confirm which input was intended.
x = df_filtered['Text']   # (f) presumably "feature"
y = df_filtered['Score']  # (t) presumably "target"
x.head()

In [None]:
# Preview the first few labels
y.head()

In [None]:
# Hold out 30% of the rows for testing, stratified on the label, with a fixed seed
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 10,000 features.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
x_train_vect = vectorizer.fit_transform(x_train)
x_test_vect = vectorizer.transform(x_test)


In [None]:
# Report the shape of each vectorized split
for label, matrix in (
    ("Training data shape:", x_train_vect),
    ("Test data shape:", x_test_vect),
):
    print(label, matrix.shape)
