In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.8.29-cp313-cp313-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --------------------------- ------------ 1.0/1.5 MB 5.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 4.8 MB/s eta 0:00:00
Downloading regex-2025.8.29-cp313-cp313-win_amd64.whl (275 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2025.8.29



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ------------------------------ ------- 786.4/981.5 kB 4.2 MB/s eta 0:00:01
     -------------------------------------- 981.5/981.5 kB 4.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (pyproject.toml): started
  Building wheel for langdetect (pyproject.toml): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993363 sha256=75e4b6ed1b6c04d873357725e034359250940a0d954de4763f2dacdb93cab665
  Stored in directory:


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect, DetectorFactory

# Ensure consistent results from langdetect: fix its seed once, before any
# detect() call, so re-running the notebook yields the same language codes.
DetectorFactory.seed = 0

In [5]:
# Load dataset (the path is relative to the notebook's working directory)
df = pd.read_csv("chatgpt_reviews.csv")
# Preview the first rows to sanity-check the columns that were read
df.head()

Unnamed: 0,date,title,review,rating,username,helpful_votes,review_length,platform,language,location,version,verified_purchase
0,2024-12-24,East list local interesting far magazine father.,Impressive natural language understanding. Alm...,1,darren55,25,9,Google Play,de,Nepal,2.0.4,No
1,2024-07-26,Road determine picture figure hard experience ...,Sometimes gives wrong answers or outdated info...,2,davistyler,35,14,Flipkart,es,Panama,2.6.5,No
2,2024-08-22,Policy social past analysis whole.,Great tool for generating content quickly. Int...,5,andrea59,94,10,App Store,es,Denmark,4.2.2,No
3,2025-04-24,News financial billion four foreign.,Great tool for generating content quickly. Sta...,3,dcooper,50,15,Amazon,de,Brunei Darussalam,4.4.2,No
4,2024-08-15,To skill she case sing stop likely.,Excellent for language translation and grammar...,1,christine79,33,11,Flipkart,fr,United States Minor Outlying Islands,5.1.1,Yes


In [6]:
# Number of missing values in each column
df.isna().sum()

date                 0
title                0
review               0
rating               0
username             0
helpful_votes        0
review_length        0
platform             0
language             0
location             0
version              0
verified_purchase    0
dtype: int64

In [7]:
# Rows without a review or a rating cannot be used; discard them
df = df.dropna(subset=["review", "rating"])

# Remaining gaps get defaults: 0 helpful votes, and "Unknown" for the
# platform/location/version metadata columns
df = df.assign(
    helpful_votes=df["helpful_votes"].fillna(0),
    **{col: df[col].fillna("Unknown") for col in ("platform", "location", "version")},
)


In [9]:
# --- Language Detection ---
def detect_language(text):
    """Return the language code that ``detect`` assigns to ``text``.

    Parameters
    ----------
    text : str
        Review text to classify.

    Returns
    -------
    str
        The value returned by ``detect(text)``, or ``"unknown"`` when ``text``
        is not a non-blank string or when detection raises an exception.
    """
    # Missing values (None/NaN) and blank strings carry nothing to detect;
    # answer directly instead of relying on detect() to fail.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Best effort: any detection failure maps to "unknown". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate, so a
        # long-running pass over the reviews can still be interrupted.
        return "unknown"

# Tag every review with its detected language code
df["language_detected"] = df["review"].map(detect_language)

In [10]:
# Keep only English reviews (optional: you can also keep multilingual and train separately).
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding columns to it afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df["language_detected"] == "en"].copy()

In [12]:
# --- Text Cleaning ---
# Fetch the NLTK data packages needed by the stop-word list and the
# WordNet lemmatizer created below.
nltk.download("stopwords")
nltk.download("wordnet")

# Module-level helpers read by clean_text(): the English stop words as a set
# (clean_text tests each token for membership) and one shared lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a review into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, turn every run of non-letter characters into a
    single space, split on whitespace, drop tokens found in ``stop_words`` and
    lemmatize the remaining ones with ``lemmatizer`` (both are module-level
    objects).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``""`` when none survive.
    """
    # Replace (rather than delete) punctuation/digits/special chars so that
    # words separated only by punctuation ("fast,cheap", "well-known") stay
    # separate tokens instead of being fused into "fastcheap"/"wellknown".
    # The negated class also matches whitespace, so this single pass
    # normalizes spacing as well.
    letters_only = re.sub(r"[^a-z]+", " ", text.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

# Cleaned, lemmatized version of every review
df["clean_review"] = df["review"].map(clean_text)

[nltk_data] Downloading package stopwords to C:\Users\This
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\This
[nltk_data]     PC\AppData\Roaming\nltk_data...


In [13]:
# --- Derived Features ---
# Number of whitespace-separated words in each review
df["review_length"] = df["review"].astype(str).str.split().str.len()
# 1 when a review received more than 10 helpful votes, otherwise 0
df["helpful_flag"] = df["helpful_votes"].gt(10).astype("int64")
# 1 when verified_purchase reads "yes" (case-insensitive), otherwise 0
df["verified_flag"] = df["verified_purchase"].astype(str).str.lower().eq("yes").astype("int64")

In [14]:
# --- Sentiment Labels ---
def map_sentiment(rating):
    """Bucket a numeric star rating into a sentiment label.

    Parameters
    ----------
    rating : int or float
        Star rating of a review.

    Returns
    -------
    str
        ``"positive"`` for ratings of 4 and above, ``"neutral"`` for ratings
        from 3 up to (but excluding) 4, and ``"negative"`` for everything else.

    Notes
    -----
    A NaN rating fails both comparisons and therefore ends up as
    ``"negative"``.
    """
    if rating >= 4:
        return "positive"
    # Range check instead of ``== 3`` so a fractional rating such as 3.5 is
    # labelled neutral rather than falling through to "negative".
    if rating >= 3:
        return "neutral"
    return "negative"

# Sentiment label derived from each review's star rating
df["sentiment"] = df["rating"].map(map_sentiment)

In [15]:
# --- Save Preprocessed Data ---
# index=False: write only the columns, not the DataFrame's row index
df.to_csv("cleaned_reviews.csv", index=False)
print("✅ Preprocessed dataset saved as cleaned_reviews.csv")
# Plain-text preview of the first rows of the frame that was just written
print(df.head())

✅ Preprocessed dataset saved as cleaned_reviews.csv
         date                                              title  \
0  2024-12-24   East list local interesting far magazine father.   
1  2024-07-26  Road determine picture figure hard experience ...   
2  2024-08-22                 Policy social past analysis whole.   
3  2025-04-24               News financial billion four foreign.   
4  2024-08-15                To skill she case sing stop likely.   

                                              review  rating     username  \
0  Impressive natural language understanding. Alm...       1     darren55   
1  Sometimes gives wrong answers or outdated info...       2   davistyler   
2  Great tool for generating content quickly. Int...       5     andrea59   
3  Great tool for generating content quickly. Sta...       3      dcooper   
4  Excellent for language translation and grammar...       1  christine79   

   helpful_votes  review_length     platform language  \
0             25   