In [1]:
import csv
import random 
import pandas as pd
import pandas as pd
import numpy as np
import random
import datetime
import string
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:

def generate_customer_reviews(filename='customer_reviews.csv', num_rows=100,
                              null_fraction=0.10, seed=None):
    """Write a synthetic smartphone-review CSV containing typos and blank reviews.

    Each row gets a random rating from 1 to 5 and a review template matching
    that rating (>= 4 positive, 3 neutral, otherwise negative). Some templates
    embed deliberately misspelled words. A fraction of the rows have their
    review text blanked out so that downstream cleaning has missing values to
    handle.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        num_rows: Number of data rows to write; Review_ID runs from 1 to num_rows.
        null_fraction: Share of rows (0.0 to 1.0) whose Review_Text is written as
            an empty string. The row count is truncated with int().
        seed: Optional seed. When given, a private random.Random(seed) is used
            so repeated calls produce identical files. When None, the global
            ``random`` module state is used, as before.

    Raises:
        ValueError: If null_fraction is outside the range [0, 1].
    """
    if not 0 <= null_fraction <= 1:
        raise ValueError("null_fraction must be between 0 and 1")

    # Misspelled words to inject. Each word is also the placeholder name used
    # in the templates below, so one mapping fills every template.
    typos = ['terribleee', 'goood', 'sloww']
    typo_fillers = {typo: typo for typo in typos}

    # Review templates by sentiment
    positive_reviews = [
        "The battery life is {goood}!",
        "Amazing display and the {goood} camera quality makes it worth it.",
        "Smooth performance, definitely a {goood} purchase.",
        "Love the new design, it feels very premium.",
        "Best smartphone I have owned in years, highly recommended!"
    ]

    neutral_reviews = [
        "It's an okay phone, but the charging is a bit {sloww}.",
        "Decent for the price, though the interface is sometimes {sloww}.",
        "Average build quality. Not great, but not {terribleee} either.",
        "The screen is nice, but I've seen better battery life.",
        "Standard smartphone experience. Does what it needs to do."
    ]

    negative_reviews = [
        "The software is incredibly {sloww} and buggy.",
        "Absolutely {terribleee} customer service and the screen broke instantly.",
        "The phone gets too hot. A {terribleee} experience overall.",
        "Camera is blurry and the apps are {sloww} to open.",
        "Waste of money. Performance is {terribleee} compared to the last model."
    ]

    # A dedicated generator keeps seeded runs reproducible without touching
    # the global random state; unseeded runs fall back to the module itself.
    rng = random.Random(seed) if seed is not None else random

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write Header
        writer.writerow(['Review_ID', 'Review_Text', 'Rating'])

        # Pick the row IDs that will have blank text; a set gives O(1) lookups.
        null_indices = set(
            rng.sample(range(1, num_rows + 1), int(num_rows * null_fraction))
        )

        for i in range(1, num_rows + 1):
            rating = rng.randint(1, 5)

            # Select review text based on rating
            if rating >= 4:
                review_text = rng.choice(positive_reviews)
            elif rating == 3:
                review_text = rng.choice(neutral_reviews)
            else:
                review_text = rng.choice(negative_reviews)

            # Inject the typos; templates without placeholders pass through unchanged.
            review_text = review_text.format(**typo_fillers)

            # Blank the text last so the sequence of random draws is the same
            # for every row, whether or not it ends up empty.
            if i in null_indices:
                review_text = ""

            writer.writerow([i, review_text, rating])

    print(f"Successfully created '{filename}' with {num_rows} rows.")

# Entry-point guard: generate the sample dataset with the default filename and
# row count. In a notebook kernel __name__ is "__main__", so this runs every
# time the cell is executed (and overwrites any existing file of that name).
if __name__ == "__main__":
    generate_customer_reviews()

Successfully created 'customer_reviews.csv' with 100 rows.


In [3]:

def clean_customer_reviews(input_file='customer_reviews.csv', output_file='cleaned_reviews.csv'):
    """Load a reviews CSV, normalise Review_Text, and save the cleaned rows.

    Steps, in order:
      1. Read ``input_file`` with pandas.
      2. Drop rows whose Review_Text is missing.
      3. Remove every character that is not an ASCII letter, digit, or whitespace.
      4. Lowercase the text and strip leading/trailing whitespace.
      5. Drop rows whose text is empty after cleaning. Such rows would be
         written as blank cells and read back as missing values.
      6. Write the result to ``output_file`` without the index column.

    Args:
        input_file: Path of the CSV to clean; must contain a Review_Text column.
        output_file: Path where the cleaned CSV is written.

    Returns:
        The cleaned DataFrame, or None when ``input_file`` does not exist
        (an error message is printed in that case).
    """
    # 1. Load 'customer_reviews.csv'
    try:
        raw_reviews = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: {input_file} not found. Please run the generation script first.")
        return None
    print(f"Successfully loaded {len(raw_reviews)} rows.")

    # 2. Remove rows where Review_Text is missing
    with_text = raw_reviews.dropna(subset=['Review_Text'])

    # 3-4. Build the cleaned column as its own Series.
    # astype(str) keeps the .str accessor usable when pandas inferred a
    # non-string dtype (e.g. every review missing, or purely numeric reviews).
    # [^a-zA-Z0-9\s] means: keep letters, numbers, and whitespace; remove everything else
    cleaned_text = (
        with_text['Review_Text']
        .astype(str)
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
        .str.lower()
        .str.strip()
    )

    # assign() returns a new frame instead of writing into the dropna() result,
    # which avoids SettingWithCopyWarning on older pandas versions.
    # 5. Rows left with no text after cleaning are treated as missing as well.
    df = with_text.assign(Review_Text=cleaned_text).loc[cleaned_text.ne('')]

    # Save and display results
    df.to_csv(output_file, index=False)
    print(f"Cleaning complete. {len(df)} rows remaining.")
    print("\nFirst few rows of cleaned data:")
    print(df.head())

    return df

# Entry-point guard: clean the generated dataset using the default input and
# output paths. In a notebook kernel __name__ is "__main__", so this runs every
# time the cell is executed.
if __name__ == "__main__":
    clean_customer_reviews()

Successfully loaded 100 rows.
Cleaning complete. 90 rows remaining.

First few rows of cleaned data:
   Review_ID                                        Review_Text  Rating
0          1                          the battery life is goood       5
1          2     smooth performance definitely a goood purchase       5
2          3         the software is incredibly sloww and buggy       1
3          4  absolutely terribleee customer service and the...       2
4          5         the software is incredibly sloww and buggy       1
