In [1]:
import pandas as pd
import numpy as np
import re
import string
import os
import warnings
warnings.filterwarnings('ignore')


In [2]:
# LOAD THE DATASET
# NOTE(review): this is an absolute, machine-specific location; adjust it to
# wherever your copy of the reviews CSV lives before running the notebook.
file_path = r'D:\sentiment_analysis\reviews.csv'
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print(f" Error: File '{file_path}' not found.")
    # The path above is absolute, so the hint must point at that path rather
    # than at the notebook's own folder.
    print(" Please check that the CSV file exists at the path shown above.")
    # Re-raise instead of calling exit(): inside a notebook kernel exit()
    # requests a kernel shutdown, which throws away all state and hides the
    # real error. Re-raising stops execution here with a normal traceback.
    raise

# Display the first five rows of the raw data
print("\nFirst 5 rows of raw data:")
print(df.head())


Dataset loaded successfully!
Dataset shape: (413840, 6)

First 5 rows of raw data:
                                        Product Name Brand Name   Price  \
0  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
2  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
3  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
4  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   

   Rating                                            Reviews  Review Votes  
0       5  I feel so LUCKY to have found this used (phone...           1.0  
1       4  nice phone, nice up grade from my pantach revu...           0.0  
2       5                                       Very pleased           0.0  
3       4  It works good but it goes slow sometimes but i...           0.0  
4       4  Great phone to replace my lost phone. The only...           0.0  


In [3]:

# List every column together with the dtype pandas inferred for it
column_dtypes = df.dtypes
print(column_dtypes)

Product Name     object
Brand Name       object
Price           float64
Rating            int64
Reviews          object
Review Votes    float64
dtype: object


In [4]:
# DATA CLEANING

# 1. Report the number of null entries per column
print("\nMissing values in each column:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# 2. Discard any row that lacks a review text, a brand, or a product name.
#    The row count is captured first so the number of removed rows can be reported.
key_columns = ['Reviews', 'Brand Name', 'Product Name']
initial_count = len(df)
df.dropna(subset=key_columns, inplace=True)
dropped_rows = initial_count - len(df)
print(f" Dropped {dropped_rows} rows with missing values in key columns")

# 3. Cleaning text data 
def clean_text(text):
    """Normalise a single text value for later matching and analysis.

    The value is converted to ``str``, lower-cased, stripped of every
    character in ``string.punctuation`` (the characters are deleted, not
    replaced by spaces), and has each run of whitespace collapsed to one
    space with leading/trailing whitespace removed.

    Parameters
    ----------
    text : object
        Value to clean; anything ``pd.isna`` reports as missing yields "".

    Returns
    -------
    str
        The cleaned text, or an empty string for a missing value.
    """
    # Missing values become an empty string instead of the literal "nan"
    if pd.isna(text):
        return ""

    lowered = str(text).lower().strip()

    # Translation table whose third argument lists the characters to delete
    punctuation_remover = str.maketrans('', '', string.punctuation)
    without_punctuation = lowered.translate(punctuation_remover)

    # Squeeze whitespace runs (spaces, tabs, newlines) into single spaces
    return re.sub(r'\s+', ' ', without_punctuation).strip()
# Run the same cleaner over both free-text columns, overwriting them in place
for text_column in ('Reviews', 'Product Name'):
    df[text_column] = df[text_column].apply(clean_text)

remaining_reviews = len(df)
print(f"\n After cleaning: {remaining_reviews} reviews remain.")


Missing values in each column:
Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            70
Review Votes    12296
dtype: int64
 Dropped 65224 rows with missing values in key columns

 After cleaning: 348616 reviews remain.


In [5]:
# STEP 3: FILTER FOR SAMSUNG PRODUCTS

# Frequency of each distinct brand name in the dataset
print("\nUnique brand names in dataset:")
brand_column = df['Brand Name']
print(brand_column.value_counts())

# Select every row whose brand contains "samsung" in any letter case.
# This is a substring match, so brand values that merely include the word
# are selected too; na=False makes a missing brand count as a non-match.
samsung_brand_mask = brand_column.str.contains('samsung', case=False, na=False)
df_samsung_brand = df[samsung_brand_mask].copy()

samsung_brand_total = len(df_samsung_brand)
print(f"\nFound {samsung_brand_total} Samsung brand reviews.")


Unique brand names in dataset:
Brand Name
Samsung             65725
BLU                 63246
Apple               58179
LG                  22410
BlackBerry          16872
                    ...  
Blackberry (Rim)        1
TracFone                1
ThL                     1
Danger Inc.             1
ToShare Tech            1
Name: count, Length: 384, dtype: int64

Found 69781 Samsung brand reviews.


In [6]:
# Keep only Samsung-brand rows whose product name also mentions "samsung"
product_names = df_samsung_brand['Product Name']
samsung_product_mask = product_names.str.contains('samsung', case=False, na=False)
df_samsung = df_samsung_brand[samsung_product_mask].copy()

# Show the ten product names with the most reviews
top_products = df_samsung['Product Name'].value_counts().head(10)
print(top_products)

# Export the columns needed downstream to <output_folder>/samsungreviews.csv,
# creating the folder first if it does not exist yet
export_columns = ['Product Name', 'Brand Name', 'Rating', 'Reviews']
final_df = df_samsung[export_columns].copy()
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

output_path = os.path.join(output_folder, 'samsungreviews.csv')
final_df.to_csv(output_path, index=False, encoding='utf-8')

Product Name
samsung galaxy s duos ii s7582 dual sim factory unlocked international version black                               1109
samsung galaxy s duos gts7562 gsm unlocked touchscreen 5mp camera smartphone white                                 1096
samsung galaxy s4 i9505 16gb lte unlocked international version white                                              1095
samsung galaxy s5 smg900a gsm unlocked cellphone 16gb blue                                                         1042
samsung galaxy exhibit 4g tmobile t679                                                                              990
samsung galaxy s7 edge g9350 32gb hk dual sim factory unlocked gsm international version no warranty blue coral     944
samsung s5830 galaxy ace unlocked phone black                                                                       901
samsung galaxy s5 mini g800h unlocked cellphone international version 16gb white                                    898
samsung galaxy s3 sghi747 4

In [7]:
# FILTER SPECIFIC PRODUCT

# Target product, written in the lowercase, punctuation-free form that the
# 'Product Name' column has after cleaning
product_name = "samsung galaxy s duos ii s7582 dual sim factory unlocked international version black"

# Exact-equality match on the product name
is_target_product = df_samsung['Product Name'] == product_name
df_product = df_samsung[is_target_product].copy()

# Write the CSV only when at least one matching row exists
if df_product.empty:
    print("\n No data found for the specific product.")
else:
    specific_product_path = os.path.join(output_folder, 'specific_product.csv')
    df_product.to_csv(specific_product_path, index=False, encoding='utf-8')
    print(f"\n Specific product data saved to '{specific_product_path}'")


 Specific product data saved to 'output\specific_product.csv'
