In [76]:
import pandas as pd

# The file is decoded as ISO-8859-1 instead of pandas' default encoding.
data = pd.read_csv(
    'flipkart_product.csv',
    encoding='ISO-8859-1',
)

# Sanity check: show the top rows of the freshly loaded frame.
print(data.head())


                                         ProductName     Price Rate  \
0  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999    5   
1  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999    5   
2  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999    3   
3  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999    1   
4  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999    3   

            Review                                            Summary  
0           Super!  Great cooler.. excellent air flow and for this...  
1          Awesome             Best budget 2 fit cooler. Nice cooling  
2             Fair  The quality is good but the power of air is de...  
3  Useless product                 Very bad product it's a only a fan  
4             Fair                                      Ok ok product  


In [77]:
# Discard the product name and price columns. errors='ignore' keeps this cell
# re-runnable: once the columns are gone, a plain drop() would raise KeyError
# on a second execution because `data` is rebound in place.
data = data.drop(columns=['ProductName', 'Price'], errors='ignore')


In [78]:
# Remove rows that repeat an earlier row in every column; the first occurrence
# of each is the one that stays.
data = data.drop_duplicates(subset=None, keep='first')


In [80]:
# Parse ratings as numbers; values that cannot be parsed become NaN and the
# rows holding them are discarded.
data = (
    data
    .assign(Rate=pd.to_numeric(data['Rate'], errors='coerce'))
    .dropna(subset=['Rate'])
)

# Binary label: 1 for ratings of 4 and above, 0 for everything below.
data = data.assign(sentiment=(data['Rate'] >= 4).astype('int64'))

# Check the result
print(data.head())

   Rate           Review                                            Summary  \
0   5.0           Super!  Great cooler.. excellent air flow and for this...   
1   5.0          Awesome             Best budget 2 fit cooler. Nice cooling   
2   3.0             Fair  The quality is good but the power of air is de...   
3   1.0  Useless product                 Very bad product it's a only a fan   
4   3.0             Fair                                      Ok ok product   

   sentiment  
0          1  
1          1  
2          0  
3          0  
4          0  


In [82]:
import nltk

# Fetch the NLTK 'punkt' package into the local nltk_data directory.
# NOTE(review): no cell visible here uses nltk afterwards — confirm this
# download is still needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Shubham
[nltk_data]     Yadav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [89]:
def sampling_k_elements(group, k=150000, random_state=42):
    """Cap a group at ``k`` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one group; must support ``len()`` and ``.sample()``.
    k : int, default 150000
        Maximum number of rows to keep.
    random_state : int, default 42
        Seed forwarded to ``group.sample`` so the draw is reproducible. The
        default matches the seed this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself (the same object, untouched) when it has fewer than
        ``k`` rows; otherwise a random sample of exactly ``k`` rows drawn
        without replacement.
    """
    # Small groups are passed through as-is: sample() without replacement
    # cannot draw more rows than the group contains.
    if len(group) < k:
        return group
    return group.sample(n=k, random_state=random_state)

# Cap each sentiment class at the default k of sampling_k_elements.
# NOTE: this only equalises the classes when every class has at least k rows;
# a smaller class is passed through unchanged, so finalDF can stay imbalanced.
#
# The groups are concatenated explicitly rather than via groupby().apply():
# recent pandas versions warn that apply() operating on the grouping column is
# deprecated. Iterating the groupby yields the groups in sorted key order, so
# the rows and their order match the previous apply(...).reset_index(drop=True).
sampled_groups = [sampling_k_elements(group) for _, group in data.groupby('sentiment')]
finalDF = (
    pd.concat(sampled_groups, ignore_index=True)
    if sampled_groups
    else data.reset_index(drop=True)  # no groups at all: pd.concat([]) would raise
)


  finalDF = data.groupby('sentiment').apply(sampling_k_elements).reset_index(drop=True)


In [90]:
# Write finalDF to CSV without the index column.
# NOTE(review): despite the file name, the value_counts output printed further
# down reports 86487 vs 30449 rows per class for finalDF.
finalDF.to_csv('flipkart_reviews_balanced.csv', index=False)


In [91]:
# Preview the first rows of finalDF
print(finalDF.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
finalDF.info()

# Display the distribution of sentiment labels
print(finalDF['sentiment'].value_counts())


   Rate           Review                                            Summary  \
0   3.0             Fair  The quality is good but the power of air is de...   
1   1.0  Useless product                 Very bad product it's a only a fan   
2   3.0             Fair                                      Ok ok product   
3   3.0             Nice                                          Very nice   
4   1.0   Unsatisfactory                                    Very bad cooler   

   sentiment  
0          0  
1          0  
2          0  
3          0  
4          0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116936 entries, 0 to 116935
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Rate       116936 non-null  float64
 1   Review     116933 non-null  object 
 2   Summary    116931 non-null  object 
 3   sentiment  116936 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.6+ MB
None
sentiment
1 

In [94]:
import pandas as pd



# Drop rows with any missing values
# NOTE(review): `df` is not assigned in any cell visible here, so this only
# runs if the name already exists in the kernel — confirm which frame is meant.
df_cleaned = df.dropna()

# Check the shape of the new DataFrame
print(df_cleaned.shape)


(180379, 6)


In [96]:
pip install imbalanced-learn


Collecting imbalanced-learnNote: you may need to restart the kernel to use updated packages.

  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Collecting scipy>=1.5.0 (from imbalanced-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn>=1.0.2 (from imbalanced-learn)
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting threadpoolctl>=2.0.0 (from imbalanced-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   -------- ------------------------------- 2.4/11.0 MB 10.3 MB/s eta 0:00:01
   ---------- ----------------------------- 2.9/11.0 MB 8.4 MB/s eta 0:00:01
   ------------- -------------------------- 3.7/11.0 MB 5.9 MB/s eta 0:00:02
   ------------------ -------

In [98]:
# Print the number of rows per sentiment label in finalDF
print(finalDF['sentiment'].value_counts())


sentiment
1    86487
0    30449
Name: count, dtype: int64


In [113]:
# Inputs are the two text columns; the target is the sentiment label.
# Both are taken from finalDF as it stands when this cell runs.
X = finalDF.loc[:, ['Review', 'Summary']]
y = finalDF.loc[:, 'sentiment']


In [114]:
from sklearn.utils import resample

# Split the rows by label: label 1 is treated as the majority class and
# label 0 as the minority class.
df_majority = finalDF.loc[finalDF['sentiment'].eq(1)]
df_minority = finalDF.loc[finalDF['sentiment'].eq(0)]

# Draw, without replacement and with a fixed seed, exactly as many majority
# rows as there are minority rows.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

# Stack the downsampled majority rows on top of all minority rows; the
# original row labels are kept as the index.
finalDF_balanced = pd.concat([df_majority_downsampled, df_minority], axis=0)

# Show the per-class row counts after downsampling
print(finalDF_balanced['sentiment'].value_counts())



sentiment
1    30449
0    30449
Name: count, dtype: int64


In [115]:
# Print the number of rows per sentiment label in finalDF_balanced
print(finalDF_balanced['sentiment'].value_counts())


sentiment
1    30449
0    30449
Name: count, dtype: int64


In [116]:
# Write finalDF_balanced to CSV; index=False leaves the row labels out of the file.
finalDF_balanced.to_csv('balanced_dataset.csv', index=False)


In [117]:
# Print the number of rows per sentiment label in finalDF_balanced
print(finalDF_balanced['sentiment'].value_counts())

sentiment
1    30449
0    30449
Name: count, dtype: int64


In [121]:
# Show the first rows of finalDF_balanced
print(finalDF_balanced.head())

        Rate             Review  \
55752    5.0          Excellent   
40271    5.0          Fabulous!   
65229    4.0        Really Nice   
105812   5.0  Worth every penny   
31081    5.0     Classy product   

                                                  Summary  sentiment  
55752   Good juicer. Easy to use and also clean. Every...          1  
40271                        Very good product. Worthful.          1  
65229                                       Nice ðð          1  
105812         Quality is very good, looking is beautiful          1  
31081   Its an awesome product and I am not making any...          1  


In [122]:
# info() prints the summary itself and returns None; calling it directly avoids
# the extra "None" line that wrapping it in print() produced.
finalDF_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60898 entries, 55752 to 30448
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Rate       60898 non-null  float64
 1   Review     60897 non-null  object 
 2   Summary    60895 non-null  object 
 3   sentiment  60898 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 2.3+ MB
None


In [123]:
# Print the number of rows per sentiment label in finalDF_balanced
print(finalDF_balanced['sentiment'].value_counts())

sentiment
1    30449
0    30449
Name: count, dtype: int64
