<a href="https://colab.research.google.com/github/Intel-Unnathi-Intership-Program/Product_Sentiment_Analysis/blob/main/cleaningi5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure the required NLTK data packages are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
file_path = r"C:\Users\Ananya\Documents\Intel Dataset\i5_reviews.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Original Data:")
print(df.head())

# Data Cleaning Steps

# 1. Handling missing values
# Drop rows that have a missing value in any column
df.dropna(inplace=True)

# 2. Removing duplicates
# NOTE: this step is currently disabled, so duplicate rows are kept.
#df.drop_duplicates(inplace=True)

# 3. Normalizing text (assuming there's a 'review_body' column)
df['review_body'] = (
    df['review_body']
    # Convert column to string type to handle non-string values
    .astype(str)
    # Convert text to lowercase
    .str.lower()
    # Replace punctuation and special characters with a space instead of
    # deleting them: deletion fuses neighbouring words ("well-known" ->
    # "wellknown") and turns contractions such as "don't" into "dont",
    # which is not a stop word. With a space, "don't" becomes "don t" and
    # both fragments are in the NLTK English stop-word list. The extra
    # whitespace is collapsed later by the split/join in clean_text.
    .str.replace(r'[^\w\s]', ' ', regex=True)
    # Remove numbers
    .str.replace(r'\d+', '', regex=True)
)

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Stop-word set used by clean_text (a set gives O(1) membership tests)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Strip the "read more" marker, drop stop words and lemmatize the rest.

    The "read more" match is case-sensitive, so callers are expected to
    lower-case ``text`` first.

    Args:
        text: Review text to clean.

    Returns:
        The surviving words, lemmatized as verbs, joined by single spaces.
    """
    # Match "read more" only as whole words; a plain substring replace would
    # also cut into other words (e.g. "thread more" -> "th").
    text = re.sub(r'\bread more\b', '', text)
    kept = [word for word in text.split() if word not in stop_words]
    # pos='v' asks WordNet to treat every token as a verb when lemmatizing
    return ' '.join(lemmatizer.lemmatize(word, pos='v') for word in kept)

# Run every review through clean_text and store the result in place
df['review_body'] = df['review_body'].apply(clean_text)

# Write the cleaned dataset to a temporary CSV, leaving out the index column
temp_file_path = r"C:\Users\Ananya\Documents\Intel Dataset\temp_i5_reviews.csv"
df.to_csv(path_or_buf=temp_file_path, index=False)

# Preview the first few rows of the cleaned dataset
print("Cleaned Data:", df.head(), sep="\n")

Original Data:
  Processor                                                URL  review_rating  \
0        i5  https://www.amazon.in/Intel-Generation-Desktop...            5.0   
1        i5  https://www.amazon.in/Intel-Generation-Desktop...            5.0   
2        i5  https://www.amazon.in/Intel-Generation-Desktop...            5.0   
3        i5  https://www.amazon.in/Intel-Generation-Desktop...            4.0   
4        i5  https://www.amazon.in/Intel-Generation-Desktop...            5.0   

                                         review_body  
0  The processor was very power efficient i devel...  
1  best budget range and entry level processor ou...  
2  This processor is great mid to high range budg...  
3  This will not work without graphic card, So be...  
4     Thanku Appario you send me best cpu\nRead more  
Cleaned Data:
  Processor                                                URL  review_rating  \
0        i5  https://www.amazon.in/Intel-Generation-Desktop...           

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ananya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ananya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
