<a href="https://colab.research.google.com/github/Kashaf-Abdullah/20sw027_DSA_LAB/blob/main/DSA_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the reviews dataset (adjust the path if 'amazon.csv' lives elsewhere)
df = pd.read_csv('amazon.csv')

# Get an overview of the dataset
print(df.head())      # Display the first few rows
# DataFrame.info() writes its report (dtypes, non-null counts) to stdout and
# returns None, so it is called directly; wrapping it in print() would emit a
# stray "None" line after the report.
df.info()
print(df.describe())  # Display summary statistics


         asin           name  rating               date  verified  \
0  B0000SX2UC          Janet       3   October 11, 2005     False   
1  B0000SX2UC     Luke Wyatt       1    January 7, 2004     False   
2  B0000SX2UC         Brooke       5  December 30, 2003     False   
3  B0000SX2UC  amy m. teague       3     March 18, 2004     False   
4  B0000SX2UC  tristazbimmer       4    August 28, 2005     False   

                                          title  \
0                   Def not best, but not worst   
1                   Text Messaging Doesn't Work   
2                               Love This Phone   
3                       Love the Phone, BUT...!   
4  Great phone service and options, lousy case!   

                                                body  helpfulVotes  
0  I had the Samsung A600 for awhile which is abs...           1.0  
1  Due to a software issue between Nokia and Spri...          17.0  
2  This is a great, reliable phone. I also purcha...           5.0  
3 

In [None]:
# Capture the DataFrame's column labels (a pandas Index) in `column_names`
column_names = df.columns

# Print the Index so the available columns are visible in the cell output
print(column_names)

Index(['asin', 'name', 'rating', 'date', 'verified', 'title', 'body',
       'helpfulVotes'],
      dtype='object')


DATA CLEANING


In [None]:

# Clean the reviews in a single chained pass:
#   1. discard rows missing any of name / rating / date / verified,
#   2. discard rows that are exact duplicates across all columns,
#   3. renumber the index from 0 so the dropped rows leave no gaps.
required_columns = ['name', 'rating', 'date', 'verified']
df = (
    df.dropna(subset=required_columns)
      .drop_duplicates()
      .reset_index(drop=True)
)

# Show the first rows of the cleaned DataFrame
print(df.head())

         asin           name  rating               date  verified  \
0  B0000SX2UC          Janet       3   October 11, 2005     False   
1  B0000SX2UC     Luke Wyatt       1    January 7, 2004     False   
2  B0000SX2UC         Brooke       5  December 30, 2003     False   
3  B0000SX2UC  amy m. teague       3     March 18, 2004     False   
4  B0000SX2UC  tristazbimmer       4    August 28, 2005     False   

                                          title  \
0                   Def not best, but not worst   
1                   Text Messaging Doesn't Work   
2                               Love This Phone   
3                       Love the Phone, BUT...!   
4  Great phone service and options, lousy case!   

                                                body  helpfulVotes  
0  I had the Samsung A600 for awhile which is abs...           1.0  
1  Due to a software issue between Nokia and Spri...          17.0  
2  This is a great, reliable phone. I also purcha...           5.0  
3 

DATA EXTRACTION


In [None]:
# Extract subsets of the reviews that satisfy specific conditions.

# Rows where 'rating' is greater than or equal to 4
high_rating_df = df[df['rating'] >= 4]

# Rows where 'verified' is True (the column itself serves as the boolean mask)
verified_reviews_df = df[df['verified']]

# Rows where 'helpfulVotes' is greater than 10.
# A NaN vote count compares as False here, so such rows are left out.
helpful_reviews_df = df[df['helpfulVotes'] > 10]

# Rows whose 'title' contains the keyword, ignoring case.
# na=False treats a missing title as "no match". Note that str.contains
# interprets the keyword as a regular expression by default.
keyword = 'interesting'
keyword_reviews_df = df[df['title'].str.contains(keyword, case=False, na=False)]

# Preview each subset under its own heading. The last heading is built from
# `keyword` so it cannot drift out of sync when the keyword is changed.
previews = [
    ("High Rating Reviews:", high_rating_df),
    ("\nVerified Reviews:", verified_reviews_df),
    ("\nHelpful Reviews:", helpful_reviews_df),
    (f"\nReviews with Keyword '{keyword}':", keyword_reviews_df),
]
for heading, subset in previews:
    print(heading)
    print(subset.head())



High Rating Reviews:
         asin                        name  rating               date  \
2  B0000SX2UC                      Brooke       5  December 30, 2003   
4  B0000SX2UC               tristazbimmer       4    August 28, 2005   
5  B0000SX2UC  the cell phone store owner       5     April 16, 2004   
6  B0000SX2UC                        Matt       4      April 3, 2004   
7  B0000SX2UC                Charles Cook       5  November 24, 2003   

   verified                                         title  \
2     False                               Love This Phone   
4     False  Great phone service and options, lousy case!   
5     False           Wanna cool Nokia? You have it here!   
6     False          Problem with 3588i universal headset   
7     False                            cool phone!!!!!!!!   

                                                body  helpfulVotes  
2  This is a great, reliable phone. I also purcha...           5.0  
4  The phone has been great for every pur

FEATURE ENGINEERING


In [None]:
# Date Feature:
# Extract year, month, day, and day of the week from the 'date' column to create new temporal features.
# The date strings are parsed once and the parsed Series is reused for every
# feature, instead of re-parsing the whole column for each new column.
review_dates = pd.to_datetime(df['date'])
df['year'] = review_dates.dt.year
df['month'] = review_dates.dt.month
df['day'] = review_dates.dt.day
df['day_of_week'] = review_dates.dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6


In [None]:
# Text Length:
# Character counts of the 'body' and 'title' text, as features indicating the length of the review.
#
# Missing text (NaN) is treated as an empty string and therefore gets length 0.
# A plain .apply(len) would raise TypeError on NaN, because NaN is a float and
# has no len(). astype(str) guards against any other non-string cell, and
# .str.len() is the vectorised equivalent of calling len() on each string.

df['body_length'] = df['body'].fillna('').astype(str).str.len()
df['title_length'] = df['title'].fillna('').astype(str).str.len()


In [None]:
# Review Sentiment:
# Score the 'body' and 'title' text with TextBlob and store the polarity of
# each as a new sentiment feature.


from textblob import TextBlob


def _polarity(text):
    """Return the TextBlob sentiment polarity of ``text`` (a string)."""
    return TextBlob(text).sentiment.polarity


# TextBlob only accepts string input, so missing text (NaN, a float) is
# replaced by '' before scoring instead of letting the cell crash.
# NOTE(review): an empty string is expected to score as neutral (0.0) — confirm.
df['body_sentiment'] = df['body'].fillna('').apply(_polarity)
df['title_sentiment'] = df['title'].fillna('').apply(_polarity)


In [None]:
# Interaction Features:
# Combine two existing columns into one: the element-wise product of 'rating'
# and 'helpfulVotes' captures the interaction between these two features.

star_rating = df['rating']
vote_count = df['helpfulVotes']
df['rating_helpful_interaction'] = star_rating.mul(vote_count)


In [None]:
# Aggregated Features:
# Compute the average rating of each product (rows grouped by 'asin').

# Per-product lookup table: one row per 'asin' with its mean rating.
product_avg_rating = (
    df.groupby('asin')['rating']
      .mean()
      .reset_index()
      .rename(columns={'rating': 'avg_rating_per_product'})
)

# Broadcast each product's mean rating back onto its review rows.
# transform('mean') returns a Series aligned with df's index, so this is a
# plain column assignment: re-running the cell overwrites the column rather
# than merging a second time and producing '_x' / '_y' suffixed duplicates.
df['avg_rating_per_product'] = df.groupby('asin')['rating'].transform('mean')
