In [1]:
"""
# Noon Company Review: Data Loading and Preprocessing

# This notebook demonstrates the steps to load the CSV file containing Reddit comments, inspect the data, extract the comment text along with their target sentiment labels, and group the comments by topic (post title).

**Data File:** `noon_filtered_comments_20250128_221435.csv`

**Columns in the CSV include:**
- `post_id`
- `post_title`
- `comment_id`
- `author`
- `comment_text`
- `score`
- `created_utc`
- `is_submitter`
- `contains_noon_mention`
- `type`
- `Target`
"""

'\n# Noon Company Review: Data Loading and Preprocessing\n\n# This notebook demonstrates the steps to load the CSV file containing Reddit comments, inspect the data, extract the comment text along with their target sentiment labels, and group the comments by topic (post title).\n\n**Data File:** `noon_filtered_comments_20250128_221435.csv`\n\n**Columns in the CSV include:**\n- `post_id`\n- `post_title`\n- `comment_id`\n- `author`\n- `comment_text`\n- `score`\n- `created_utc`\n- `is_submitter`\n- `contains_noon_mention`\n- `type`\n- `Target`\n'

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np

# (Optional) For visualization and further analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Configure visualization settings:
# render matplotlib figures inline, directly beneath the cell that produces them
%matplotlib inline
# apply seaborn's "whitegrid" style to plots drawn after this point
sns.set(style="whitegrid")

In [5]:
# Location of the filtered Reddit comments export, relative to this notebook
csv_file_path = '../Data/processed/noon_filtered_comments_20250128_221435.csv'

# Parse the export into a DataFrame (read_csv splits on commas by default)
df = pd.read_csv(csv_file_path)

# Preview the first five rows to sanity-check the parse
df.head(5)

Unnamed: 0,post_id,post_title,comment_id,author,comment_text,score,created_utc,is_submitter,contains_noon_mention,type,Target
0,1ec7r4l,My bank just woke me up at 2:00 in the morning...,ley1hzs,hanihaneefa,Noon doesn't ask for otp when using cards... H...,75,26/07/2024 2:29,False,True,comment,Negative
1,1ec7r4l,My bank just woke me up at 2:00 in the morning...,lf0s1re,SnooGuavas4756,UPDATE: Noon.com has reached out to me on Redd...,9,26/07/2024 16:05,True,True,comment,Positive
2,1ec7r4l,My bank just woke me up at 2:00 in the morning...,lf06mu0,Dansdan84,The bank should be able to give the money back...,2,26/07/2024 12:37,False,True,comment,Positive
3,1ec7r4l,My bank just woke me up at 2:00 in the morning...,lf1m0tu,haruror,This happened to me recently. Why is it always...,2,26/07/2024 19:11,False,True,comment,Positive
4,1ec7r4l,My bank just woke me up at 2:00 in the morning...,lfartnt,Impressive_End_1222,Something similar happened and in case of no O...,2,28/07/2024 9:47,False,True,comment,Positive


In [6]:
# Dimensions of the loaded frame as (rows, columns)
print("DataFrame Shape:", df.shape)

# Per-column non-null counts and dtypes; info() prints its report itself
print("\nDataFrame Info:")
df.info()

# Summary statistics across every column (include="all" covers the
# non-numeric columns too, reporting count/unique/top/freq for them)
df.describe(include="all")

DataFrame Shape: (519, 11)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519 entries, 0 to 518
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   post_id                519 non-null    object
 1   post_title             519 non-null    object
 2   comment_id             519 non-null    object
 3   author                 509 non-null    object
 4   comment_text           519 non-null    object
 5   score                  519 non-null    int64 
 6   created_utc            519 non-null    object
 7   is_submitter           519 non-null    bool  
 8   contains_noon_mention  519 non-null    bool  
 9   type                   519 non-null    object
 10  Target                 519 non-null    object
dtypes: bool(2), int64(1), object(8)
memory usage: 37.6+ KB


Unnamed: 0,post_id,post_title,comment_id,author,comment_text,score,created_utc,is_submitter,contains_noon_mention,type,Target
count,519,519,519,509,519,519.0,519,519,519,519,519
unique,86,85,519,381,516,,513,2,1,1,2
top,1ah6j9m,Be careful purchasing from Noon!,ley1hzs,Distinct-Drama7372,Even noon doesn‚Äôt know,,24/09/2024 17:55,False,True,comment,Positive
freq,42,42,1,14,2,,2,462,519,519,487
mean,,,,,,5.337187,,,,,
std,,,,,,17.203079,,,,,
min,,,,,,-11.0,,,,,
25%,,,,,,1.0,,,,,
50%,,,,,,2.0,,,,,
75%,,,,,,3.0,,,,,


In [7]:
# Keep only the columns used from here on: the topic (post title), the
# comment body, and its sentiment label.
# .copy() makes df_comments an independent frame, so columns can later be
# added or modified on it without pandas raising SettingWithCopyWarning
# and without any ambiguity about whether `df` is affected.
df_comments = df[['post_title', 'comment_text', 'Target']].copy()

# Show the first few rows of the extracted DataFrame
df_comments.head()

Unnamed: 0,post_title,comment_text,Target
0,My bank just woke me up at 2:00 in the morning...,Noon doesn't ask for otp when using cards... H...,Negative
1,My bank just woke me up at 2:00 in the morning...,UPDATE: Noon.com has reached out to me on Redd...,Positive
2,My bank just woke me up at 2:00 in the morning...,The bank should be able to give the money back...,Positive
3,My bank just woke me up at 2:00 in the morning...,This happened to me recently. Why is it always...,Positive
4,My bank just woke me up at 2:00 in the morning...,Something similar happened and in case of no O...,Positive


In [8]:
# Group by topic (post_title) and collect each topic's comments and sentiment
# labels into lists. Both columns are aggregated in one pass, so the i-th
# label in `Target` always belongs to the i-th comment in `comment_text`.
#
# NOTE: the recorded describe() output reports 86 unique post_id values but
# only 85 unique post_title values, so two different posts share a title and
# their comments land in the same group here. Carry post_id into df_comments
# and group on it as well if posts must be kept apart.
grouped_data = (
    df_comments
    .groupby('post_title')[['comment_text', 'Target']]
    .agg(list)
    .reset_index()
)

# Single-column views, kept under their original names for any cell that
# still refers to them
grouped_comments = grouped_data[['post_title', 'comment_text']]
grouped_targets = grouped_data[['post_title', 'Target']]

# Display the first few topics as an example
grouped_data.head()

Unnamed: 0,post_title,comment_text,Target
0,Advice about noon,[Oh pls... Don't buy anything off noon. Go to ...,"[Positive, Positive, Positive, Positive, Posit..."
1,Aliexpress shipping,[Aliexpress products come from China directly....,[Positive]
2,Amazon & Noon items,[Yes most of them are genuine items but you ne...,"[Positive, Positive]"
3,Amazon and Noon cannibalising it's market place,[The whole reason for marketplace is for them ...,[Positive]
4,Amazon/Noon Listing in the UAE,[Zero entry barriers. Some paperwork required ...,[Positive]


In [9]:
# Per-comment arrays for working with individual examples:
#   X - the comment text (features)
#   y - the matching sentiment label
# to_numpy() is the pandas-recommended accessor and always yields a NumPy array.
X = df_comments['comment_text'].to_numpy()
y = df_comments['Target'].to_numpy()

# Display up to the first 5 examples. Slicing (rather than indexing 0..4)
# keeps this cell from raising IndexError when fewer than five rows exist.
for comment, sentiment in zip(X[:5], y[:5]):
    print(f"Comment: {comment}")
    print(f"Sentiment: {sentiment}")
    print("-" * 80)

Comment: Noon doesn't ask for otp when using cards... How can I set my card to ask for otp ALL THE TIME?
Sentiment: Negative
--------------------------------------------------------------------------------
Comment: UPDATE: Noon.com has reached out to me on Reddit. I got a call from Noon.com customer service and a complaint has been raised. This is responsive and I hope they are able to track the delivery addresses of the scammer.
Sentiment: Positive
--------------------------------------------------------------------------------
Comment: The bank should be able to give the money back when no OTP has been used. And arrange with noon.
Sentiment: Positive
--------------------------------------------------------------------------------
Comment: This happened to me recently. Why is it always noon being charged
Sentiment: Positive
--------------------------------------------------------------------------------
Comment: Something similar happened and in case of no OTP means their security sys