In [1]:
# Sentiment Analysis on the Skincare Brand 'The Ordinary'

In [7]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import praw
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS

In [15]:
# Connecting to the Reddit API to retrieve posts and comments from the SkincareAddiction subreddit (with 3.9 million subscribers) using the keyword
# 'ordinary' to analyze the brand. The retrieved data is kept in pandas DataFrames.

# Credentials
# Secrets are read from the environment so they never live in the notebook or its history.
# Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running; a missing variable raises KeyError.
# NOTE(review): the client id/secret that used to be hardcoded in this cell should be treated
# as leaked and regenerated in the Reddit app settings.
client_id = os.environ['REDDIT_CLIENT_ID']
client_secret = os.environ['REDDIT_CLIENT_SECRET']
user_agent = os.environ.get('REDDIT_USER_AGENT', 'my_bot/0.1 by u/No-Cherry-3059')

# Initializing PRAW with the credentials
reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=user_agent)

# Setting the variables
sub = 'SkincareAddiction'
word = 'ordinary'
max_posts = 300              # upper bound on the number of search results requested
max_comments_per_post = 100  # upper bound on the number of comments kept for each post

# Retrieving posts (limited to max_posts) and associated comments from the SkincareAddiction subreddit
subreddit = reddit.subreddit(sub)
posts = list(subreddit.search(word, limit=max_posts))

# One record per retrieved post
PostData_Cleaned = [
    {
        'title': post.title,
        'selftext': post.selftext,
        'score': post.score,
        'num_comments': post.num_comments,
        'created': post.created_utc
    }
    for post in posts
]

# One record per kept comment; the parent post's fields are repeated on every comment row
PostComments_Cleaned = []

for post in posts:
    comment_count = 0
    for comment in post.comments:
        # MoreComments entries are not actual comments (they have no body to analyse), so skip them
        if isinstance(comment, praw.models.MoreComments):
            continue
        # Stop walking this post's comments as soon as the cap is reached
        if comment_count >= max_comments_per_post:
            break
        PostComments_Cleaned.append({
            'title': post.title,
            'selftext': post.selftext,
            'num_comments': post.num_comments,
            'comment': comment.body,
            'created': post.created_utc
        })
        comment_count += 1

# Explicit column lists keep the schema stable even when nothing was retrieved,
# so the cleaning cells below do not fail on a missing 'created' column
df_comments = pd.DataFrame(PostComments_Cleaned,
                           columns=['title', 'selftext', 'num_comments', 'comment', 'created'])
df_posts = pd.DataFrame(PostData_Cleaned,
                        columns=['title', 'selftext', 'score', 'num_comments', 'created'])


In [16]:
# Data cleaning
# The 'created' column holds the creation time as seconds since the Unix epoch.
# It is converted to a calendar date stored in a new 'date' column, and 'created' is then dropped.

def _epoch_to_date(frame):
    """Return a copy of ``frame`` with 'created' (epoch seconds) replaced by a 'date' column."""
    dates = pd.to_datetime(frame['created'], unit='s').dt.date
    return frame.assign(date=dates).drop(columns='created')

df_comments = _epoch_to_date(df_comments)
df_posts = _epoch_to_date(df_posts)

In [17]:
# 229 posts related to the topic of The Ordinary brand were retrieved from the subreddit
# (see the RangeIndex line in the output below); 'selftext' is missing for some of them
df_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         229 non-null    object
 1   selftext      151 non-null    object
 2   score         229 non-null    int64 
 3   num_comments  229 non-null    int64 
 4   date          229 non-null    object
dtypes: int64(2), object(3)
memory usage: 9.1+ KB


In [None]:
# Keeping only the retrieved posts whose title contains the word 'ordinary' (case-insensitive)
# and replacing any NaN values in the selftext column with a single space
title_mentions_brand = df_posts['title'].str.contains('ordinary', case=False)
ordinary_posts = df_posts[title_mentions_brand].copy()
ordinary_posts['selftext'] = ordinary_posts['selftext'].fillna(' ')

In [None]:
# Displaying the posts whose title mentions 'ordinary'
ordinary_posts

In [None]:
# Performing sentiment analysis using TextBlob on the posts, which includes both the title and the selftext (the author's text following the title)


def analyze_sentiment(text):
    """Score a piece of text with TextBlob.

    Parameters
    ----------
    text : str or None
        The text to analyse. Missing values (``None`` or a float NaN, as pandas
        uses for empty cells) are accepted and treated as text without sentiment.

    Returns
    -------
    tuple of (float, float)
        ``(polarity, subjectivity)`` as reported by ``TextBlob(text).sentiment``;
        ``(0.0, 0.0)`` for missing values.
    """
    # TextBlob only accepts strings; a missing cell is scored as neutral, which matches
    # how the notebook treats posts without selftext (filled with a blank string).
    if text is None or (isinstance(text, float) and text != text):
        return 0.0, 0.0
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [None]:
def add_sentiment_columns(df, column_to_analyse, polarity_column_name='polarity post', subjectivity_column_name='subjectivity post'):
    """Add polarity and subjectivity columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    column_to_analyse : str
        Name of the column in ``df`` whose values are passed to ``analyze_sentiment``.
    polarity_column_name : str
        Name of the column that receives the polarity scores.
    subjectivity_column_name : str
        Name of the column that receives the subjectivity scores.

    Returns
    -------
    None
    """
    # Score every row once, then build the two columns directly. This avoids creating a
    # pandas Series per row and also works when ``df`` has no rows.
    scores = [analyze_sentiment(text) for text in df[column_to_analyse]]
    df[polarity_column_name] = pd.Series(
        [polarity for polarity, _ in scores], index=df.index, dtype=float)
    df[subjectivity_column_name] = pd.Series(
        [subjectivity for _, subjectivity in scores], index=df.index, dtype=float)

In [None]:
# Scoring both text fields of every post: (source column, polarity column, subjectivity column)
sentiment_targets = [
    ('title', 'polarity post', 'subjectivity post'),
    ('selftext', 'polarity selftext', 'subjectivity selftext'),
]
for text_column, polarity_name, subjectivity_name in sentiment_targets:
    add_sentiment_columns(ordinary_posts, text_column, polarity_name, subjectivity_name)

In [None]:
# Displaying the posts together with the new polarity / subjectivity columns
ordinary_posts

In [None]:
# Defining a function to create visualizations of the sentiment polarity distribution

def plot_polarity(polarity_column, viz_name):
    """Show a 20-bin histogram with a KDE curve of the given polarity values.

    ``polarity_column`` holds the values to plot and ``viz_name`` is used as the figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(polarity_column, bins=20, kde=True, ax=ax)
    ax.set_title(viz_name)
    ax.set_xlabel('Polarity')
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Plotting the distribution of post title polarity. Polarity ranges from -1 (very negative) to 1 (very positive),
# with 0 representing neutral sentiment.

plot_polarity(ordinary_posts['polarity post'], 'Distribution of Post Title Polarity')

In [None]:
# Plotting the distribution of the post selftext polarity

plot_polarity(ordinary_posts['polarity selftext'], 'Distribution of Post Selftext Polarity')


In [None]:
# The title polarity distribution indicates that the majority sentiment is slightly negative. 
# Let's explore the reasons for this using a Word Cloud technique.

In [None]:
# Defining a function to apply Word Cloud analysis

def cloud_analysis(column_to_analyse, stop_words=None):
    """Render a word cloud built from the text values of a column.

    Parameters
    ----------
    column_to_analyse : pandas.Series
        Column of text values; missing values are skipped.
    stop_words : set of str, optional
        Words to exclude from the cloud. When omitted, the notebook-level
        ``stopwords`` set is used as it is bound at call time.
    """
    if stop_words is None:
        stop_words = stopwords
    # Join the raw strings instead of using Series.to_string(): to_string() produces a printed
    # table (index labels, escaped line breaks, display-option formatting), which feeds
    # artefacts rather than the actual text into the word cloud.
    text = ' '.join(column_to_analyse.dropna().astype(str))
    title_cloud = WordCloud(stopwords=stop_words).generate(text)
    plt.imshow(title_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


In [None]:
# Extra stop words on top of the wordcloud defaults: fragments, greetings and words
# that appear in nearly every post of this subreddit

extra_stop = {
    'n', 'nk', 's', 'yes', 'welcome', 'm', 'deleted', 'thank',
    'skincareaddiction', 'skincare', 'hi', 'everyone',
    'use', 'product', 'ordinary', 'products', 'skin', 've',
}
stopwords = STOPWORDS | extra_stop

In [None]:
# Generating the Word Cloud for the post titles and plotting the result

cloud_analysis(ordinary_posts['title'])

In [None]:
# Analysis of the titles of all posts about ordinary products reveals that people are most concerned with routine help
# and have questions about the products

In [None]:
# Exploring in more detail with a Word Cloud of the selftext of the posts

cloud_analysis(ordinary_posts['selftext'])

In [None]:
# Based on the Word Cloud generated from over 150 non-null selftexts about ordinary products, we can infer that people are asking about 
# recently purchased items and seeking information on their usage.
# There is particular concern regarding acid products, such as glycolic acid. Let's filter the question posts to determine their specific content

In [None]:
# Narrowing the brand posts down to those whose title contains 'question' (case-insensitive)
title_has_question = ordinary_posts['title'].str.contains('question', case=False)
ordinary_posts_product_question = ordinary_posts[title_has_question].copy()

In [None]:
ordinary_posts_product_question.info()

# Filtering the post titles by the word 'question' resulted in a total of 82 posts

In [None]:
# Stop words for the 'question' posts: the earlier list plus 'anyone' and the filter keyword itself
extra_stop2 = {
    'n', 'nk', 's', 'yes', 'welcome', 'm', 'deleted', 'thank',
    'skincareaddiction', 'skincare', 'hi', 'everyone', 'anyone',
    'use', 'product', 'ordinary', 'products', 'skin', 've', 'question',
}
stopwords = STOPWORDS | extra_stop2

cloud_analysis(ordinary_posts_product_question['title'])

#Indeed, we can confirm that the most common question among customers who have recently purchased Ordinary products pertains to acids,
# particularly glycolic acid. Let's explore the selftext to identify the specific questions being asked.




In [None]:
# Generating and plotting the Word Cloud for the selftext of the 'question' posts

cloud_analysis(ordinary_posts_product_question['selftext'])


# From the Word Cloud below, we can see that people who purchased acid products are struggling to understand the correct routine for their use.
# The customers have doubts about their proper application, in particular how these acids can be combined with other products
# such as peptides and retinol in the same routine.

In [None]:
# To confirm our hypothesis, we will filter the posts using the keyword 'Routine help' and analyze the titles and selftext of these posts

In [None]:
# Narrowing the brand posts down to those whose title contains 'routine' (case-insensitive)
title_has_routine = ordinary_posts['title'].str.contains('routine', case=False)
ordinary_posts_routine = ordinary_posts[title_has_routine].copy()

In [None]:
# Summary of the 'routine' posts (row count, columns, non-null counts)
ordinary_posts_routine.info()

In [None]:
# Stop words for the 'routine' posts: the previous list plus the filter keyword itself
extra_stop3 = {
    'n', 'nk', 's', 'yes', 'welcome', 'm', 'deleted', 'thank',
    'skincareaddiction', 'skincare', 'hi', 'everyone', 'anyone',
    'use', 'product', 'ordinary', 'products', 'skin', 've',
    'question', 'routine', 'routine help',
}
stopwords = STOPWORDS | extra_stop3

cloud_analysis(ordinary_posts_routine['title'])


In [None]:
# Word Cloud for the selftext of the 'routine' posts
cloud_analysis(ordinary_posts_routine['selftext'])

In [None]:
# Based on the two WordCloud analyses above, we can confirm our assumption: clients who recently purchased The Ordinary products primarily 
# have doubts about incorporating them into their daily skincare routine. Their concerns particularly revolve around acids (especially glycolic acid)
# and whether these can be used alongside Niacinamide, Peptides, and Retinol in the same regimen.
# It’s recommended to review the guidance on the official website and the packaging for usage instructions on acids in skincare routines

In [None]:
# Sentiment analysis on the comments of the posts that mention 'The Ordinary' brand in the title.
# The comments DataFrame is filtered to the rows whose post title contains the word 'ordinary' (case-insensitive).
comment_title_mentions_brand = df_comments['title'].str.contains('ordinary', case=False)
ordinary_comments = df_comments[comment_title_mentions_brand].copy()


In [None]:
# Scoring the text of every comment and storing the results in two new columns
add_sentiment_columns(ordinary_comments, 'comment', 'polarity comment', 'subjectivity comment')

In [None]:
# Displaying the comments together with their polarity / subjectivity columns
ordinary_comments


In [None]:
# Summary of the comments frame (row count, columns, non-null counts)
ordinary_comments.info()

In [None]:
# Plotting the distribution of the comment polarity
plot_polarity(ordinary_comments['polarity comment'], 'Distribution of Comments Polarity')


In [None]:
# The overall polarity distribution of the comment section, which contains over 4,000 comments, is positive

In [None]:
# What people say in the comments about The Ordinary brand

cloud_analysis(ordinary_comments['comment'])

# Overall, the sentiment is positive: across more than 4,000 comments, 'love' is the most frequently used word.
# The Word Cloud below reinforces this, indicating that people are generally happy with The Ordinary products.

In [None]:
# Counting how many brand posts were created on each date
posts_per_date = ordinary_posts.groupby('date').size()
posts_distribution = posts_per_date.reset_index(name='post_count')

In [None]:
# Line chart of the number of brand posts per creation date
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(posts_distribution['date'], posts_distribution['post_count'], marker='o', linestyle='-', color='b')
ax.set_title('Distribution of Posts by Date')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Posts')
ax.tick_params(axis='x', labelrotation=45)
ax.grid()
fig.tight_layout()
plt.show()

In [None]:
# During this year we can see the most interest in The Ordinary brand