# Analisis Sentimen Opini Masyarakat terhadap ChatGPT sebagai Aplikasi Natural Language Processing

In [8]:
import pandas as pd

# Read the tweet dataset from disk, then preview ten rows drawn with a fixed
# seed so the preview is the same on every run.
dataset_path = 'dataset/chatgpt_tweets_dataset.csv'
chatgpt_tweets_df = pd.read_csv(dataset_path)
chatgpt_tweets_df.sample(n=10, random_state=8)

Unnamed: 0,date,user_name,text,sentiment
34010,2022-12-26 07:40:17+00:00,Ario Jafarzadeh,"This past week, I used #ChatGPT to help me: up...",Positive
4318,2023-03-27 13:01:36+00:00,Deqode,Let us know if we missed anything.\n\nVisit ht...,Negative
7746,2023-02-07 17:12:40+00:00,Gerard Dusastre,"#Google unveils #Bard, , its #ChatGPT rival, o...",Negative
21560,2023-01-29 19:08:28+00:00,Stainless Willie,#ChatGPT to the Moon\n ...,Neutral
28377,2023-01-03 06:42:01+00:00,UVisible,Unlock #SEO success with #ChatGPT! Easily gene...,Positive
17780,2023-03-10 16:43:08+00:00,WunThompsonINTL,In this week's roundup: @VirginAtlantic zero-e...,Neutral
32499,2023-02-15 05:45:51+00:00,Andre Pawlowski,With all the new hype around AI/ML through #Ch...,Positive
21232,2023-04-08 14:54:39+00:00,Raoul Galt,How long before #ChatGPT comes preinstalled on...,Neutral
27354,2023-04-19 17:35:11+00:00,Boston Healthcare Cloud Community,"What ""strikes"" the perfect chord for healthcar...",Positive
21794,2023-04-06 19:44:33+00:00,Klehr Harrison Harvey Branzburg LLP,In this edition of his #Startups &amp; #Entrep...,Neutral


In [9]:
# Draw 5000 tweets per sentiment class (fixed seed) so the three classes are
# balanced; the per-class frames keep their original names.
positive, neutral, negative = (
    chatgpt_tweets_df[chatgpt_tweets_df['sentiment'] == label].sample(5000, random_state=8)
    for label in ('Positive', 'Neutral', 'Negative')
)

# Stack the three class samples, shuffle all rows with the same seed, and
# renumber the index from 0.
balanced_tweets = pd.concat([positive, neutral, negative])
chatgpt_tweets_df = balanced_tweets.sample(frac=1, random_state=8).reset_index(drop=True)

In [10]:
# Show the first five rows of the balanced, shuffled frame.
chatgpt_tweets_df.head(n=5)

Unnamed: 0,date,user_name,text,sentiment
0,2022-12-06 10:57:04+00:00,Abhinay Bhasin,Who called it #googlesearch and not #ChatGPT :p,Negative
1,2023-04-10 12:45:56+00:00,DiKayo | dikayo.eth ♠️❤️♣️♦️,@pwang I see you’re a fan of #ChatGPT :),Positive
2,2023-04-05 19:01:28+00:00,GanWeaving 🐦,"Toying around with #p5js, #midjourney and #cha...",Positive
3,2023-04-09 19:37:05+00:00,Dan Brunelle 5.0,Here’s why ChatGPT requires a phone number to ...,Neutral
4,2023-02-03 01:13:24+00:00,Timothy Karera,#ChatGPT may be coming for our jobs. Here are ...,Positive


In [11]:
# Count rows per sentiment label to confirm the classes are balanced.
chatgpt_tweets_df.loc[:, 'sentiment'].value_counts()

Negative    5000
Positive    5000
Neutral     5000
Name: sentiment, dtype: int64

## Data Preparation and Preprocessing

### Text Filtering and Case Folding

In [12]:
import re
import html

def remove_html_escape(text):
    """Decode HTML character references in ``text``.

    Entities such as ``&amp;`` or ``&lt;`` are converted back to the
    characters they stand for via ``html.unescape``.

    Args:
        text: The string to decode.

    Returns:
        The decoded string.
    """
    decoded = html.unescape(text)
    return decoded

def remove_escape_sequence(text):
    """Replace newlines, tabs, carriage returns and backslashes with spaces.

    Each matched character is swapped for exactly one space, so the length of
    the string is unchanged.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``\\n``, ``\\t``, ``\\r`` and backslash character
        replaced by a single space.
    """
    return re.sub(r'[\n\t\r\\]', ' ', text)

def remove_emojis(text):
    """Strip emoji and pictographic symbols from ``text``.

    In addition to the emoticon, pictograph, transport and flag blocks, this
    also covers the supplemental pictograph blocks (e.g. the robot face,
    U+1F916), miscellaneous symbols and dingbats (card suits, hearts), and the
    emoji variation selector U+FE0F that would otherwise be left behind as an
    invisible character.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of matched characters deleted (nothing is
        inserted in their place).
    """
    emoji_pattern = re.compile('['
                               '\U0001F600-\U0001F64F'  # emoticons
                               '\U0001F300-\U0001F5FF'  # symbols & pictographs
                               '\U0001F680-\U0001F6FF'  # transport & map symbols
                               '\U0001F1E0-\U0001F1FF'  # flags (regional indicators)
                               '\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
                               '\U0001FA70-\U0001FAFF'  # symbols & pictographs extended-A
                               '\u2600-\u26FF'          # miscellaneous symbols (card suits, ...)
                               '\u2700-\u27BF'          # dingbats (heavy heart, ...)
                               '\uFE0F'                 # emoji variation selector-16
                               ']+')
    return emoji_pattern.sub('', text)

def remove_links(text):
    """Delete URLs from ``text``.

    Two forms are matched: ``http://`` / ``https://`` URLs, and bare
    ``www``-prefixed hosts made of two or three dotted labels optionally
    followed by a single alphanumeric path segment.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched link removed; surrounding whitespace is
        left untouched.
    """
    return re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www(?:\.[a-zA-Z0-9-]+){2,3}(?:/[a-zA-Z0-9]+)?',
        '',
        text,
    )

def remove_slashed_dashed(text):
    """Turn every ``/`` and ``-`` in ``text`` into a space.

    This keeps the words on either side of the separator apart instead of
    gluing them together.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each slash or hyphen replaced by one space.
    """
    return re.sub(r'[/-]', ' ', text)

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters follow the regex ``\\w`` definition, so letters, digits and
    underscores are kept.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all runs of non-word, non-whitespace characters removed.
    """
    return re.sub(r'[^\w\s]+', '', text)

# Cleaning steps, applied in this exact order: entities are decoded first so
# the later steps see real characters, and links are removed before slashes
# and punctuation are stripped so the URL pattern can still match.
text_cleaners = (
    remove_html_escape,
    remove_escape_sequence,
    remove_emojis,
    remove_links,
    remove_slashed_dashed,
    remove_punctuation,
)

cleaned_text = chatgpt_tweets_df['text']
for cleaner in text_cleaners:
    cleaned_text = cleaned_text.apply(cleaner)

# Collapse runs of whitespace left behind by the removals, then case-fold.
cleaned_text = cleaned_text.str.replace(r'\s{2,}', ' ', regex=True).str.lower()
chatgpt_tweets_df['text'] = cleaned_text

### Tokenization

In [13]:
from nltk.tokenize import word_tokenize

# Split each cleaned tweet into a list of tokens.
chatgpt_tweets_df['text'] = chatgpt_tweets_df['text'].apply(word_tokenize)

### Remove Stopwords

In [14]:
from nltk.corpus import stopwords

# Load the English stopword list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension rebuilt the list for
# every single token and then scanned it linearly, which made this cell
# needlessly slow; a set gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))

# Drop every token that is an English stopword; token order is preserved.
chatgpt_tweets_df['text'] = chatgpt_tweets_df['text'].apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords])

### Stemming

In [15]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()


def _stem_tokens(tokens):
    """Return a new list holding the Porter stem of each token, in order."""
    return list(map(ps.stem, tokens))


# Replace each tweet's token list with its stemmed counterpart.
chatgpt_tweets_df['text'] = chatgpt_tweets_df['text'].apply(_stem_tokens)

### Vector Conversion

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# CountVectorizer expects one string per document, so join each token list
# back into a space-separated string.
documents = chatgpt_tweets_df['text'].apply(' '.join)

# Sparse document-term count matrix (one row per tweet).
count_matrix = vectorizer.fit_transform(documents)

# Densify the whole matrix in one call instead of slicing and converting the
# sparse matrix row by row in a Python loop. `vectors` stays a list of 1-D
# row arrays, so later cells that index, iterate or np.asarray() it see the
# same values as before.
vectors = list(count_matrix.toarray())

In [17]:
import numpy as np

def _encode_sentiment(label):
    """Map a sentiment label to its integer class id.

    'Negative' -> 0, 'Positive' -> 1, 'Neutral' -> 2. Any other value is
    returned unchanged.
    """
    if label == 'Negative':
        return 0
    if label == 'Positive':
        return 1
    if label == 'Neutral':
        return 2
    return label


# Feature matrix: one row of token counts per tweet.
X = np.asarray(vectors)

# Target vector: integer-encoded sentiment labels, aligned with the rows of X.
encoded_sentiment = chatgpt_tweets_df['sentiment'].apply(_encode_sentiment)
y = np.asarray(encoded_sentiment)

# Persist both arrays as compressed .npz archives in the working directory.
np.savez_compressed('X.npz', X)
np.savez_compressed('y.npz', y)