# Analisis Sentimen Opini Masyarakat terhadap ChatGPT sebagai Aplikasi Natural Language Processing

In [1]:
import pandas as pd

chatgpt_tweets_df = pd.read_csv('dataset/chatgpt_tweets_dataset.csv')
chatgpt_tweets_df.sample(10, random_state=8)

Unnamed: 0,date,user_name,text,sentiment
12283,2023-03-24 08:13:49+00:00,KaEl,"#ChatGPT third-Party plugins, the „eyes and ea...",Neutral
1986,2023-02-07 16:25:07+00:00,Piyush Tiwari,OpenAI: #ChatGPT is the future!\n\nGoogle: hol...,Neutral
3974,2022-12-10 12:37:15+00:00,Poonam Soni,"@wahVinci Also there is #ChatGPT now, 👀\nGivin...",Positive
9641,2023-03-10 16:30:55+00:00,Stefanescu Liviu,Social media comments made with #Chatgpt #AI. ...,Negative
4517,2023-02-12 15:05:43+00:00,BeryBearishBear,some of leading marijuana companies.... accord...,Neutral
13155,2023-04-25 13:53:17+00:00,Temerty Centre for AI in Medicine (T-CAIREM),How #ArtificialIntelligence Like #ChatGPT Is I...,Positive
14568,2022-12-08 20:55:36+00:00,jhermes,Space &amp; Time.\nTuring Test: failed. #ChatG...,Negative
9872,2023-02-27 16:20:47+00:00,Dr. Hilary Murray,Excellent panel on #ChatGPT held by our #Trust...,Positive
1157,2023-04-06 08:43:02+00:00,AbandonHope,surely there should be settings&gt; share data...,Positive
10830,2023-04-24 21:51:39+00:00,azlef900 ✨👽🌙,"New youtube series in the works ""Deconstructin...",Positive


## Data Preparation and Preprocessing

### Text Filtering and Case Folding

In [2]:
import re
import html

def remove_html_escape(text):
    return html.unescape(text)

def remove_escape_sequence(text):
    escape_sequence_pattern = re.compile(r'[\n\t\r\\]')
    return escape_sequence_pattern.sub(' ', text)

def remove_emojis(text):
    emoji_pattern = re.compile('['
                              u'\U0001F600-\U0001F64F'  # emoticons
                              u'\U0001F300-\U0001F5FF'  # symbols & pictographs
                              u'\U0001F680-\U0001F6FF'  # transport & map symbols
                              u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
                              ']+', flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_username(text):
    username_pattern = re.compile(r'@\w+')
    return username_pattern.sub('', text)

def remove_hashtag(text):
    hashtag_pattern = re.compile(r'#[\w\d]+')
    return hashtag_pattern.sub('', text)

def remove_digits(text):
    digits_pattern = re.compile(r'[\w+]\d+[\w+]')
    return digits_pattern.sub('', text)

def remove_possession(text):
    possession_pattern = re.compile(r'\'s')
    return possession_pattern.sub(' is', text)

def remove_links(text):
    link_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www(?:\.[a-zA-Z0-9-]+){2,3}(?:/[a-zA-Z0-9]+)?')
    return link_pattern.sub('', text)

def remove_slashed_dashed(text):
    slashed_dashed_pattern = re.compile(r'[/-]')
    return slashed_dashed_pattern.sub(' ', text)

def remove_punctuation(text):
    punctuation_pattern = re.compile(r'[^\w\s]+')
    return punctuation_pattern.sub('', text)

def remove_one_character(text):
    one_character_pattern = re.compile(r'\b\w\b')
    return one_character_pattern.sub(' ', text)

chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['text'].str.lower()
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_html_escape)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_escape_sequence)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_emojis)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_username)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_hashtag)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_digits)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_possession)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_links)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_slashed_dashed)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_punctuation)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].apply(remove_one_character)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].str.replace(r'\s{2,}', ' ', regex=True)
chatgpt_tweets_df['filtered_text'] = chatgpt_tweets_df['filtered_text'].str.strip()

In [3]:
chatgpt_tweets_df[['text', 'filtered_text']].sample(10, random_state=8)

Unnamed: 0,text,filtered_text
12283,"#ChatGPT third-Party plugins, the „eyes and ea...",third party plugins the eyes and ears for larg...
1986,OpenAI: #ChatGPT is the future!\n\nGoogle: hol...,openai is the future google hold my
3974,"@wahVinci Also there is #ChatGPT now, 👀\nGivin...",also there is now giving code directly
9641,Social media comments made with #Chatgpt #AI. ...,social media comments made with the future wil...
4517,some of leading marijuana companies.... accord...,some of leading marijuana companies according ...
13155,How #ArtificialIntelligence Like #ChatGPT Is I...,how like is influencing medical diagnoses
14568,Space &amp; Time.\nTuring Test: failed. #ChatG...,space time turing test failed
9872,Excellent panel on #ChatGPT held by our #Trust...,excellent panel on held by our team looking at...
1157,surely there should be settings&gt; share data...,surely there should be settings share data and...
10830,"New youtube series in the works ""Deconstructin...",new youtube series in the works deconstructing...


### Tokenization

In [4]:
chatgpt_tweets_df.iloc[1157]['filtered_text']

'surely there should be settings share data and make it for private mode'

In [5]:
from nltk.tokenize import word_tokenize

chatgpt_tweets_df['tokenized'] = chatgpt_tweets_df['filtered_text'].apply(
    lambda x: word_tokenize(x))

In [6]:
chatgpt_tweets_df[['text', 'filtered_text', 'tokenized']].sample(10, random_state=8)

Unnamed: 0,text,filtered_text,tokenized
12283,"#ChatGPT third-Party plugins, the „eyes and ea...",third party plugins the eyes and ears for larg...,"[third, party, plugins, the, eyes, and, ears, ..."
1986,OpenAI: #ChatGPT is the future!\n\nGoogle: hol...,openai is the future google hold my,"[openai, is, the, future, google, hold, my]"
3974,"@wahVinci Also there is #ChatGPT now, 👀\nGivin...",also there is now giving code directly,"[also, there, is, now, giving, code, directly]"
9641,Social media comments made with #Chatgpt #AI. ...,social media comments made with the future wil...,"[social, media, comments, made, with, the, fut..."
4517,some of leading marijuana companies.... accord...,some of leading marijuana companies according ...,"[some, of, leading, marijuana, companies, acco..."
13155,How #ArtificialIntelligence Like #ChatGPT Is I...,how like is influencing medical diagnoses,"[how, like, is, influencing, medical, diagnoses]"
14568,Space &amp; Time.\nTuring Test: failed. #ChatG...,space time turing test failed,"[space, time, turing, test, failed]"
9872,Excellent panel on #ChatGPT held by our #Trust...,excellent panel on held by our team looking at...,"[excellent, panel, on, held, by, our, team, lo..."
1157,surely there should be settings&gt; share data...,surely there should be settings share data and...,"[surely, there, should, be, settings, share, d..."
10830,"New youtube series in the works ""Deconstructin...",new youtube series in the works deconstructing...,"[new, youtube, series, in, the, works, deconst..."


### Remove Stopwords

In [8]:
from nltk.corpus import stopwords

chatgpt_tweets_df['removed_stopwords'] = chatgpt_tweets_df['tokenized'].apply(
    lambda x: [word for word in x if word not in stopwords.words('english')])

In [9]:
chatgpt_tweets_df[['text', 'filtered_text', 'tokenized', 'removed_stopwords']].sample(10, random_state=8)

Unnamed: 0,text,filtered_text,tokenized,removed_stopwords
12283,"#ChatGPT third-Party plugins, the „eyes and ea...",third party plugins the eyes and ears for larg...,"[third, party, plugins, the, eyes, and, ears, ...","[third, party, plugins, eyes, ears, large, lan..."
1986,OpenAI: #ChatGPT is the future!\n\nGoogle: hol...,openai is the future google hold my,"[openai, is, the, future, google, hold, my]","[openai, future, google, hold]"
3974,"@wahVinci Also there is #ChatGPT now, 👀\nGivin...",also there is now giving code directly,"[also, there, is, now, giving, code, directly]","[also, giving, code, directly]"
9641,Social media comments made with #Chatgpt #AI. ...,social media comments made with the future wil...,"[social, media, comments, made, with, the, fut...","[social, media, comments, made, future, 99, spam]"
4517,some of leading marijuana companies.... accord...,some of leading marijuana companies according ...,"[some, of, leading, marijuana, companies, acco...","[leading, marijuana, companies, according, im,..."
13155,How #ArtificialIntelligence Like #ChatGPT Is I...,how like is influencing medical diagnoses,"[how, like, is, influencing, medical, diagnoses]","[like, influencing, medical, diagnoses]"
14568,Space &amp; Time.\nTuring Test: failed. #ChatG...,space time turing test failed,"[space, time, turing, test, failed]","[space, time, turing, test, failed]"
9872,Excellent panel on #ChatGPT held by our #Trust...,excellent panel on held by our team looking at...,"[excellent, panel, on, held, by, our, team, lo...","[excellent, panel, held, team, looking, data, ..."
1157,surely there should be settings&gt; share data...,surely there should be settings share data and...,"[surely, there, should, be, settings, share, d...","[surely, settings, share, data, make, private,..."
10830,"New youtube series in the works ""Deconstructin...",new youtube series in the works deconstructing...,"[new, youtube, series, in, the, works, deconst...","[new, youtube, series, works, deconstructing, ..."


### Stemming

In [10]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
chatgpt_tweets_df['stemmed'] = chatgpt_tweets_df['removed_stopwords'].apply(
    lambda x: [ps.stem(word) for word in x])

In [11]:
chatgpt_tweets_df[['text', 'filtered_text', 'tokenized', 'removed_stopwords', 'stemmed']].sample(10, random_state=8)

Unnamed: 0,text,filtered_text,tokenized,removed_stopwords,stemmed
12283,"#ChatGPT third-Party plugins, the „eyes and ea...",third party plugins the eyes and ears for larg...,"[third, party, plugins, the, eyes, and, ears, ...","[third, party, plugins, eyes, ears, large, lan...","[third, parti, plugin, eye, ear, larg, languag..."
1986,OpenAI: #ChatGPT is the future!\n\nGoogle: hol...,openai is the future google hold my,"[openai, is, the, future, google, hold, my]","[openai, future, google, hold]","[openai, futur, googl, hold]"
3974,"@wahVinci Also there is #ChatGPT now, 👀\nGivin...",also there is now giving code directly,"[also, there, is, now, giving, code, directly]","[also, giving, code, directly]","[also, give, code, directli]"
9641,Social media comments made with #Chatgpt #AI. ...,social media comments made with the future wil...,"[social, media, comments, made, with, the, fut...","[social, media, comments, made, future, 99, spam]","[social, media, comment, made, futur, 99, spam]"
4517,some of leading marijuana companies.... accord...,some of leading marijuana companies according ...,"[some, of, leading, marijuana, companies, acco...","[leading, marijuana, companies, according, im,...","[lead, marijuana, compani, accord, im, bullish]"
13155,How #ArtificialIntelligence Like #ChatGPT Is I...,how like is influencing medical diagnoses,"[how, like, is, influencing, medical, diagnoses]","[like, influencing, medical, diagnoses]","[like, influenc, medic, diagnos]"
14568,Space &amp; Time.\nTuring Test: failed. #ChatG...,space time turing test failed,"[space, time, turing, test, failed]","[space, time, turing, test, failed]","[space, time, ture, test, fail]"
9872,Excellent panel on #ChatGPT held by our #Trust...,excellent panel on held by our team looking at...,"[excellent, panel, on, held, by, our, team, lo...","[excellent, panel, held, team, looking, data, ...","[excel, panel, held, team, look, data, futur, ..."
1157,surely there should be settings&gt; share data...,surely there should be settings share data and...,"[surely, there, should, be, settings, share, d...","[surely, settings, share, data, make, private,...","[sure, set, share, data, make, privat, mode]"
10830,"New youtube series in the works ""Deconstructin...",new youtube series in the works deconstructing...,"[new, youtube, series, in, the, works, deconst...","[new, youtube, series, works, deconstructing, ...","[new, youtub, seri, work, deconstruct, scienti..."


### Vector Conversion

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(chatgpt_tweets_df['stemmed'].apply(
    lambda x: ' '.join(x)))
vectors = [vector.toarray()[0] for vector in vectors]
chatgpt_tweets_df['vectors'] = vectors

In [13]:
chatgpt_tweets_df[['text', 'filtered_text', 'tokenized', 'removed_stopwords', 'stemmed', 'vectors']].sample(10, random_state=8)

Unnamed: 0,text,filtered_text,tokenized,removed_stopwords,stemmed,vectors
12283,"#ChatGPT third-Party plugins, the „eyes and ea...",third party plugins the eyes and ears for larg...,"[third, party, plugins, the, eyes, and, ears, ...","[third, party, plugins, eyes, ears, large, lan...","[third, parti, plugin, eye, ear, larg, languag...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1986,OpenAI: #ChatGPT is the future!\n\nGoogle: hol...,openai is the future google hold my,"[openai, is, the, future, google, hold, my]","[openai, future, google, hold]","[openai, futur, googl, hold]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3974,"@wahVinci Also there is #ChatGPT now, 👀\nGivin...",also there is now giving code directly,"[also, there, is, now, giving, code, directly]","[also, giving, code, directly]","[also, give, code, directli]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9641,Social media comments made with #Chatgpt #AI. ...,social media comments made with the future wil...,"[social, media, comments, made, with, the, fut...","[social, media, comments, made, future, 99, spam]","[social, media, comment, made, futur, 99, spam]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4517,some of leading marijuana companies.... accord...,some of leading marijuana companies according ...,"[some, of, leading, marijuana, companies, acco...","[leading, marijuana, companies, according, im,...","[lead, marijuana, compani, accord, im, bullish]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13155,How #ArtificialIntelligence Like #ChatGPT Is I...,how like is influencing medical diagnoses,"[how, like, is, influencing, medical, diagnoses]","[like, influencing, medical, diagnoses]","[like, influenc, medic, diagnos]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14568,Space &amp; Time.\nTuring Test: failed. #ChatG...,space time turing test failed,"[space, time, turing, test, failed]","[space, time, turing, test, failed]","[space, time, ture, test, fail]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9872,Excellent panel on #ChatGPT held by our #Trust...,excellent panel on held by our team looking at...,"[excellent, panel, on, held, by, our, team, lo...","[excellent, panel, held, team, looking, data, ...","[excel, panel, held, team, look, data, futur, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1157,surely there should be settings&gt; share data...,surely there should be settings share data and...,"[surely, there, should, be, settings, share, d...","[surely, settings, share, data, make, private,...","[sure, set, share, data, make, privat, mode]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10830,"New youtube series in the works ""Deconstructin...",new youtube series in the works deconstructing...,"[new, youtube, series, in, the, works, deconst...","[new, youtube, series, works, deconstructing, ...","[new, youtub, seri, work, deconstruct, scienti...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
import numpy as np

X = np.asarray(vectors)
y = np.asarray(chatgpt_tweets_df['sentiment'].apply(
    lambda x: 0 if x == 'Negative' else 1 if x == 'Neutral' else 2 if x == 'Positive' else x))

np.savez_compressed('X.npz', X)
np.savez_compressed('y.npz', y)