In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings("ignore")
import numpy as np

In [2]:
df = pd.read_csv("c:/Class/ChatGPT/ChatGPT_Sentiment.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


# Data Preprcessing

In [4]:
df = df[["tweets","labels"]]

In [5]:
df

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad
...,...,...
219289,Other Software Projects Are Now Trying to Repl...,bad
219290,I asked #ChatGPT to write a #NYE Joke for SEOs...,good
219291,chatgpt is being disassembled until it can onl...,bad
219292,2023 predictions by #chatGPT. Nothing really s...,bad


In [6]:
print(df['tweets'].head())

0    ChatGPT: Optimizing Language Models for Dialog...
1    Try talking with ChatGPT, our new AI system wh...
2    ChatGPT: Optimizing Language Models for Dialog...
3    THRILLED to share that ChatGPT, our new model ...
4    As of 2 minutes ago, @OpenAI released their ne...
Name: tweets, dtype: object


In [7]:
# Remove Links from texts
df['tweet_list'] = df["tweets"].str.split('https:')

In [8]:
df.head()

Unnamed: 0,tweets,labels,tweet_list
0,ChatGPT: Optimizing Language Models for Dialog...,neutral,[ChatGPT: Optimizing Language Models for Dialo...
1,"Try talking with ChatGPT, our new AI system wh...",good,"[Try talking with ChatGPT, our new AI system w..."
2,ChatGPT: Optimizing Language Models for Dialog...,neutral,[ChatGPT: Optimizing Language Models for Dialo...
3,"THRILLED to share that ChatGPT, our new model ...",good,"[THRILLED to share that ChatGPT, our new model..."
4,"As of 2 minutes ago, @OpenAI released their ne...",bad,"[As of 2 minutes ago, @OpenAI released their n..."


In [9]:
text = [i[0] for i in df.tweet_list]

In [10]:
df['text'] = text

In [11]:
df = df[["text","labels"]]

In [12]:
df.head()

Unnamed: 0,text,labels
0,ChatGPT: Optimizing Language Models for Dialogue,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialogue,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [13]:
import re
str = r'[A-Za-z0-9 ]'
trim_lst = []

for row in text:
    s =''
    for letter in row:
        if bool(re.match(str, letter)):
            s+=letter
    trim_lst.append(s)

In [14]:
# Remove non-Printing Characters from text
rep_list = ['\U0001fae1', '\\n', '@', '#', '\xa0', '***']

for i in trim_lst:
    for j in rep_list:
        if j in i:
            i.replace(j,'')

In [15]:
df['text'] = trim_lst

In [16]:
df.head()

Unnamed: 0,text,labels
0,ChatGPT Optimizing Language Models for Dialogue,neutral
1,Try talking with ChatGPT our new AI system whi...,good
2,ChatGPT Optimizing Language Models for Dialogue,neutral
3,THRILLED to share that ChatGPT our new model o...,good
4,As of 2 minutes ago OpenAI released their new ...,bad


In [17]:
# Map the labels to integers
# 1 for good 
# 0 for neutral
# -1 bad

df['lab_int'] = np.where(df['labels']=='good', 1, np.where(df['labels']=='bad', -1, 0))

In [18]:
df.head()

Unnamed: 0,text,labels,lab_int
0,ChatGPT Optimizing Language Models for Dialogue,neutral,0
1,Try talking with ChatGPT our new AI system whi...,good,1
2,ChatGPT Optimizing Language Models for Dialogue,neutral,0
3,THRILLED to share that ChatGPT our new model o...,good,1
4,As of 2 minutes ago OpenAI released their new ...,bad,-1


In [19]:
#import libraries 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_score, accuracy_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['lab_int'], test_size=0.3, random_state=1)

In [21]:
vec = CountVectorizer(
    ngram_range=(1, 3), 
    stop_words="english",
)

X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

In [22]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          -1       0.76      0.93      0.83     32288
           0       0.72      0.29      0.41     16710
           1       0.66      0.77      0.71     16791

    accuracy                           0.72     65789
   macro avg       0.71      0.66      0.65     65789
weighted avg       0.72      0.72      0.69     65789



In [23]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

preds = lr.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          -1       0.87      0.94      0.90     32288
           0       0.74      0.65      0.69     16710
           1       0.83      0.79      0.81     16791

    accuracy                           0.83     65789
   macro avg       0.81      0.79      0.80     65789
weighted avg       0.83      0.83      0.83     65789



# Hyperparameter Tunning

In [24]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

In [25]:
# Hyperparameter tuning for Multinomial Naive Bayes model

param_grid = {"alpha": [0.1,0,1.0, 10, 100]}

grid_search = GridSearchCV(MultinomialNB(), param_grid, verbose=2)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ..........................................alpha=0.1; total time=   0.4s
[CV] END ..........................................alpha=0.1; total time=   0.3s
[CV] END ..........................................alpha=0.1; total time=   0.5s
[CV] END ..........................................alpha=0.1; total time=   0.4s
[CV] END ..........................................alpha=0.1; total time=   0.4s
[CV] END ............................................alpha=0; total time=   0.4s
[CV] END ............................................alpha=0; total time=   0.3s
[CV] END ............................................alpha=0; total time=   0.3s
[CV] END ............................................alpha=0; total time=   0.3s
[CV] END ............................................alpha=0; total time=   0.4s
[CV] END ..........................................alpha=1.0; total time=   0.4s
[CV] END ........................................

GridSearchCV(estimator=MultinomialNB(),
             param_grid={'alpha': [0.1, 0, 1.0, 10, 100]}, verbose=2)

In [26]:
grid_search.best_params_

{'alpha': 1.0}