<a href="https://colab.research.google.com/github/uknowmeCREED/MahcineLearning/blob/main/ChatGPT%20Tweet%20Sentiment%20anlysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ChatGPT Tweets Sentiment Analysis.

## Table of Contents
* Data Preprocessing
* Data Visualization
* Model Selection
* Hyperparameter tuning

In [33]:
# Import data processing modules

import pandas as pd
import numpy as np

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
df = pd.read_csv("/content/drive/MyDrive/Data/file[1].csv", nrows=50000)


In [36]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


## Data Preprocessing

In [37]:
# Remove the 'Unnamed: 0' column

df = df[['tweets', 'labels']]

In [38]:
df.head()

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [39]:
# View the first 10 rows in the tweets column

for i in df.tweets.head(10):
    print(i)
    print()

ChatGPT: Optimizing Language Models for Dialogue https://t.co/K9rKRygYyn @OpenAI

Try talking with ChatGPT, our new AI system which is optimized for dialogue. Your feedback will help us improve it. https://t.co/sHDm57g3Kr

ChatGPT: Optimizing Language Models for Dialogue https://t.co/GLEbMoKN6w #AI #MachineLearning #DataScience #ArtificialIntelligence\n\nTrending AI/ML Article Identified &amp; Digested via Granola; a Machine-Driven RSS Bot by Ramsey Elbasheer https://t.co/RprmAXUp34

THRILLED to share that ChatGPT, our new model optimized for dialog, is now public, free, and accessible to everyone. https://t.co/dyvtHecYbd https://t.co/DdhzhqhCBX https://t.co/l8qTLure71

As of 2 minutes ago, @OpenAI released their new ChatGPT. \n\nAnd you can use it right now 👇 https://t.co/VyPGPNw988 https://t.co/cSn5h6h1M1

Just launched ChatGPT, our new AI system which is optimized for dialogue: https://t.co/ArX6m0FfLE.\n\nTry it out here: https://t.co/YM1gp5bA64

As of 2 minutes ago, @OpenAI release

In [40]:
# Remove all the tweet links since they all begin with https:

df['tweet_list'] = df['tweets'].str.split('https:')

In [41]:
df.head()

Unnamed: 0,tweets,labels,tweet_list
0,ChatGPT: Optimizing Language Models for Dialog...,neutral,[ChatGPT: Optimizing Language Models for Dialo...
1,"Try talking with ChatGPT, our new AI system wh...",good,"[Try talking with ChatGPT, our new AI system w..."
2,ChatGPT: Optimizing Language Models for Dialog...,neutral,[ChatGPT: Optimizing Language Models for Dialo...
3,"THRILLED to share that ChatGPT, our new model ...",good,"[THRILLED to share that ChatGPT, our new model..."
4,"As of 2 minutes ago, @OpenAI released their ne...",bad,"[As of 2 minutes ago, @OpenAI released their n..."


In [42]:
# Select the text part of the list

text = [i[0] for i in df.tweet_list]

In [43]:
df['text'] = text

In [44]:
df = df[['text', 'labels']]

In [45]:
df.head()

Unnamed: 0,text,labels
0,ChatGPT: Optimizing Language Models for Dialogue,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialogue,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [46]:
# Import re for string processing
import re


In [47]:
# Remove all non-alphanumeric characters from the text list

string = r'[A-Za-z0-9 ]'

trim_list=[]

for row in text:
    s=''
    for letter in row:
        if bool(re.match(string, letter)):
            s+=letter
    trim_list.append(s)

In [48]:
# Remove the non-printing characters from text

rep_list = ['\U0001fae1', '\\n', '@', '#', '\xa0', '***']

for i in trim_list:
    for j in rep_list:
        if j in i:
            i.replace(j,'')

In [49]:
df['text'] = trim_list

In [50]:
df.head()

Unnamed: 0,text,labels
0,ChatGPT Optimizing Language Models for Dialogue,neutral
1,Try talking with ChatGPT our new AI system whi...,good
2,ChatGPT Optimizing Language Models for Dialogue,neutral
3,THRILLED to share that ChatGPT our new model o...,good
4,As of 2 minutes ago OpenAI released their new ...,bad


## Data Visualization with word cloud

In [51]:
pip install wordcloud



In [52]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image

comment_words = ''
stopwords = set(STOPWORDS)


for val in df.text:

    # typecaste each val to string
    val = str(val)

    # split the value
    tokens = val.split()

    # Converts each token into lowercase
    for i in range(len(tokens)):
        tokens[i] = tokens[i].lower()

    comment_words += " ".join(tokens)+" "

mask = np.array(Image.open("/kaggle/input/masks/comment.png"))

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='pink',
                stopwords = stopwords,
                min_font_size = 10, mask=mask).generate(comment_words)

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

FileNotFoundError: ignored

## Model Selection

In [53]:
# Map the labels to integers
# 1 for good tweet
# 0 for neutral tweet
# -1 for bad tweet

df['lab_int'] = np.where(df['labels']=='good', 1, np.where(df['labels']=='bad', -1, 0))

In [54]:
# Import sci-kit learn models for classification

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_score, accuracy_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['labels'], test_size=0.3, random_state=1)

In [56]:
vec = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
)

X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

# Naive Bayes

In [57]:
from sklearn.naive_bayes import MultinomialNB

In [58]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         bad       0.74      0.90      0.81      7246
        good       0.61      0.77      0.68      3941
     neutral       0.65      0.21      0.32      3813

    accuracy                           0.69     15000
   macro avg       0.67      0.63      0.60     15000
weighted avg       0.68      0.69      0.65     15000



# Logistic Regression

In [59]:
log = LogisticRegression()
log.fit(X_train, y_train)

preds = log.predict(X_test)
print(classification_report(y_test, preds))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

         bad       0.80      0.94      0.86      7246
        good       0.79      0.70      0.74      3941
     neutral       0.64      0.51      0.57      3813

    accuracy                           0.77     15000
   macro avg       0.75      0.72      0.72     15000
weighted avg       0.76      0.77      0.76     15000



## Hyperparameter tuning

In [60]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

In [61]:
# Hyperparameter tuning for Multinomial Naive Bayes model

param_grid = {"alpha": [0.1,0,1.0, 10, 100]}

grid_search = GridSearchCV(MultinomialNB(), param_grid, verbose=2)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ..........................................alpha=0.1; total time=   0.1s
[CV] END ..........................................alpha=0.1; total time=   0.1s
[CV] END ..........................................alpha=0.1; total time=   0.1s
[CV] END ..........................................alpha=0.1; total time=   0.1s
[CV] END ..........................................alpha=0.1; total time=   0.1s




[CV] END ............................................alpha=0; total time=   0.1s
[CV] END ............................................alpha=0; total time=   0.1s




[CV] END ............................................alpha=0; total time=   0.1s
[CV] END ............................................alpha=0; total time=   0.1s




[CV] END ............................................alpha=0; total time=   0.1s
[CV] END ..........................................alpha=1.0; total time=   0.1s
[CV] END ..........................................alpha=1.0; total time=   0.1s
[CV] END ..........................................alpha=1.0; total time=   0.1s
[CV] END ..........................................alpha=1.0; total time=   0.1s
[CV] END ..........................................alpha=1.0; total time=   0.1s
[CV] END ...........................................alpha=10; total time=   0.1s
[CV] END ...........................................alpha=10; total time=   0.1s
[CV] END ...........................................alpha=10; total time=   0.1s
[CV] END ...........................................alpha=10; total time=   0.1s
[CV] END ...........................................alpha=10; total time=   0.1s
[CV] END ..........................................alpha=100; total time=   0.1s
[CV] END ...................

In [62]:
grid_search.best_params_

{'alpha': 10}

In [63]:
# Evaluate the model using various evaluation measures
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='weighted')
recall = recall_score(y_test, preds, average='weighted')
f1 = f1_score(y_test, preds, average='weighted')


print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 score: {:.3f}".format(f1))


Accuracy: 0.766
Precision: 0.758
Recall: 0.766
F1 score: 0.756


In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.svm import LinearSVC

# Create a LinearSVC model and fit it to the training data
svc = LinearSVC()
svc.fit(X_train, y_train)

# Make predictions on the test data using the SVC model
svc_preds = svc.predict(X_test)

# Evaluate the performance of the SVC model using various metrics
svc_acc = accuracy_score(y_test, svc_preds)
svc_precision = precision_score(y_test, svc_preds, average='weighted')
svc_recall = recall_score(y_test, svc_preds, average='weighted')
svc_f1 = f1_score(y_test, svc_preds, average='weighted')

# Create a table to display the performance of all three models
data = {'Model': ['Naive Bayes', 'Logistic Regression', 'Support Vector Machine'],
        'Accuracy': [accuracy_score(y_test, preds), accuracy_score(y_test, preds), svc_acc],
        'Precision': [precision_score(y_test, preds, average='weighted'), precision_score(y_test, preds, average='weighted'), svc_precision],
        'Recall': [recall_score(y_test, preds, average='weighted'), recall_score(y_test, preds, average='weighted'), svc_recall],
        'F1 Score': [f1_score(y_test, preds, average='weighted'), f1_score(y_test, preds, average='weighted'), svc_f1]}
results_df = pd.DataFrame(data)

# Display the results table
print(results_df)


                    Model  Accuracy  Precision    Recall  F1 Score
0             Naive Bayes  0.765667   0.757613  0.765667  0.756282
1     Logistic Regression  0.765667   0.757613  0.765667  0.756282
2  Support Vector Machine  0.752733   0.742620  0.752733  0.743302


# Ensenmble Learning Algortihms.
## Bagging ensemble

# ANN Algorithms
* Perceptron
* Multilayer perceptron


In [65]:
# Import necessary libraries
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score




In [66]:
# Train and evaluate a Perceptron
perceptron = Perceptron()
perceptron.fit(X_train, y_train)
perceptron_pred = perceptron.predict(X_test)
perceptron_acc = accuracy_score(y_test, perceptron_pred)
print("Perceptron Accuracy: ", perceptron_acc)

Perceptron Accuracy:  0.757


In [None]:
# Train and evaluate a Multilayer Perceptron
mlp = MLPClassifier(hidden_layer_sizes=(50,50))
mlp.fit(X_train, y_train)
mlp_pred = mlp.predict(X_test)
mlp_acc = accuracy_score(y_test, mlp_pred)
print("MLP Accuracy: ", mlp_acc)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['labels'], test_size=0.3, random_state=1)

# Create the CountVectorizer object with stop_words=None
vec = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=None,
)

In [None]:
# Transform the text data
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

In [None]:
# Create individual classifiers
svc = SVC(probability=True)
rfc = RandomForestClassifier(n_estimators=100)
nb = MultinomialNB()
log = LogisticRegression()

In [None]:
# Create the ensemble model using a voting classifier
ensemble = VotingClassifier(
    estimators=[('svc', svc), ('rfc', rfc), ('nb', nb), ('log', log)],
    voting='soft'
)


In [None]:
# Fit the ensemble model to the training data
ensemble.fit(X_train, y_train)

In [None]:
# Make predictions using the ensemble model
preds = ensemble.predict(X_test)

In [None]:
# Print classification report and accuracy score
print(classification_report(y_test, preds))
print("Accuracy:", accuracy_score(y_test, preds))