# INFO 498 Final Project

In [3]:
# Import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import re
import ssl
import string
from collections import Counter

import little_mallet_wrapper
from pathlib import Path

import nltk
from nltk import ne_chunk
from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\uyvie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\uyvie\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\uyvie\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\uyvie\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\uyvie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Load and Check Data

In [3]:
# Read in data
df = pd.read_csv('./chatgpt.csv.bz2')

In [4]:
# Check number of rows and columns in data
print(df.shape)
df.head()

(375027, 6)


Unnamed: 0,date,id,content,username,like_count,retweet_count
0,2023-03-11 17:18:29+00:00,1634604721058004993,"I have just published ChatGptNet, a library th...",marcominerva,129.0,37.0
1,2023-01-27 11:13:40+00:00,1618930234820288513,How to make money with Chat GPT?\n\nFollow the...,Fitness_Empire0,76.0,9.0
2,2023-03-24 02:19:12+00:00,1639089452545875970,Lack of safeguards for freedom of speech: The ...,rajeshsersia,0.0,0.0
3,2023-02-01 08:36:15+00:00,1620702557411901441,#ChatGPT: a threat or an opportunity for #educ...,LLInC_Leiden,1.0,0.0
4,2023-01-28 10:07:29+00:00,1619275965078839297,"Back in 2020, people thought lock down is fore...",vspeeeeee,0.0,0.0


In [5]:
# Check for missing values
print('Missing values:')
print(df.isnull().sum())

df = df.dropna()

Missing values:
date              0
id                4
content           4
username         28
like_count       50
retweet_count    50
dtype: int64


In [6]:
# Check data types
print('Data types:')
print(df.dtypes)

Data types:
date              object
id                object
content           object
username          object
like_count       float64
retweet_count    float64
dtype: object


In [7]:
# Show summary of data
print('Summary statistics:')
print(df.describe())

Summary statistics:
          like_count  retweet_count
count  374977.000000  374977.000000
mean        7.143417       1.504042
std       224.018703      48.334174
min         0.000000       0.000000
25%         0.000000       0.000000
50%         1.000000       0.000000
75%         2.000000       0.000000
max     64094.000000   16080.000000


### Named-Entity Recognition

In [None]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

# Get the list of stopwords and punctuation
stopwords_list = set(stopwords.words('english'))
punctuation_set = set(string.punctuation)

# Define a function to perform tokenization with checks for special characters and stop words
def tokenize_text(text):
    tokens = nltk.word_tokenize(str(text))
    return tokens

# Tokenize the text data in the "content" column
df['content_tokenized'] = df['content'].apply(tokenize_text)

In [None]:
def perform_ner(tokenized_text):
    
    # Perform part-of-speech tagging, label nouns, verbs, adjectives, etc. 
    tagged_text = nltk.pos_tag(tokenized_text)
    
    # Perform named entity recognition, perform named entity recognition
    ner_result = ne_chunk(tagged_text)
    return ner_result

# Apply NER to the 'content_tokenized' column
df['ner_result'] = df['content_tokenized'].apply(perform_ner)

In [None]:
# Flatten the named entity results and extract named entities
flattened_entities = []

# Extract named entities
for sublist in df['ner_result']:
    for ent in sublist:
        if isinstance(ent, Tree):
            flattened_entities.append(' '.join([token[0] for token in ent.leaves()]))

# Count the frequency of named entities
entity_counts = Counter(flattened_entities)

# Set the number of top named entities to display
top_n = 10

# Get the top N named entities
top_entities = entity_counts.most_common(top_n)

# Display the top N named entities
for entity, count in top_entities:
    print(entity, count)

# Extract entities and counts for plotting
entities = [entity for entity, count in top_entities]
counts = [count for entity, count in top_entities]

# Plot the bar chart
plt.bar(range(len(entities)), counts, align='center')
plt.xticks(range(len(entities)), entities, rotation=45)
plt.xlabel('Named Entities')
plt.ylabel('Frequency')
plt.title(f'Top {top_n} Named Entities')
plt.tight_layout()
plt.show()

### Sentiment Analysis

In [None]:
# Specify the percentage threshold for a given tweet, this percentage cuts off tweets that have 
# under X% of tokens/words that are present in the vader lexicon and correctly processed

# Specify the desired threshold (e.g., 20%)
threshold = 0.35

# Index of specific tweet to analyze, this is mainly used for debugging but it is interesting to see how the tokenization works 
tweet_index = 50

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Get the list of stopwords and punctuation
stopwords_list = set(stopwords.words('english'))
punctuation_set = set(string.punctuation)

# Initialize the SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Define a function to perform tokenization with checks for special characters, stop words, and VADER lexicon presence
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)  # Tokenize the text

    #Remove tokens starting with a special character
    tokens = [token for token in tokens if not re.match(r'^[^a-zA-Z0-9]', token)]

    # Remove URLs and links
    tokens = [token for token in tokens if not re.match(r'^https?://[^\s]+', token)]

    # Remove trailing special characters
    tokens = [token.strip(string.punctuation) for token in tokens]

    # Convert tokens to lowercase
    tokens = [token.lower() for token in tokens]

    tokens = [token for token in tokens if len(token) > 1]

    stop_words = set(stopwords.words('english'))
    
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

def filter_tokens(tokens):
    filtered_tokens = [token for token in tokens if token not in stopwords_list and token in sid.lexicon]
    filtered_tokens = [token for token in filtered_tokens if token not in punctuation_set]
    return filtered_tokens

# Define a function to print words and their sentiment scores
def print_word_sentiment(tokens):
    for token in tokens:
        scores = sid.polarity_scores(token)
        print(f'Word: {token}\tSentiment Scores: {scores}')

# Define a function to calculate the average sentiment score for a tweet
def calculate_average_score(tokens):
    scores = [sid.polarity_scores(token)['compound'] for token in tokens]
    return sum(scores) / len(scores) if scores else 0.0

# Define a function to check if the given percentage of words are successfully tokenized and calculate the tokenization percentage
def check_tokenization_threshold(tokens, threshold):
    num_tokens = len(tokens)
    if num_tokens == 0:
        return False, 0.0
    filtered_tokens = filter_tokens(tokens)
    num_successful_tokens = len(filtered_tokens)
    tokenization_percentage = num_successful_tokens / num_tokens
    return tokenization_percentage >= threshold, tokenization_percentage

df['content_tokenized'] = df['content'].apply(tokenize_text)
df['tokenization_percentage'] = df['content_tokenized'].apply(lambda tokens: check_tokenization_threshold(tokens, threshold)[1])

# Filter out texts that don't meet the tokenization threshold
df = df[df['tokenization_percentage'] >= threshold]

# Example usage to select and analyze a specific tweet by index
selected_tweet = df['content'].iloc[tweet_index]
tokenized_tweet = df['content_tokenized'].iloc[tweet_index]

if tokenized_tweet:
    print(f'Selected Tweet: {selected_tweet}')
    print('\nTokenized Content:')
    print(tokenized_tweet)

    print('\nIndividual Word Sentiment Scores:')
    print_word_sentiment(tokenized_tweet)

    average_score = calculate_average_score(tokenized_tweet)
    print(f'\nAverage Score: {average_score}')
else:
    print('Text does not meet the tokenization threshold and cannot be analyzed.')

# Print the updated DataFrame with the tokenization percentage column
print(df)

In [None]:
# Calculate the average sentiment score for each tweet
df['average_sentiment'] = df['content_tokenized'].apply(calculate_average_score)

# Create a distribution plot of the average sentiment scores
plt.hist(df['average_sentiment'], bins=10, edgecolor='black')
plt.xlabel('Average Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Average Sentiment Scores')
plt.show()

### Topic Modeling

In [4]:
df = pd.read_csv('./chatgpt.csv.bz2')

In [5]:
df = df.dropna()
print(df.shape)
df.head()

(374977, 6)


Unnamed: 0,date,id,content,username,like_count,retweet_count
0,2023-03-11 17:18:29+00:00,1634604721058004993,"I have just published ChatGptNet, a library th...",marcominerva,129.0,37.0
1,2023-01-27 11:13:40+00:00,1618930234820288513,How to make money with Chat GPT?\n\nFollow the...,Fitness_Empire0,76.0,9.0
2,2023-03-24 02:19:12+00:00,1639089452545875970,Lack of safeguards for freedom of speech: The ...,rajeshsersia,0.0,0.0
3,2023-02-01 08:36:15+00:00,1620702557411901441,#ChatGPT: a threat or an opportunity for #educ...,LLInC_Leiden,1.0,0.0
4,2023-01-28 10:07:29+00:00,1619275965078839297,"Back in 2020, people thought lock down is fore...",vspeeeeee,0.0,0.0


In [6]:
# Convert "content" column to string data type
df['content'] = df['content'].astype(str)

In [7]:
# Use a sample of the dataset to ensure reasonable run time
work_df = df.sample(10000)

In [8]:
# Transform all the Twitter posts to lowercase and remove stopwords, punctuation, and numbers
training_data = [little_mallet_wrapper.process_string(text, numbers='remove') for text in work_df['content']]

In [9]:
# Create a list of the original (not pre-processed) Twitter posts
original_texts = []

for text in work_df['content']:
    original_texts.append(text)

In [12]:
# Set up Mallet

# Number of topics to be returned
num_topics = 20

path_to_mallet = '../mallet/bin/mallet'

training_data = training_data

# Set output directory
output_directory_path = 'topic-model-output/'

# Create output directory
Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

# Create output files
path_to_training_data           = f'{output_directory_path}/training.txt'
path_to_formatted_training_data = f'{output_directory_path}/mallet.training'
path_to_model                   = f'{output_directory_path}/mallet.model.{str(num_topics)}'
path_to_topic_keys              = f'{output_directory_path}/mallet.topic_keys.{str(num_topics)}'
path_to_topic_distributions     = f'{output_directory_path}/mallet.topic_distributions.{str(num_topics)}'

In [22]:
# Train topic model
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                              output_directory_path,
                                              num_topics,
                                              training_data)
print('Done training!')

Importing data...
Complete
Training topic model...
Complete
done training


In [14]:
# Print all the topics with topic number and topic key words
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

for number, topic in enumerate(topics):
    print(f"✨Topic {number}✨\n\n{topic}\n")

✨Topic 0✨

['chatgpt', 'gpt', 'chat', 'like', 'use', 'think', 'one', 'know', 'time', 'people', 'using', 'good', 'would', 'make', 'get', 'better', 'even', 'could', 'see', 'need']

✨Topic 1✨

['chatgpt', 'https', 'openai', 'users', 'plus', 'million', 'get', 'free', 'month', 'app', 'paid', 'per', 'months', 'subscription', 'access', 'number', 'chatgptplus', 'daily', 'days', 'version']

✨Topic 2✨

['https', 'chatgpt', 'python', 'cybersecurity', 'trading', 'machinelearning', 'stocks', 'daysofcode', 'options', 'news', 'tech', 'deeplearning', 'free', 'fintech', 'iot', 'web', 'take', 'bitcoin', 'investing', 'investments']

✨Topic 3✨

['https', 'chatgpt', 'crypto', 'gpt', 'nft', 'airdrop', 'bitcoin', 'web', 'blockchain', 'btc', 'eth', 'cryptocurrency', 'magic', 'imgnai', 'token', 'future', 'nfts', 'ape', 'ethereum', 'powerful']

✨Topic 4✨

['chatgpt', 'language', 'https', 'model', 'gpt', 'data', 'openai', 'text', 'models', 'based', 'new', 'large', 'trained', 'api', 'generative', 'like', 'natural

In [15]:
# Load the topic distributions for all the documents (the probability that each document contains each of the topics)
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

In [16]:
# Functions that will help to examine the top Twitter posts for each topic
from IPython.display import Markdown, display
import re

def make_md(string):
    """A function that transforms string data into Markdown
    so it can be nicely formatted with bolding and emojis
    """
    display(Markdown(str(string)))

def get_top_docs(docs, topic_distributions, topic_index=1, n=5):
    
    """A function that shows the top documents for a given set of topic distributions
    and a specific topic number
    """
    
    sorted_data = sorted([(_distribution[topic_index], _document) for _distribution, _document in zip(topic_distributions, docs)], reverse=True)
    topic_words = topics[topic_index]
    make_md(f"### ✨Topic {topic_index}✨\n\n{topic_words}\n\n---")
    
    for probability, doc in sorted_data[:n]:
        # Make topic words bolded
        for word in topic_words:
            if word in doc.lower():
                doc = re.sub(f"\\b{word}\\b", f"**{word}**", doc, re.IGNORECASE)
        make_md(f'✨  \n**Topic Probability**: {probability}  \n**Document**: {doc}\n\n')

In [19]:
# Display the top 5 Twitter posts with the highest probability of containing the given topic
get_top_docs(work_df['content'], topic_distributions, topic_index=1, n=5)

### ✨Topic 1✨

['chatgpt', 'https', 'openai', 'users', 'plus', 'million', 'get', 'free', 'month', 'app', 'paid', 'per', 'months', 'subscription', 'access', 'number', 'chatgptplus', 'daily', 'days', 'version']

---

✨  
**Topic Probability**: 0.9387115241913839  
**Document**: Timeline for adoption of 1 **million** **plus** **users**:

#Netflix – 3 years and 5 **months**
#Twitter – 2 years
#Spotify - 2 years **plus**
#Pinterest - 20 **months**
#ClassPass - 12 months
#Facebook – 10 months
#Hulu - 10 months
#Instagram – 2.5 months

#ChatGPT – 5 **days** 

Source: @Vitdwesh



✨  
**Topic Probability**: 0.8367001126248945  
**Document**: These tiktok hearings are the trailer to **get** us invested in the @OpenAI #ChatGPT hearings.

Those will be GOLD!



✨  
**Topic Probability**: 0.8354987214622681  
**Document**: Time it took to hit 1 Million **users**:

#Facebook - 2 years
#Instagram - 2 years
#Pinterest - 5 **months**
#AngryBirds - 34 **days**
#ChatGPT - 5 **days**



✨  
**Topic Probability**: 0.8153747432536841  
**Document**: Update **version** v.1.0.5, XChatBot ChatGPT Complete Flutter App. Add Webview (InApp Web). **https**://t.co/9OAdUm8vQC #flutter #flutterdev #xchatbot #**chatgpt** #**openai** #OpenAIChatGPT #flutterapp #flutterid **https**://t.co/4kdWK5XZ1s



✨  
**Topic Probability**: 0.8153747432536841  
**Document**: Booking a flight has never been easier! With HappyFares, you **get** the best deals and the most convenient booking experience. 🛫✈️ 
.
.
.
#bestflightbookingwebsite #happyfares #flyhigh #happyfares #flight #Number1 #travel #travelling #travelgram #travelphotography #ChatGPT **https**://t.co/ukbJWL2mK6



In [20]:
# Display the top 5 Twitter posts with the highest probability of containing the given topic
get_top_docs(work_df['content'], topic_distributions, topic_index=2, n=5)

### ✨Topic 2✨

['https', 'chatgpt', 'python', 'cybersecurity', 'trading', 'machinelearning', 'stocks', 'daysofcode', 'options', 'news', 'tech', 'deeplearning', 'free', 'fintech', 'iot', 'web', 'take', 'bitcoin', 'investing', 'investments']

---

✨  
**Topic Probability**: 0.9361340915584833  
**Document**: $BK Awaiting Buy Signal based off 7 signals $939 net profit 7.41 profit factor 85% win rate on a 15-min chart. #**trading** #**stocks** #**options** #**chatgpt** #ai #contentmassive 🚀 Free trial at **https**://t.co/tX8L2oEUgX **https**://t.co/EsVmsZ8rhZ



✨  
**Topic Probability**: 0.9221372521300545  
**Document**: The Ascent of ChatGPT

**https**://t.co/Xus4kWb1gF

#Python #DataScientist #BigData #Analytics #DataScience #AI #TensorFlow #JavaScript #CloudComputing #Coding #100DaysofCode #programming #flutter #SQL #MLOps #satriaadhipradana #ChatGPT



✨  
**Topic Probability**: 0.9221372521300545  
**Document**: Take a look to $YINN
 #daytrading  #StocksToWatch  #bottomfishing  #**options**  #**news**  #**trading**  #Stocks  #**investing**  #RedditArmy  #YOLO  #FOMO  #**investments**  #StocksToBuy  #ToTheMoon #ChatGPT
 **https**://t.co/uxpaqv8azG



✨  
**Topic Probability**: 0.9221372521300545  
**Document**: Take a look to $KRT
 #Stocks  #RedditArmy  #bottomfishing  #**trading**  #FOMO  #**news**  #**investing**  #ToTheMoon  #StocksToBuy  #YOLO  #**investments**  #StocksToWatch  #daytrading  #**options** #ChatGPT
 **https**://t.co/jX6EGvcfld



✨  
**Topic Probability**: 0.9221372521300545  
**Document**: Take a look to $GSD
 #ToTheMoon  #**investing**  #StocksToWatch  #daytrading  #Stocks  #RedditArmy  #bottomfishing  #StocksToBuy  #**investments**  #FOMO  #YOLO  #**trading**  #**news**  #**options** #ChatGPT
 **https**://t.co/u3Y3nXpWCT



In [21]:
# Display the top 5 Twitter posts with the highest probability of containing the given topic
get_top_docs(work_df['content'], topic_distributions, topic_index=3, n=5)

### ✨Topic 3✨

['https', 'chatgpt', 'crypto', 'gpt', 'nft', 'airdrop', 'bitcoin', 'web', 'blockchain', 'btc', 'eth', 'cryptocurrency', 'magic', 'imgnai', 'token', 'future', 'nfts', 'ape', 'ethereum', 'powerful']

---

✨  
**Topic Probability**: 0.9361126739825462  
**Document**: Just checked my portfolio and this #GPT4 is doing crazy numbers! 🔥🔥🔥 

👉**https**://t.co/KixnEguJ4Y

  #**airdrop** #NFT #OpenAIChatGPT #ChatGPT #OPTIMUS $ARB $SUI $USDT $BONK  $RINIA $ETH $SHIB $RDNT $EVMOS $FLOKI $ARB  $ARB **https**://t.co/iGEZGrqqg8



✨  
**Topic Probability**: 0.925381802463898  
**Document**: #ChatGPT can triple productivity.🦾🧠 Are you ready to enter the world of #AI + #Crypto = #GPT4?

Uniswap 👉 **https**://t.co/hzmODBSFtw
Dextool 👉 **https**://t.co/zuwVBIyrwM

#AI $EGGS #AI $GPT4 $SUI $USDT $HEX https://t.co/kjU8CMzMFF



✨  
**Topic Probability**: 0.9221111406825565  
**Document**: GPT-4 is 500 Times More **powerful** than the current ChatGPT, so is $GPT4 

👉**https**://t.co/T3Ks0zNyaj

 #**airdrop** #NFT $MAGIC $BOO #ChatGPT $hook $**magic** $APE $MATIC **https**://t.co/h0HRlAUfHZ



✨  
**Topic Probability**: 0.9221111406825565  
**Document**: GM!☕️☀️ 
Just aped 3 **eth** in #GPT4 and it is pumping!! 📈📈📈
Have you #HODL #GPT4 before its #ATH ? 🚀🚀🚀 Buy the **token** at Uniswap before it pumps TOOO high!!!

🦄
**https**://t.co/PSxNr7xKah

#zksync #ChatGPT **https**://t.co/Zl6sIAMMUu



✨  
**Topic Probability**: 0.9185406163096128  
**Document**: GPT-4 is 500 Times More **powerful** than the current ChatGPT, so is $GPT4 

👉**https**://t.co/dfFxkJWplp

 #**airdrop** #NFT $TSUKA $imgnAI $BONE $ocean $BCB #Damus $APE **https**://t.co/CQTiPS5Cxm



Based on the topic words and the top documents, I would label each topic as the following:
1. Topic 1 Label - Rapid Success and Usage of AI
2. Topic 2 Label - "Get-Rich-Quick" AI Callouts
3. Topic 3 Label - AI and Cryptocurrency