# Prepare

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install snscrape

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snscrape
  Downloading snscrape-0.6.2.20230320-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.8/71.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: snscrape
Successfully installed snscrape-0.6.2.20230320


In [3]:
%cd '/content/drive/MyDrive/BDT SEM2/5005/'

/content/drive/MyDrive/BDT SEM2/5005


In [4]:
# os
import os
# storage
import pickle

# data processing
import pandas as pd
import snscrape.modules.twitter as sntwitter
import numpy as np

# visualization
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# utils
from random import sample
import datetime as dt
from collections import Counter

# tqdm
from tqdm import tqdm

# warnings
import warnings

warnings.filterwarnings("ignore")

# Convert every .pkl to .csv and remove unnecessary columns

In [None]:
cols=[\
      'id',\
      'date',\
      'username',\
      'rawContent',\
      'retweetCount',\
      'likeCount'
      ]

def pkl_to_df(tweets_dict):
    data = []
    for tid, tweet in tweets_dict.items():
        twi = list()
        # tweet id
        twi.append(tweet.id)
        # date
        twi.append(tweet.date.strftime('%Y-%m-%d'))
        # usr name
        twi.append(tweet.user.username)
        # content
        twi.append(tweet.rawContent)
        # retweetCount
        twi.append(tweet.retweetCount)
        # likeCount
        twi.append(tweet.likeCount)
        
        data.append(twi)
    return pd.DataFrame(data, columns=cols)

In [None]:
files = tqdm(os.listdir('./dataset/'))
target_dir = './dataset_rawcontent/'
for file in files:
    if file.endswith('.pkl'):
        with open('./dataset/'+file, 'rb') as f:
            tweets_dict = pickle.load(f)
            tweets_df = pkl_to_df(tweets_dict)
            if not os.path.exists(target_dir):
                os.mkdir(target_dir)
            tweets_df.to_csv(target_dir+file.strip('.pkl')+'.csv', index=False)
        

100%|██████████| 21/21 [06:38<00:00, 18.99s/it]


# Combine all csv and drop duplicate tweets

In [None]:
%cd './dataset_rawcontent/'

/content/drive/MyDrive/BDT SEM2/5005/dataset_rawcontent


In [None]:
csvfiles = tqdm(os.listdir())
target_dir = './dataset_rawcontent/'
df_l = []
for file in csvfiles:
    if file.endswith('.csv') and (not file.startswith('SUM')):
        df_l.append(pd.read_csv(file))
concat_df = pd.concat(df_l, ignore_index=True)

100%|██████████| 20/20 [00:18<00:00,  1.08it/s]


In [None]:
concat_df = concat_df.drop_duplicates(subset=['id'])
concat_df = concat_df.drop_duplicates(subset=['rawContent'])
concat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71712 entries, 0 to 1202619
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            71711 non-null  float64
 1   date          71711 non-null  object 
 2   username      71711 non-null  object 
 3   rawContent    71711 non-null  object 
 4   retweetCount  71711 non-null  float64
 5   likeCount     71711 non-null  float64
 6   Unnamed: 0    1 non-null      float64
 7   word          1 non-null      object 
 8   tfidf         1 non-null      float64
dtypes: float64(5), object(4)
memory usage: 5.5+ MB


In [None]:
concat_df.to_csv('SUM.csv', index=False)

# TF-IDF

In [None]:
%cd './dataset_rawcontent/'

/content/drive/MyDrive/BDT SEM2/5005/dataset_rawcontent


In [None]:
tif_df = pd.read_csv('sum_processed.csv')
tmp = tif_df.lemmaContent
p_corpus = tmp.dropna().tolist()
p_corpus

['look speak worker start ai model help job tell boss happy talk anonymously',
 'ask     damn self want read',
 'mute     kindness fuck ask',
 'zero ai     marketing thing datum     neural map     map parse     unreal speed intelligence',
 'fuck end mix detail news conference         sound like write story shit',
 'congrat find combination copilot      powerful hack prototype new company quickly touch lot technology know deeply excited experience come',
 'machine learn artificial intelligence manipulate datum datum magic',
 'opinion     tool weapon chit juan read',
 '      task administrative       legal profession automate      construction      maintenance uncertainty long term impact ai     ai tech job jobcut',
 'goldman sach report suggest ai replace       million job globally bring productivity boom new job ai ability replace quarter work task europe generative ai particularly see major advancement     tech',
 '    rule odd historian know include city hall record keeper cover chro

In [None]:
for s in p_corpus:
    if 'datum' in p_corpus:
        print(s)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X=vectorizer.fit_transform(p_corpus)

In [None]:
data = {'word':vectorizer.get_feature_names_out(), 'tfidf':X.toarray().sum(axis=0).tolist()}
X_df = pd.DataFrame(data)
X_df = X_df.sort_values(by='tfidf', ascending=False)
X_df

Unnamed: 0,word,tfidf
657,ai,2431.133786
33754,write,1377.711780
1940,ask,1084.910469
17524,like,1062.612066
12677,google,1032.717092
...,...,...
7325,daaa,0.093023
9188,dunnn,0.093023
28609,stargazing,0.084429
2623,barbecuing,0.084429


In [None]:
X_df.to_csv('TFIDF_words_lemm.csv')

# Topic Modeling

In [5]:
!pip install contractions
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

In [6]:
import pandas as pd
import unicodedata
import re
import contractions
import string
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#spacy
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
#vis
import pyLDAvis
import pyLDAvis.gensim_models

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Clean & Preprocessing & Save

### Functions:

**Lower-Case Conversion:**

convert our entire text data into lowercase.

In [None]:
# Lower-Case Conversion
def to_lowercase(text):
  return text.lower()

  and should_run_async(code)


**Standardizing Accent Characters:**

Sometimes, people use accented characters like é, ö, etc. to signify emphasis on a particular letter during pronunciation. In some instances, accent marks also clarify the meanings of words, which might be different without accent marks. Though the use of accent marks is very limited, it’s a good practice to convert these characters into standard ASCII characters.

In [None]:
def standardize_accented_chars(text):
  return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  
#testing the function on a single sample for explaination
print(standardize_accented_chars('Sómě words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.'))

Some words such as resume, cafe, protest, divorce, coordinate, expose, latte.


  and should_run_async(code)


**Removing URLs**

In [None]:
def remove_url(text):
  return re.sub(r'https?:\S*', '', text)

#testing the function on a single sample for explaination
print(remove_url('using https://www.google.com/ as an example'))
print(remove_url('using http://www.google.com/ as an example'))

using  as an example
using  as an example


  and should_run_async(code)


**Expanding Contractions**

Contractions are shortened versions of words or syllables. They are created by removing, one or more letters from words. More often than not, multiple words are combined to create a contraction. For example, I will is contracted into I’ll, do not into don’t. We wouldn’t want our model to consider I will and I’ll differently. Hence, we will convert each contraction into its expanded form using the below-mentioned code.

In [None]:
def expand_contractions(text):
  expanded_words = [] 
  for word in text.split():
    expanded_words.append(contractions.fix(word)) 
  return ' '.join(expanded_words)

#testing the function on a single sample for explaination
print(expand_contractions("Don't is same as do not"))
print(expand_contractions("I am gonna go"))
print(expand_contractions("I'll be there within 5 min. Shouldn't you be there too?\
I'd love to see u there my dear. It's awesome to meet new friends.\
We've been waiting for this day for so long."))

Do not is same as do not
I am going to go
I will be there within 5 min. Should not you be there too?I would love to see you there my dear. It is awesome to meet new friends.We have been waiting for this day for so long.


  and should_run_async(code)


**Removing Mentions and Hashtags**

After a quick analysis, I found that in this example, mentions and hashtags don’t contribute much to the information relevant for topic modeling as they don’t follow any pattern i.e. they are randomly used in tweets. Remove mentions and tags using the function mentioned below.

In [None]:
def remove_mentions_and_tags(text, rm_mention=True, rm_tag=True):
  if rm_mention:
    text = re.sub(r'@\S*', '', text)
  if rm_tag:
    text = re.sub(r'#\S*', '', text)
  return text

#testing the function on a single sample for explaination
print(remove_mentions_and_tags('Some random @abc and #def'))

Some random  and 


  and should_run_async(code)


**Keeping only Alphabet:** 

Punctuations, numbers, and special characters like ‘$, %, etc.’ don’t provide any information.

In [None]:
def keep_only_alphabet(text):
  # if 'to_lowercase' already done:
  #text = re.sub(r'[^a-z]', ' ', text)
  # otherwise
  text = re.sub(r'[^A-z]', ' ', text)
  return text

#testing the function on a single sample for explaination
print(keep_only_alphabet('Just a bit more $$processing required.Just a #bit!!!'))

Just a bit more   processing required Just a  bit   


  and should_run_async(code)


**Removing Stopwords(Default+Custom) and Removing Short Words:**

In [None]:
#creating a spaCy object. 
nlp = spacy.load("en_core_web_sm", disable=['parser','ner'])
my_stop_words = {'gpt','fuck','chatgpt', 'chatgpts', 'hey', 'hi', 'hmm', 'yo',\
'really','sometimes', 'go', 'since', 'whither', 'they', 'its', 'them',\
'well', 'meanwhile', 'seems', 'and', 'latterly', 'regarding', 'somehow',\
'sixty', 'whole', 'anyway', 'else', 'few', "'m", 'beside', 'to', 'namely',\
'someone', 'see', 'moreover', 'wherein', 'for', 'former', 'bottom', 'it',\
'next', 'six', 'along', 'once', 'might', 'whenever', 'below', 'another',\
'yourself', 'each', 'just', 'ourselves', 'everyone', 'any', 'across', \
'get', 'that', 'eight', 'we', 'which', 'therefore', 'may', "'s", 'keep',\
'among', 'give', 'such', 'are', 'indeed', 'everywhere', 'same', 'herself',\
'yourselves', 'alone', 'were', 'was', 'take', 'seem', 'say', 'why', 'show',\
'between', 'during', 'elsewhere', 'or', 'though', 'forty', 'made', 'used',\
'others', 'whereafter', 'formerly', 'several', 'via', 'does', 'please', 'three',\
'also', 'fifty', 'afterwards', "'s", 'noone', 'do', 'perhaps', 'further',\
'i', 'beforehand', 'myself', 'empty', "'ll", 'yet', 'thereby', 'been', 'both',\
'never', 'put', 'without', 'him', 'a', 'nothing', 'thereafter', 'make', 'then',\
'whom', 'must', 'sometime', 'against', 'through', 'being', 'four', 'back', 'become',\
'our', 'himself', 'because', 'anything', "'re", 'nor', 'therein', 'due', 'until',\
'own', 'ca', 'most', 'now', 'while', 'of', 'only', 'am', 'itself', 'too', "'m",\
'nobody', 'if', 'one', 'whereas', 'twelve', 'together', 'can', 'who', 'even',\
'be', 'she', 'besides', 'herein', 'off', "'d", 'last', 'no', 'whereupon', 'the', "'m",\
'thru', 'out', 'hereupon', 'by', 'us', 'already', 'became', 'here', 'hers', 'onto',\
'beyond', 'down', 'enough', 'did', 'some', 'over', 'serious', 'quite', 'move', 'around',\
'nowhere', 'amongst', 'but', 'so', 'wherever', 'twenty', 'often', 'part',\
'again', 'where', 're', 'within', 'at', "n't", 'yours', 'front', 'unless',\
'could', 'anyone', 'third', 'whatever', 'doing', "'d", 'nevertheless',\
'before', 'rather', 'fifteen', 'her', 'me', 'thereupon', 'mostly',\
'throughout', 'hence', "'re", 'mine', 'ten', 'hundred', 'nine', 'call',\
'when', 'about', 'will', 'whereby', 'this', 'upon', 'you', 'should',\
'always', 'themselves', 'not', 'has', 'behind', 'on', 'anywhere', 'side',\
'their', 'hereby', 'latter', 'after', "'ve", 'none', 'these', 'name',\
"n't", 'every', 'although', "'s", 'however', 'he', 'becoming', 'how', \
'whose', 'still', 'hereafter', 'whether', 'towards', 'more', 'everything',\
'whoever', 'seemed', 'cannot', 'up', 'otherwise', 'in', 'would', 'under',\
'done', 'thence', 'whence', 'seeming', 'either', 'other', 'with', 'into',\
'amount', 'five', 'much', "'re", 'except', 'his', 'thus', "'ll", 'what',\
'almost', 'becomes', 'least', 'ever', 'above', 'is', 'first', 'there',\
'somewhere', 'top', "'ve", 'than', "n't", 'have', 'toward', 'per', 'all',\
'ours', 'full', "'d", 'anyhow', 'as', "'ll", 'many', 'various', 'your',\
'had', 'eleven', 'from', 'something', 'less', 'those', 'using', 'an',\
'two', 'my', 'very', 'neither'}

  and should_run_async(code)


In [None]:
def remove_stopwords(text,nlp,custom_stop_words=None,remove_small_tokens=True,min_len=1):
  # if custom stop words are provided, then add them to default stop words list
  if custom_stop_words:
    nlp.Defaults.stop_words = custom_stop_words
  
  filtered_sentence = [] 
  doc = nlp(text)
  for token in doc:
    if token.is_stop == False: 
      # if small tokens have to be removed, then select only those which are longer than the min_len 
      if remove_small_tokens:
        if len(token.text) > min_len:
          filtered_sentence.append(token.text)
      else:
        filtered_sentence.append(token.text)

    # if after the stop word removal, words are still left in the sentence, then return the sentence as a string else return null 
  return ' '.join(filtered_sentence) if len(filtered_sentence) > 0 else np.nan

#removing stop-words and short words from every row
# tweets_df.Tweets=tweets_df.Tweets.apply(lambda x:remove_stopwords(x,nlp,{"elon","musk",}))
remove_stopwords(" chatgpt really say   socially liberal   fiscally conservative",nlp, my_stop_words)

  and should_run_async(code)


'chatgpt    socially liberal    fiscally conservative'

**Lemmatization:**
Lemmatization generates the root of the word. It makes use of vocabulary and morphological analysis of words, to generate the root form of a word. We will use the spaCy library for performing lemmatization.

In [None]:
def lemmatize(text, nlp):
  doc = nlp(text)
  lemmatized_text = []
  for token in doc:
    lemmatized_text.append(token.lemma_)
  return ' '.join(lemmatized_text)

#testing the function on a single sample for explaination
print(lemmatize('Reading NLP blog is fun.' ,nlp ))

#Performing lemmatization on every row
#tweets_df.Tweets=tweets_df.Tweets.apply(lambda x:lemmatize(x,nlp)

read NLP blog be fun .


  and should_run_async(code)


### preprocessing:

In [None]:
%pwd

  and should_run_async(code)


'/content/drive/MyDrive/BDT SEM2/5005'

In [None]:
%cd '/content/drive/MyDrive/BDT SEM2/5005/'

/content/drive/MyDrive/BDT SEM2/5005


  and should_run_async(code)


In [None]:
t_df = pd.read_csv('/content/drive/MyDrive/BDT SEM2/5005/dataset_rawcontent/tweet_230201-230228.csv')
t_df.info(10)

  and should_run_async(code)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225140 entries, 0 to 1225139
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            831924 non-null  object 
 1   date          831921 non-null  object 
 2   username      831921 non-null  object 
 3   rawContent    831897 non-null  object 
 4   retweetCount  515724 non-null  float64
 5   likeCount     515724 non-null  float64
dtypes: float64(2), object(4)
memory usage: 56.1+ MB


In [None]:
#files = tqdm(os.listdir('./dataset_rawcontent/'))
files = tqdm(['tweet_230101-230130_az.csv'])
target_dir = './dataset_processed/'
for file in files:
    if file.endswith('.csv') and ('tweet' in file):
        # read
        df = pd.read_csv('./dataset_rawcontent/'+file)

        df['rawContent'] = df['rawContent'].astype('str') 

        tweets_df = df.rawContent.copy()

        # apply all the funcs
        tweets_df = tweets_df\
        .apply(to_lowercase)\
        .apply(standardize_accented_chars)\
        .apply(remove_url)\
        .apply(expand_contractions)\
        .apply(remove_mentions_and_tags)\
        .apply(keep_only_alphabet)

        tweets_df_rms = tweets_df.apply(lambda x:remove_stopwords(x,nlp,my_stop_words))

        tweets_df_lemma = tweets_df_rms.apply(lambda x:lemmatize(x,nlp) if x is not np.nan else np.nan)

        df['lemmaContent'] = tweets_df_lemma
        df['processedContent'] = tweets_df

        # save
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        df.to_csv(target_dir+file, index=False)

## Pipeline
**start to process our data:**

first, define the text you want to perform topic modeling:

In [None]:
%pwd

'/content/drive/MyDrive/BDT SEM2/5005'

In [None]:
root = './dataset_processed/'
csvfiles = tqdm(os.listdir(root))
df_l = []
for file in csvfiles:
    if file.endswith('.csv') and file.startswith('tweet_verified'):
        df_l.append(pd.read_csv(root+file))
tweets_df = pd.concat(df_l, ignore_index=True)

  and should_run_async(code)
100%|██████████| 18/18 [00:14<00:00,  1.20it/s]


filter

In [None]:
#tweets_df = tweets_df.loc[df['likeCount'] > 2]
#filtered_df = tweets_df.loc[tweets_df['likeCount'] > 10]
filtered_df = tweets_df.copy()

  and should_run_async(code)


In [None]:
filtered_df.info()

  and should_run_async(code)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1202619 entries, 0 to 1202618
Data columns (total 8 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   id                1202619 non-null  int64 
 1   date              1202619 non-null  object
 2   username          1202619 non-null  object
 3   rawContent        1202619 non-null  object
 4   retweetCount      1202619 non-null  int64 
 5   likeCount         1202619 non-null  int64 
 6   lemmaContent      1202272 non-null  object
 7   processedContent  1202619 non-null  object
dtypes: int64(3), object(5)
memory usage: 73.4+ MB


In [None]:
tweets_df_rms = filtered_df.lemmaContent.astype(str).apply(lambda x:remove_stopwords(x,nlp,my_stop_words))

  and should_run_async(code)


In [None]:
filtered_df['rmsContent'] = tweets_df_rms
filtered_df

  and should_run_async(code)


Unnamed: 0,id,date,username,rawContent,retweetCount,likeCount,lemmaContent,processedContent,rmsContent
0,1640881210586193921,2023-03-29,cameronwilson,I'm looking to speak to workers who've started...,1,1,look speak worker start chatgpt ai model help ...,i am looking to speak to workers who have star...,look speak worker start ai model help job tell...
1,1640880822990557184,2023-03-29,katiewalshstx,"""I asked ChatGPT to-"" keep to your damn self, ...",0,2,ask chatgpt damn self want read,i asked chatgpt to keep to your damn self ...,ask damn self want read
2,1640880099749937152,2023-03-29,katiewalshstx,"I have to mute ""ChatGPT,"" with kindness, I cou...",3,18,mute chatgpt kindness fuck ask,i have to mute chatgpt with kindness i cou...,mute kindness ask
3,1640879874473689089,2023-03-29,martypartymusic,@Blacklist_Ray No there is zero AI in chatGPT ...,0,0,zero ai chatgpt marketing thing datum ...,no there is zero ai in chatgpt its a market...,zero ai marketing thing datum neural...
4,1640879019020230658,2023-03-29,WRBolen,How the fuck do you end up mixing in details f...,0,2,fuck end mix detail news conference so...,how the fuck do you end up mixing in details f...,end mix detail news conference sound ...
...,...,...,...,...,...,...,...,...,...
1202614,1598036175285276672,2022-11-30,sethbannon,Trying out ChatGPT and wow https://t.co/TQ0t7B...,15,163,try chatgpt wow,trying out chatgpt and wow,try wow
1202615,1598027429909778432,2022-11-30,minimaxir,ChatGPT thwarts my many attempts at malicious ...,22,186,chatgpt thwart attempt malicious javascript in...,chatgpt thwarts my many attempts at malicious ...,thwart attempt malicious javascript injection
1202616,1598024131978547200,2022-11-30,minimaxir,ChatGPT vs. riddles https://t.co/YuPYOD0U7o,2,18,chatgpt vs riddle,chatgpt vs riddles,vs riddle
1202617,1598015627540635648,2022-11-30,gdb,"Just launched ChatGPT, our new AI system which...",367,2361,launch chatgpt new ai system optimize dialogue...,just launched chatgpt our new ai system which...,launch new ai system optimize dialogue try


In [None]:
filtered_df = filtered_df.dropna()

  and should_run_async(code)


### Generating Document Matrix and Dictionary:

A **document-term matrix** is a mathematical matrix that describes the frequency of terms that occur in a collection of documents. In a document-term matrix, rows correspond to documents in the collection and columns correspond to terms.

A **Dictionary** is collection of all unique tokens present in documents.

In [13]:
def generate_tokens(tweet):
  words=[]
  for word in word_tokenize(tweet):
    # using the if condition because we introduced extra spaces during text cleaning
    if word != '':
       words.append(word)
  return words

#storing the generated tokens in a new column named 'words'
filtered_df['tokens']=filtered_df.rmsContent.apply(generate_tokens)

  and should_run_async(code)


In [14]:
filtered_df.tokens

  and should_run_async(code)


0          [look, speak, worker, start, ai, model, help, ...
1                              [ask, damn, self, want, read]
2                                      [mute, kindness, ask]
3          [zero, ai, marketing, thing, datum, neural, ma...
4          [end, mix, detail, news, conference, sound, li...
                                 ...                        
1196065                                           [try, wow]
1196066    [thwart, attempt, malicious, javascript, injec...
1196067                                         [vs, riddle]
1196068    [launch, new, ai, system, optimize, dialogue, ...
1196069    [try, talk, new, ai, system, optimize, dialogu...
Name: tokens, Length: 1196070, dtype: object

In [None]:
# save
filtered_df.to_csv('filtered_verified_tweets.csv', ignore_index=True)

  and should_run_async(code)


In [7]:
# load
filtered_df = pd.read_csv('filtered_verified_tweets.csv')

  and should_run_async(code)


In [15]:
def create_dictionary(words):
  return corpora.Dictionary(words)

#passing the dataframe column having tokens as the argument
id2word = create_dictionary(filtered_df.tokens)
print(id2word)

  and should_run_async(code)


Dictionary<29653 unique tokens: ['ai', 'anonymously', 'boss', 'happy', 'help']...>


In [18]:
filtered_df

  and should_run_async(code)


Unnamed: 0,id,date,username,rawContent,retweetCount,likeCount,lemmaContent,processedContent,rmsContent,tokens
0,1640881210586193921,2023-03-29,cameronwilson,I'm looking to speak to workers who've started...,1,1,look speak worker start chatgpt ai model help ...,i am looking to speak to workers who have star...,look speak worker start ai model help job tell...,"[look, speak, worker, start, ai, model, help, ..."
1,1640880822990557184,2023-03-29,katiewalshstx,"""I asked ChatGPT to-"" keep to your damn self, ...",0,2,ask chatgpt damn self want read,i asked chatgpt to keep to your damn self ...,ask damn self want read,"[ask, damn, self, want, read]"
2,1640880099749937152,2023-03-29,katiewalshstx,"I have to mute ""ChatGPT,"" with kindness, I cou...",3,18,mute chatgpt kindness fuck ask,i have to mute chatgpt with kindness i cou...,mute kindness ask,"[mute, kindness, ask]"
3,1640879874473689089,2023-03-29,martypartymusic,@Blacklist_Ray No there is zero AI in chatGPT ...,0,0,zero ai chatgpt marketing thing datum ...,no there is zero ai in chatgpt its a market...,zero ai marketing thing datum neural...,"[zero, ai, marketing, thing, datum, neural, ma..."
4,1640879019020230658,2023-03-29,WRBolen,How the fuck do you end up mixing in details f...,0,2,fuck end mix detail news conference so...,how the fuck do you end up mixing in details f...,end mix detail news conference sound ...,"[end, mix, detail, news, conference, sound, li..."
...,...,...,...,...,...,...,...,...,...,...
1196065,1598036175285276672,2022-11-30,sethbannon,Trying out ChatGPT and wow https://t.co/TQ0t7B...,15,163,try chatgpt wow,trying out chatgpt and wow,try wow,"[try, wow]"
1196066,1598027429909778432,2022-11-30,minimaxir,ChatGPT thwarts my many attempts at malicious ...,22,186,chatgpt thwart attempt malicious javascript in...,chatgpt thwarts my many attempts at malicious ...,thwart attempt malicious javascript injection,"[thwart, attempt, malicious, javascript, injec..."
1196067,1598024131978547200,2022-11-30,minimaxir,ChatGPT vs. riddles https://t.co/YuPYOD0U7o,2,18,chatgpt vs riddle,chatgpt vs riddles,vs riddle,"[vs, riddle]"
1196068,1598015627540635648,2022-11-30,gdb,"Just launched ChatGPT, our new AI system which...",367,2361,launch chatgpt new ai system optimize dialogue...,just launched chatgpt our new ai system which...,launch new ai system optimize dialogue try,"[launch, new, ai, system, optimize, dialogue, ..."


In [16]:
def create_document_matrix(tokens,id2word):
  corpus = []
  ids = []
  for id, text in tokens.iteritems():
    ids.append(id)
    corpus.append(id2word.doc2bow(text))
  return ids, corpus
  
#passing the dataframe column having tokens and dictionary
ids, corpus=create_document_matrix(filtered_df.tokens,id2word)
#print(tweets_df.tokens[0])
#print(corpus[0])

  and should_run_async(code)


In [20]:
len(corpus)

  and should_run_async(code)


1196070

### LDA Model

reference:[genism.LDA document](https://radimrehurek.com/gensim/models/ldamodel.html)

In [21]:
lda_model = gensim.models.ldamodel.LdaModel(\
corpus=corpus,\
id2word=id2word,\
num_topics=5,\
random_state=100,
)

  and should_run_async(code)


save the model

In [None]:
%pwd

  and should_run_async(code)


'/content/drive/MyDrive/BDT SEM2/5005'

In [28]:
from gensim.test.utils import datapath

# Save model to disk.
model_dir = '/content/drive/MyDrive/BDT SEM2/5005/models/'
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
file = datapath(model_dir+"lda_model_verified_filtered_topic5")
lda_model.save(file)

  and should_run_async(code)


Load the model

In [None]:
from gensim.test.utils import datapath

file = datapath(root+"lda_model_22Dec")
# Load a potentially pretrained model from disk.
lda_model = gensim.models.ldamodel.LdaModel.load(file)

### PyLDAvis visualization

[How to understand the visualization](https://community.alteryx.com/t5/Data-Science/Getting-to-the-Point-with-Topic-Modeling-Part-3-Interpreting-the/ba-p/614992)

In [22]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  and should_run_async(code)


In [24]:
# show_topic()

def get_lda_topics(model, num_topics, top_n_words):
  word_dict = {}
  for i in range(num_topics):
    word_dict['Topic #'+ '{:02d}'.format(i+1)] = [i[0] for i in model.show_topic(i, topn = top_n_words)];
  return pd.DataFrame(word_dict)
  
get_lda_topics(lda_model,5,10)

  and should_run_async(code)


Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05
0,write,ai,google,model,school
1,ask,tool,ai,user,ban
2,good,like,openai,language,student
3,like,new,microsoft,ai,exam
4,answer,use,search,like,pass
5,think,write,chatbot,datum,amp
6,thing,intelligence,new,use,university
7,know,artificial,bing,month,game
8,question,human,bard,amp,new
9,people,future,company,time,test


### post processing

In [25]:
from operator import itemgetter
from collections import Counter

t=[]

for i in corpus:
    topic_l = lda_model.get_document_topics(i, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
    t.append(max(topic_l,key=itemgetter(1))[0])

counter = Counter(t)
print(dict(counter))

  and should_run_async(code)


{1: 333543, 0: 349898, 4: 129720, 3: 198674, 2: 184235}


In [26]:
len(t)

  and should_run_async(code)


1196070

In [27]:
filtered_df['topic'] = t
filtered_df

  and should_run_async(code)


Unnamed: 0,id,date,username,rawContent,retweetCount,likeCount,lemmaContent,processedContent,rmsContent,tokens,topic
0,1640881210586193921,2023-03-29,cameronwilson,I'm looking to speak to workers who've started...,1,1,look speak worker start chatgpt ai model help ...,i am looking to speak to workers who have star...,look speak worker start ai model help job tell...,"[look, speak, worker, start, ai, model, help, ...",1
1,1640880822990557184,2023-03-29,katiewalshstx,"""I asked ChatGPT to-"" keep to your damn self, ...",0,2,ask chatgpt damn self want read,i asked chatgpt to keep to your damn self ...,ask damn self want read,"[ask, damn, self, want, read]",0
2,1640880099749937152,2023-03-29,katiewalshstx,"I have to mute ""ChatGPT,"" with kindness, I cou...",3,18,mute chatgpt kindness fuck ask,i have to mute chatgpt with kindness i cou...,mute kindness ask,"[mute, kindness, ask]",1
3,1640879874473689089,2023-03-29,martypartymusic,@Blacklist_Ray No there is zero AI in chatGPT ...,0,0,zero ai chatgpt marketing thing datum ...,no there is zero ai in chatgpt its a market...,zero ai marketing thing datum neural...,"[zero, ai, marketing, thing, datum, neural, ma...",1
4,1640879019020230658,2023-03-29,WRBolen,How the fuck do you end up mixing in details f...,0,2,fuck end mix detail news conference so...,how the fuck do you end up mixing in details f...,end mix detail news conference sound ...,"[end, mix, detail, news, conference, sound, li...",0
...,...,...,...,...,...,...,...,...,...,...,...
1196065,1598036175285276672,2022-11-30,sethbannon,Trying out ChatGPT and wow https://t.co/TQ0t7B...,15,163,try chatgpt wow,trying out chatgpt and wow,try wow,"[try, wow]",4
1196066,1598027429909778432,2022-11-30,minimaxir,ChatGPT thwarts my many attempts at malicious ...,22,186,chatgpt thwart attempt malicious javascript in...,chatgpt thwarts my many attempts at malicious ...,thwart attempt malicious javascript injection,"[thwart, attempt, malicious, javascript, injec...",0
1196067,1598024131978547200,2022-11-30,minimaxir,ChatGPT vs. riddles https://t.co/YuPYOD0U7o,2,18,chatgpt vs riddle,chatgpt vs riddles,vs riddle,"[vs, riddle]",2
1196068,1598015627540635648,2022-11-30,gdb,"Just launched ChatGPT, our new AI system which...",367,2361,launch chatgpt new ai system optimize dialogue...,just launched chatgpt our new ai system which...,launch new ai system optimize dialogue try,"[launch, new, ai, system, optimize, dialogue, ...",1


In [30]:
tweets_with_topic = filtered_df[['id','date','username','rawContent','topic','retweetCount','likeCount']]
tweets_with_topic

  and should_run_async(code)


Unnamed: 0,id,date,username,rawContent,topic,retweetCount,likeCount
0,1640881210586193921,2023-03-29,cameronwilson,I'm looking to speak to workers who've started...,1,1,1
1,1640880822990557184,2023-03-29,katiewalshstx,"""I asked ChatGPT to-"" keep to your damn self, ...",0,0,2
2,1640880099749937152,2023-03-29,katiewalshstx,"I have to mute ""ChatGPT,"" with kindness, I cou...",1,3,18
3,1640879874473689089,2023-03-29,martypartymusic,@Blacklist_Ray No there is zero AI in chatGPT ...,1,0,0
4,1640879019020230658,2023-03-29,WRBolen,How the fuck do you end up mixing in details f...,0,0,2
...,...,...,...,...,...,...,...
1196065,1598036175285276672,2022-11-30,sethbannon,Trying out ChatGPT and wow https://t.co/TQ0t7B...,4,15,163
1196066,1598027429909778432,2022-11-30,minimaxir,ChatGPT thwarts my many attempts at malicious ...,0,22,186
1196067,1598024131978547200,2022-11-30,minimaxir,ChatGPT vs. riddles https://t.co/YuPYOD0U7o,2,2,18
1196068,1598015627540635648,2022-11-30,gdb,"Just launched ChatGPT, our new AI system which...",1,367,2361


In [31]:
tweets_with_topic.to_csv('tweets_with_topic.csv')

  and should_run_async(code)


In [41]:
topic_df = filtered_df[['date','topic']]
topic_df

  and should_run_async(code)


Unnamed: 0,date,topic
0,2023-03-29,1
1,2023-03-29,0
2,2023-03-29,1
3,2023-03-29,1
4,2023-03-29,0
...,...,...
1196065,2022-11-30,4
1196066,2022-11-30,0
1196067,2022-11-30,2
1196068,2022-11-30,1


In [45]:
tmp = topic_df.groupby(['date', 'topic']).size().reset_index(name='topic_count')

  and should_run_async(code)


In [46]:
tmp

  and should_run_async(code)


Unnamed: 0,date,topic,topic_count
0,2022-11-30,0,216
1,2022-11-30,1,54
2,2022-11-30,2,36
3,2022-11-30,3,18
4,2022-11-30,4,54
...,...,...,...
675,2023-04-14,0,5
676,2023-04-14,1,2
677,2023-04-14,2,2
678,2023-04-14,3,4


In [47]:
tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680 entries, 0 to 679
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         680 non-null    object
 1   topic        680 non-null    int64 
 2   topic_count  680 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 16.1+ KB


  and should_run_async(code)


In [52]:
tmp = tmp.astype({"topic": str,"date":'datetime64[ns]'}, errors='raise') 

  and should_run_async(code)


In [55]:
tmp['topic'] = tmp.topic.astype(str)

  and should_run_async(code)


In [59]:
tmp.to_csv('tmp.csv', index=False)

  and should_run_async(code)
