In [4]:
# Using spacy.load().
import spacy
nlp = spacy.load("en_docusco_spacy")

# Importing as module.
import en_docusco_spacy
nlp = en_docusco_spacy.load()

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import spacy
from tmtoolkit.corpus import Corpus, vocabulary_size, corpus_num_tokens
import re

def pre_process(txt):
    txt = re.sub(r'\bits\b', 'it s', txt)
    txt = re.sub(r'\bIts\b', 'It s', txt)
    txt = " ".join(txt.split())
    return(txt)

# Convert Corpus
## Basic information about the twitter_sentiment_data.csv

In [7]:
import pandas as pd

# Load only the first 1000 rows from the CSV file
df = pd.read_csv('twitter_sentiment_data.csv').head(100)

# Save the subset to a temporary CSV file
df.to_csv('twitter_sentiment_data_1000sample.csv', index=False)


In [8]:
corp = Corpus.from_tabular(
    'twitter_sentiment_data_1000sample.csv',
    id_column = "tweetid",
    text_column="message",  # Specify the column containing the preprocessed text
    spacy_instance=nlp,  # Use the spaCy instance for processing
    spacy_token_attrs=["tag", "ent_iob", "ent_type", "is_punct"]  # Token attributes to include
)

corpus_total = corpus_num_tokens(corp)
corpus_types = vocabulary_size(corp)
total_punct = []
for i in range(0,len(corp)):
    total_punct.append(sum(corp[i]['is_punct']))
total_punct = sum(total_punct)
non_punct = corpus_total - total_punct

In [9]:
print('Aphanumeric tokens:', non_punct, '\nPunctuation tokens:', total_punct, '\nTotal tokens:', corpus_total, '\nToken types:', corpus_types)

Aphanumeric tokens: 1855 
Punctuation tokens: 320 
Total tokens: 2175 
Token types: 681


## Trial 1.1 Frequency Table (with stopwords)

In [10]:
from docuscospacy import convert_corpus, frequency_table, tags_table, ngrams_by_token, ngrams_by_tag, coll_table, tags_dtm, normalize_dtm, dtm_to_coo, kwic_center_node, keyness_table, tag_ruler

In [11]:
tp = convert_corpus(corp)

wc = frequency_table(tp, non_punct)

* AF (Absolute Frequency): The total number of times the token appears in the corpus.
* RF (Relative Frequency): The frequency of the token adjusted for corpus size, typically expressed as occurrences per million words (or similar)。
* Range: The percentage of documents in the corpus where the token appears. For example, if the range is 77.05, the token climate appears in 77.05% of the documents.

In [12]:
wc.head(20).style.hide(axis='index').format(precision=2)

Token,Tag,AF,RF,Range
climate,NN1,79,42587.6,79.0
change,NN1,74,39892.18,74.0
rt,NP1,53,28571.43,53.0
the,AT,46,24797.84,43.0
to,TO,33,17789.76,31.0
global,JJ,25,13477.09,25.0
as,CSA,23,12398.92,23.0
beforetheflood,NP1,23,12398.92,22.0
here,RL,22,11859.84,22.0
is,VBZ,22,11859.84,20.0


## Trial 1.2 Frequency Table (without stopwords)

In [13]:
import pandas as pd
import spacy
from tmtoolkit.corpus import Corpus, vocabulary_size, corpus_num_tokens

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")
nlp.Defaults.stop_words.add("rt")  # Add "rt" to stopwords

# Load the CSV
csv_path = "twitter_sentiment_data.csv"
df = pd.read_csv(csv_path).head(1000)

# Preprocess the text
def pre_process(txt):
    doc = nlp(txt)
    # Filter out stopwords and punctuation
    filtered_tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(filtered_tokens)

# Apply the preprocessing to the text column
df['message'] = df['message'].apply(pre_process)

output_path = "twitter_sentiment_data_preprocessed_1000.csv"
df.to_csv(output_path, index=False)


In [14]:
corp = Corpus.from_tabular(
    "twitter_sentiment_data_preprocessed_1000.csv",  # Path to the preprocessed CSV
    id_column="tweetid",  # If you have an ID column
    text_column="message",  # Specify the preprocessed text column
    spacy_instance=nlp,  # Use your SpaCy instance
    spacy_token_attrs=["tag", "ent_iob", "ent_type", "is_punct"]  # Token attributes to include
)

corpus_total = corpus_num_tokens(corp)
corpus_types = vocabulary_size(corp)
total_punct = []
for i in range(0,len(corp)):
    total_punct.append(sum(corp[i]['is_punct']))
total_punct = sum(total_punct)
non_punct = corpus_total - total_punct


In [15]:
tp = convert_corpus(corp)

wc = frequency_table(tp, non_punct)

In [16]:
wc.head(30).style.hide(axis='index').format(precision=2)

Token,Tag,AF,RF,Range
climate,NN,806,68143.39,79.1
change,NN,751,63493.41,74.8
global,JJ,222,18769.02,21.9
warming,NN,200,16909.03,19.9
@leodicaprio,NNP,139,11751.78,13.9
world,NN,128,10821.78,12.8
beforetheflood,NN,124,10483.6,12.4
watch,VB,121,10229.96,12.1
httã¢â‚¬â,NNP,115,9722.69,11.5
right,UH,105,8877.24,10.5


## Trial 2 Ngram tables

### Trial 2.1 Node word "change" with a span of 3 (i.e. triagrams) as example:

In [17]:
nc = ngrams_by_token(tp, node_word='change', n_tokens=non_punct, node_position=1, span=3, search_type='fixed', count_by='pos')

In [18]:
nc.head(10).style.hide(axis='index').format(precision=2)

Token1,Token2,Token3,Tag1,Tag2,Tag3,AF,RF,Range
change,extending,winter,NN,VBG,NN,14,1183.63,1.4
change,way,covers,NN,NN,VBZ,10,845.45,1.0
change,daily,life,NN,JJ,NN,7,591.82,0.7
change,documentary,free,NN,NN,JJ,7,591.82,0.7
change,documentary,leonardo,NN,NN,NNP,7,591.82,0.7
change,parisagreement,crucial,NN,NNP,JJ,7,591.82,0.7
change,prevent,catastrophe,NN,NN,NN,7,591.82,0.7
change,buying,wall,NN,VBG,NN,6,507.27,0.6
change,join,post,NN,VBP,NN,6,507.27,0.6
change,mentions,cable,NN,VBZ,NN,6,507.27,0.6


### Trial 2.2 Can chose tokens ending with specific suffix to explore:
e.g. explore words end with "ing" with a span of 3

In [19]:
nc = ngrams_by_token(tp, node_word='ing', n_tokens=non_punct, node_position=2, span=3, search_type='ends_with', count_by='pos')

In [20]:
nc.head(10).style.hide(axis='index').format(precision=2)

Token1,Token2,Token3,Tag1,Tag2,Tag3,AF,RF,Range
change,extending,winter,NN,VBG,NN,14,1183.63,1.4
vortex,shifting,climate,NN,VBG,NN,14,1183.63,1.4
global,warming,chinese,JJ,NN,JJ,13,1099.09,1.3
global,warming,real,JJ,NN,JJ,11,930.0,1.1
@lifeaseva,taking,shorter,NNP,VBG,JJR,7,591.82,0.7
global,warming,main,JJ,NN,JJ,7,591.82,0.7
showers,buying,reusable,NNS,VBG,JJ,7,591.82,0.7
change,buying,wall,NN,VBG,NN,6,507.27,0.6
beaches,battering,scientists,NNS,VBG,NNS,5,422.73,0.5
global,warming,climate,JJ,NN,NN,5,422.73,0.5


### Trial 2.3 N-gram table with specific category(collect n-grams using the ngrams_by_tag function)
* Tried "positive", "negative","academic terms","ConfidenceHigh", "Character", but no result showed. Need to find out why?

In [38]:
nc = ngrams_by_tag(tp, tag='ConfidenceHigh', n_tokens=non_punct, tag_position=3, span=3, search_type='fixed', count_by='ds')

Your n-gram search did not return any results.


In [39]:
nc.head(10).style.hide(axis='index').format(precision=2)

AttributeError: 'NoneType' object has no attribute 'head'

### Trial 3.1 Collocations
e.g. filter the collocation with the word "change" only when it's tagged as noun

In [33]:
ct = coll_table(tp, 'change', node_tag='NN', statistic='pmi', count_by='pos')

In [34]:
ct.head(20).style.hide(axis='index').format(precisi on=2)

Token,Tag,Freq Span,Freq Total,MI
alarmism,NN,2,1,4.86
dumb,JJ,2,1,4.86
eachother,NN,2,1,4.86
find,VBP,2,1,4.86
pledges,VBZ,2,1,4.86
stop,NN,2,1,4.86
strict,JJ,2,1,4.86
believe,NN,3,2,4.44
citizens,NNS,3,2,4.44
convince,VBP,3,2,4.44


### Trial 3.2 Filter collocations with frequency and MI
e.g. total frequency > 5 and MI > 3:

In [35]:
ct.query('`Freq Total` > 5 and MI > 3 and Tag.str.startswith("V")').head(10).style.hide(axis='index').format(precision=2)

Token,Tag,Freq Span,Freq Total,MI
covered,VBN,11,11,3.86
covers,VBZ,10,10,3.86
explore,VB,7,7,3.86
extending,VBG,14,14,3.86
fight,VBP,7,7,3.86
finds,VBZ,13,13,3.86
join,VBP,6,6,3.86
linked,VBD,9,9,3.86
protect,VB,8,8,3.86
shifting,VBG,14,14,3.86


### Trial 3.3 calculate collocations while ignoring tags completely by setting tag_ignore to ‘True’:

In [37]:
ct = coll_table(tp, 'change', tag_ignore=True, statistic='npmi')
ct.head(10).style.hide(axis='index').format(precision=2)

Token,Freq Span,Freq Total,MI
climate,795,811,1.0
tackle,105,105,0.57
travels,104,104,0.57
world,120,131,0.56
https://t.co/lkdehj3tnn,102,102,0.56
httã¢â‚¬â,109,115,0.56
exit,50,50,0.49
pact,49,49,0.49
plan,53,59,0.47
trump,74,106,0.45
