## RBCWealth

In [1]:
import pandas as pd

# Load the RBC Wealth tweet export and pull the raw tweet text into a plain list.
rbc_csv_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/RBCwealth_since_2018_01_01.csv'
tweets_rbc = pd.read_csv(rbc_csv_path)
texts_rbc = tweets_rbc['Tweet'].to_list()

In [2]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')

#Define cleaning
def clean_text(text):
    """Normalise one raw tweet into a space-separated string of content tokens.

    Steps, in order: coerce to ``str``, lower-case, strip digit runs, strip
    ``http...`` URLs, strip punctuation, tokenize with NLTK, and drop English
    stop words.

    Parameters
    ----------
    text : Any
        Raw tweet. Non-string values are coerced with ``str()``. ``None`` and
        float NaN (a missing value) yield an empty string instead of being
        stringified into the junk tokens ``"none"`` / ``"nan"``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # convert to lower case
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation marks

    # Build the stop-word set once per call; the previous version re-evaluated
    # stopwords.words('english') and scanned the list for every single token.
    english_stop_words = set(stopwords.words('english'))
    tokens = [word for word in word_tokenize(text) if word not in english_stop_words]
    return ' '.join(tokens)

# Run every RBC tweet through the cleaning pipeline, preserving order.
clean_texts_rbc = list(map(clean_text, texts_rbc))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/houhiroshisakai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/houhiroshisakai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Latent Dirichlet Allocation
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer: bag-of-words counts, dropping terms that occur in more than
# 95% of tweets or in fewer than 2 tweets, plus scikit-learn's English stop words.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_rbc = vectorizer.fit_transform(clean_texts_rbc)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_rbc = LatentDirichletAllocation(n_components=10, random_state=42)
lda_rbc.fit(X_rbc)

print(lda_rbc.components_)

[[3.09989678 0.1        1.09999893 ... 0.1000005  0.10002835 0.1       ]
 [0.10001541 0.10000643 0.1        ... 0.10002092 0.1        0.1       ]
 [0.1        1.09994308 0.1        ... 0.10000588 0.1        0.1       ]
 ...
 [1.09999845 2.74435988 1.10000107 ... 0.1        0.1        0.1       ]
 [2.01888095 0.1        0.1        ... 2.0999727  0.10000505 2.1       ]
 [1.10000726 2.10000735 0.1        ... 0.1        1.56236583 0.1       ]]


In [4]:
# Vocabulary of the shared `vectorizer` as last fitted (on the RBC tweets when
# cells run in order); run this before the vectorizer is re-fitted for another bank.
feature_names_rbc = vectorizer.get_feature_names_out()

In [5]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-term weights.
    feature_names : sequence
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to list per topic, in descending weight order.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_idx] for term_idx in ranked]
        print(" ".join(top_terms))

# Show the ten highest-weighted terms of each RBC topic.
print_top_words(lda_rbc, feature_names_rbc, n_top_words=10)

Topic #0:
global learn insight latest read investors weekly markets economic market
Topic #1:
rbc learn read support wealth jersey art mental partner delighted
Topic #2:
learn wealth business financial read family rbc women make management
Topic #3:
learn watch help wealthy barber financial chilton david executor women
Topic #4:
learn financial plan help impact make read aging retirement time
Topic #5:
rbc read wealth management support learn work ceo young thank
Topic #6:
learn family market global investors income investing fixed rbc despite
Topic #7:
learn wealth plan retirement planning consider read health financial care
Topic #8:
learn rbc read impact look partnership support signs wealth career
Topic #9:
investing learn read rbc women impacted start health wealth helped


In [6]:
# Collect the ten highest-weighted vocabulary terms of every RBC topic.
top_words_per_topic = [
    [feature_names_rbc[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_rbc.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_rbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_rbc = df_top_words_rbc.T

# Save the DataFrame to an Excel file
#excel_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ResultIntegration.xlsx'
#df_top_words_transposed_rbc.to_excel(excel_path, index=False)

In [7]:
# Alias (not a copy) of the transposed RBC topic table, used by the Excel export cell.
df_rbc_prefixed = df_top_words_transposed_rbc

## CIBC

In [8]:
# Load the CIBC Wealth tweet export.
cibc_csv_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/CIBCwealth_since_2018_01_01.csv'
tweets_cibc = pd.read_csv(cibc_csv_path)

In [9]:
# Raw CIBC tweet text as a plain Python list.
texts_cibc = tweets_cibc['Tweet'].to_list()

In [10]:
# Run every CIBC tweet through the cleaning pipeline, preserving order.
clean_texts_cibc = list(map(clean_text, texts_cibc))

In [11]:
# CountVectorizer: re-fit the shared vectorizer on the CIBC tweets
# (this replaces the vocabulary learned for the previous bank).
X_cibc = vectorizer.fit_transform(clean_texts_cibc)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_cibc = LatentDirichletAllocation(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so topics_cibc is the same
# object as lda_cibc, not a document-topic matrix.
topics_cibc = lda_cibc.fit(X_cibc)

print(lda_cibc.components_)

[[0.1        2.10000407 0.1        ... 1.09998658 0.1        0.1000386 ]
 [1.10000231 0.1        0.1        ... 1.18906372 1.09999003 0.1       ]
 [1.09999769 0.1        0.1        ... 0.1        0.1        0.10003843]
 ...
 [0.1        1.09999593 0.1        ... 0.10003323 3.60307106 0.1       ]
 [0.1        0.1        3.10000417 ... 4.10006921 2.09994143 0.1       ]
 [0.1        0.1        4.09999071 ... 0.1000185  9.59698151 0.1001207 ]]


In [12]:
# Vocabulary of the shared `vectorizer` as last fitted (on the CIBC tweets when
# cells run in order); run this before the vectorizer is re-fitted for another bank.
feature_names_cibc = vectorizer.get_feature_names_out()

In [13]:
# Show the ten highest-weighted terms of each CIBC topic.
print_top_words(lda_cibc, feature_names_cibc, n_top_words=10)

Topic #0:
reserve federal tax global recession interestrates investment perspectives cibcs economy
Topic #1:
says chief interestrates markets durantaye la luc higher cibcs rate
Topic #2:
register join tax pm et family hear experts cibcfamilyoffice cibc
Topic #3:
bank rate canada rates today economy cibc help inflation learn
Topic #4:
inflation cibcs oil tal market dyk housing benjamin investors canadian
Topic #5:
week discuss available watch investment roundup weekly economic markets reading
Topic #6:
day happy wishing tax ones loved celebrate today time investing
Topic #7:
canadians investors know golombek jamie canada like end cibcs shares
Topic #8:
cibc jamie month investing team year golombek cibcfamilyoffice community contributions
Topic #9:
wealth wood cibc gundy advisors clients women advisor experience industry


In [14]:
# Collect the ten highest-weighted vocabulary terms of every CIBC topic.
top_words_per_topic = [
    [feature_names_cibc[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_cibc.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_cibc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_cibc = df_top_words_cibc.T

# Alias used by the Excel export cell.
df_cibc_prefixed = df_top_words_transposed_cibc

In [15]:
# Display the CIBC topic table: one column per topic, rows ordered by descending term weight.
df_cibc_prefixed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,reserve,says,register,bank,inflation,week,day,canadians,cibc,wealth
1,federal,chief,join,rate,cibcs,discuss,happy,investors,jamie,wood
2,tax,interestrates,tax,canada,oil,available,wishing,know,month,cibc
3,global,markets,pm,rates,tal,watch,tax,golombek,investing,gundy
4,recession,durantaye,et,today,market,investment,ones,jamie,team,advisors
5,interestrates,la,family,economy,dyk,roundup,loved,canada,year,clients
6,investment,luc,hear,cibc,housing,weekly,celebrate,like,golombek,women
7,perspectives,higher,experts,help,benjamin,economic,today,end,cibcfamilyoffice,advisor
8,cibcs,cibcs,cibcfamilyoffice,inflation,investors,markets,time,cibcs,community,experience
9,economy,rate,cibc,learn,canadian,reading,investing,shares,contributions,industry


In [16]:
# Start from an empty DataFrame for the combined per-bank topic tables.
df_final = pd.DataFrame()

In [17]:
# RBC block: a header row whose only filled cell is "RBC", followed by its topic table.
df_rbc_with_name = pd.DataFrame([['RBC']], columns=[0])  # Only a cell includes "RBC"
df_rbc_with_topics = pd.concat([df_rbc_with_name, df_rbc_prefixed], ignore_index=True)

# CIBC block, built the same way.
df_cibc_with_name = pd.DataFrame([['CIBC']], columns=[0])  # Only one cell contains "CIBC"
df_cibc_with_topics = pd.concat([df_cibc_with_name, df_cibc_prefixed], ignore_index=True)

excel_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ResultIntegration.xlsx'

# Assemble the result in a single concat instead of appending to whatever
# df_final currently holds: re-running this cell then no longer duplicates rows,
# and no empty frame takes part in the concatenation.
df_final = pd.concat([df_rbc_with_topics, df_cibc_with_topics], ignore_index=True)
df_final.to_excel(excel_path, sheet_name='Bank_Topics', index=False)

## BMO

In [18]:
# Load the BMO tweet export and pull the raw tweet text into a plain list.
bmo_csv_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BMO_since_2018_01_01.csv'
tweets_bmo = pd.read_csv(bmo_csv_path)
texts_bmo = tweets_bmo['Tweet'].to_list()

In [19]:
# Run every BMO tweet through the cleaning pipeline, preserving order.
clean_texts_bmo = list(map(clean_text, texts_bmo))

In [20]:
# CountVectorizer: re-fit the shared vectorizer on the BMO tweets
# (this replaces the vocabulary learned for the previous bank).
X_bmo = vectorizer.fit_transform(clean_texts_bmo)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_bmo = LatentDirichletAllocation(n_components=10, random_state=42)
topics_bmo = lda_bmo.fit_transform(X_bmo)

print(lda_bmo.components_)

[[ 0.1         0.1         0.10000352 ...  0.10017287  0.10005787
   0.10001447]
 [ 0.10001566  0.10000315  1.0882488  ...  0.1         0.10000057
   0.1       ]
 [ 0.1000122   9.78425734 10.90067421 ...  2.09978383  0.10000062
   0.1       ]
 ...
 [ 0.10000465  0.10001246  3.47368833 ...  0.1         0.1
   0.1       ]
 [ 0.1         0.1         8.61447523 ...  0.1         0.1
   0.1       ]
 [ 0.1         0.10000091  0.10000057 ...  0.10003046  0.1
   3.09997869]]


In [21]:
# Vocabulary of the shared `vectorizer` as last fitted (on the BMO tweets when
# cells run in order); run this before the vectorizer is re-fitted for another bank.
feature_names_bmo = vectorizer.get_feature_names_out()

In [22]:
# Show the ten highest-weighted terms of each BMO topic.
print_top_words(lda_bmo, feature_names_bmo, n_top_words=10)

Topic #0:
bmo wethenorth northovereverything game proud support bmogrowthegood today day learn
Topic #1:
dm apologize wait hi send need feel times free questions
Topic #2:
working wait apologize times possible thank soon patience calls high
Topic #3:
thank feedback dm experience help hello thanks time sharing nb
Topic #4:
bmo card account information credit visit branch hi mastercard debit
Topic #5:
nc know let great thanks investsmart thank youre help welcome
Topic #6:
banking online dm hi app youre mobile sorry learn hello
Topic #7:
message send private sorry learn im hi assist like phone
Topic #8:
thank dm send hi sorry assist help look hello reaching
Topic #9:
bmo nous vous business join et bmoforwomen financial investing episode


In [23]:
# Collect the ten highest-weighted vocabulary terms of every BMO topic.
top_words_per_topic = [
    [feature_names_bmo[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_bmo.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_bmo = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bmo = df_top_words_bmo.T

# Alias of the transposed table, following the other banks' naming.
df_bmo_prefixed = df_top_words_transposed_bmo

## Scotiabank 

In [24]:
# Load the Scotiabank tweet export, extract the tweet text and clean it.
tweets_scotia = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/scotiabank_since_2018_01_01.csv')

texts_scotia = tweets_scotia['Tweet'].tolist()

clean_texts_scotia = [clean_text(text) for text in texts_scotia]

# CountVectorizer: fresh bag-of-words vectorizer with the same settings as before
# (drop terms in >95% of tweets or in <2 tweets, plus English stop words).
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_scotia = vectorizer.fit_transform(clean_texts_scotia)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_scotia = LatentDirichletAllocation(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so topics_scotia is the same
# object as lda_scotia, not a document-topic matrix.
topics_scotia = lda_scotia.fit(X_scotia)

print(lda_scotia.components_)


[[ 1.10000015  1.09999119 13.04354861 ...  0.1         0.1
   0.10000067]
 [ 2.09989139  0.1         0.1        ...  0.1         0.1
   0.1       ]
 [ 0.1         0.1         0.10000927 ...  1.09995578  0.10001578
   0.1       ]
 ...
 [ 0.1         1.09996378  0.1        ...  1.10000215  0.1
   0.1       ]
 [ 0.1         0.10002873  1.15641712 ...  0.1         0.1
   0.1       ]
 [ 0.10010056  1.09999444  0.1        ...  1.09994736  0.1
   0.1       ]]


In [25]:
# Vocabulary of `vectorizer` as last fitted (on the Scotiabank tweets when cells
# run in order); run this before the vectorizer is re-fitted for another bank.
feature_names_scotia = vectorizer.get_feature_names_out()

In [26]:
# Show the ten highest-weighted terms of each Scotiabank topic.
print_top_words(lda_scotia, feature_names_scotia, n_top_words=10)

Topic #0:
hockey rate scotiabank inflation make inclusive work home accessible game
Topic #1:
listen follow podcasts spotify apple scene rate canada points scotiabank
Topic #2:
canada credit canadians chief rate scotiabanks inflation investment latest rates
Topic #3:
year account blackhistorymonth blackvoices story women savings celebrate black business
Topic #4:
hockey canada financial hockeyforall day help like jersey money new
Topic #5:
nous la scotiabank et scotia sa gatineau célébrons vous autochtones
Topic #6:
la les et des le taux que du en pour
Topic #7:
scotiabank ont little depuis pas canada pour ne new employees
Topic #8:
latest episode scotiabank podcast canadas help canada read new canadian
Topic #9:
financial holiday advisor scotia goals season plan new learn retirement


In [27]:
# Collect the ten highest-weighted vocabulary terms of every Scotiabank topic.
top_words_per_topic = [
    [feature_names_scotia[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_scotia.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_scotia = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_scotia = df_top_words_scotia.T

# Alias of the transposed table, following the other banks' naming.
df_scotia_prefixed = df_top_words_transposed_scotia

## TD_US_News

In [28]:
# Load the TD US News tweet export, extract the tweet text and clean it.
tweets_tdus = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDNews_US_since_2018_01_01.csv')

texts_tdus = tweets_tdus['Tweet'].tolist()

clean_texts_tdus = [clean_text(text) for text in texts_tdus]

# CountVectorizer: re-fit the shared vectorizer on the TD US News tweets
# (this replaces the vocabulary learned for the previous bank).
X_tdus = vectorizer.fit_transform(clean_texts_tdus)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_tdus = LatentDirichletAllocation(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so topics_tdus is the same
# object as lda_tdus, not a document-topic matrix.
topics_tdus = lda_tdus.fit(X_tdus)

print(lda_tdus.components_)

[[ 0.1         0.1         0.1        ...  2.50906681  0.1
   0.10000743]
 [ 0.1         1.09999135  0.10005518 ...  0.798476    0.1
   0.1       ]
 [ 0.1         0.10001627  0.1        ...  4.5319798   1.09999346
   0.1       ]
 ...
 [ 1.10000038  0.1         0.1        ...  0.1         1.10000406
   0.1       ]
 [ 0.1         0.1         0.1        ... 11.11640123  0.1
   0.10000833]
 [ 0.1         0.1         1.09994031 ...  1.63148437  0.1
   3.94859763]]


In [29]:
# Vocabulary of the shared `vectorizer` as last fitted (on the TD US News tweets
# when cells run in order).
feature_names_tdus = vectorizer.get_feature_names_out()

# Show the ten highest-weighted terms of each TD US News topic.
print_top_words(lda_tdus, feature_names_tdus, n_top_words=10)

Topic #0:
wealth financial tdbank_us home td investment strategist equity head chief
Topic #1:
tdbank_us credit head lending td learn day home card important
Topic #2:
td community bank learn new housing business million ready lgbtq
Topic #3:
td learn tds year tdbank_us read inclusion disability colleagues proud
Topic #4:
td support learn communities local community provide help organizations efforts
Topic #5:
read tdbank_us covid shares time tds new learn journey officer
Topic #6:
check tips moneymattersmonday make help business budget new learn year
Topic #7:
td ceo president customers learn tdbank_us braca greg tdbank_uss ppp
Topic #8:
survey money tdbank_us love according results data relationship check couples
Topic #9:
td financial business tdbank_us bank owners survey finance best learn


In [30]:
# Collect the ten highest-weighted vocabulary terms of every TD US News topic.
top_words_per_topic = [
    [feature_names_tdus[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_tdus.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_tdus = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdus = df_top_words_tdus.T

# Alias of the transposed table, following the other banks' naming.
df_tdus_prefixed = df_top_words_transposed_tdus

## TD US

In [31]:
# Load the TD Bank US tweet export and pull the raw tweet text into a plain list.
tdus1_csv_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDBank_US_since_2018_01_01.csv'
tweets_tdus1 = pd.read_csv(tdus1_csv_path)
texts_tdus1 = tweets_tdus1['Tweet'].to_list()

In [32]:
# Run every TD Bank US tweet through the cleaning pipeline, preserving order.
clean_texts_tdus1 = list(map(clean_text, texts_tdus1))

In [33]:
# CountVectorizer: re-fit the shared vectorizer on the TD Bank US tweets
# (this replaces the vocabulary learned for the previous bank).
X_tdus1 = vectorizer.fit_transform(clean_texts_tdus1)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_tdus1 = LatentDirichletAllocation(n_components=10, random_state=42)
topics_tdus1 = lda_tdus1.fit_transform(X_tdus1)

print(lda_tdus1.components_)

[[1.00000000e-01 1.00000000e-01 5.14249111e+00 ... 1.00000001e-01
  1.00000000e-01 1.00000000e-01]
 [1.00000000e-01 1.00000000e-01 9.82422869e+01 ... 1.00000001e-01
  1.00010916e-01 1.00010916e-01]
 [1.00000000e-01 1.00000000e-01 5.91359643e+02 ... 1.00000001e-01
  2.09989325e+00 2.09989325e+00]
 ...
 [1.00000000e-01 1.00000000e-01 1.67483334e-01 ... 1.00000000e-01
  1.00000000e-01 1.00000000e-01]
 [1.00000000e-01 1.00000000e-01 1.00011753e-01 ... 1.00024404e-01
  1.00048297e-01 1.00048297e-01]
 [1.00000000e-01 1.00000000e-01 3.53510168e+02 ... 3.09989080e+00
  1.00000000e-01 1.00000000e-01]]


In [34]:
# Vocabulary of the shared `vectorizer` as last fitted (on the TD Bank US tweets
# when cells run in order).
feature_names_tdus1 = vectorizer.get_feature_names_out()

# Show the ten highest-weighted terms of each TD Bank US topic.
print_top_words(lda_tdus1, feature_names_tdus1, n_top_words=10)

Topic #0:
apologize online issues inconvenience working banking app experiencing issue sorry
Topic #1:
appreciate patience customers time working possible td thank feedback thanks
Topic #2:
dm noaccts like send best thank lf details assist good
Topic #3:
td card bank hope store visit information debit hi hey
Topic #4:
free feel dm noaccts tw happy send noacct great know
Topic #5:
dm account send numbers hi like chat concerns thanks hey
Topic #6:
ka hey concerns happy know fees questions account address understand
Topic #7:
account dm send numbers good details hear morning like ask
Topic #8:
dm accts help hi lw sorry plz hold times assist
Topic #9:
dm like send account details feel numbers saw learn tweet


In [35]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_tdus1.components_):
    # Indices of the ten largest weights, in descending order.
    top_features_ind = topic.argsort()[:-11:-1]
    # Look the indices up in the TD Bank US vocabulary. This previously used
    # feature_names_bmo, which maps the indices to unrelated BMO terms (or raises
    # IndexError when the BMO vocabulary is smaller).
    top_features = [feature_names_tdus1[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_tdus1 = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdus1 = df_top_words_tdus1.T

df_tdus1_prefixed = df_top_words_transposed_tdus1

## TD_Canada_News

In [36]:
# Load the TD News Canada tweet export and pull the raw tweet text into a plain list.
tdcanada_csv_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDNews_Canada_since_2018_01_01.csv'
tweets_tdcanada = pd.read_csv(tdcanada_csv_path)
texts_tdcanada = tweets_tdcanada['Tweet'].to_list()

In [37]:
# Run every TD News Canada tweet through the cleaning pipeline, preserving order.
clean_texts_tdcanada = list(map(clean_text, texts_tdcanada))

In [38]:
# CountVectorizer: re-fit the shared vectorizer on the TD News Canada tweets
# (this replaces the vocabulary learned for the previous bank).
X_tdcanada = vectorizer.fit_transform(clean_texts_tdcanada)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_tdcanada = LatentDirichletAllocation(n_components=10, random_state=42)
topics = lda_tdcanada.fit_transform(X_tdcanada)

print(lda_tdcanada.components_)

[[0.1        2.1000061  0.1        ... 2.10001324 0.10005523 0.1       ]
 [0.10000191 0.1        1.09999694 ... 0.1        1.09990892 0.1       ]
 [0.1        0.1        0.1        ... 1.09999186 0.1        0.1       ]
 ...
 [0.1        0.1        0.1        ... 0.1        1.27352652 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.10000702 0.10000446]
 [0.1        0.1        0.1        ... 0.1        0.1        2.10000449]]


In [39]:
# Vocabulary of the shared `vectorizer` as last fitted (on the TD News Canada
# tweets when cells run in order).
feature_names_tdcanada = vectorizer.get_feature_names_out()

# Show the ten highest-weighted terms of each TD News Canada topic.
print_top_words(lda_tdcanada, feature_names_tdcanada, n_top_words=10)

Topic #0:
td digital tds customers experiences canada new help customer learn
Topic #1:
td news banking heres indigenous weve bank financial shares digital
Topic #2:
td covid help customers ceo bharat masrani learn canadians pandemic
Topic #3:
fraud financial help black learn money women tips td work
Topic #4:
financial td tds new help canadians global canada canadian protect
Topic #5:
td report read new annual economics innovation canadians proud group
Topic #6:
help financial mortgage season tips holiday ways canadians retirement home
Topic #7:
td help heres canadian support intelligence insurance tips make money
Topic #8:
brucecooper_td tdassetmanagement weeks podcast marketperspectives canada growth markets market economy
Topic #9:
td kids financial customers check tds sure changing heres parents


In [40]:
# Collect the ten highest-weighted vocabulary terms of every TD News Canada topic.
top_words_per_topic = [
    [feature_names_tdcanada[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_tdcanada.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_tdcanada = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdcanada = df_top_words_tdcanada.T

# Alias of the transposed table, following the other banks' naming.
df_tdcanada_prefixed = df_top_words_transposed_tdcanada

## TD Canada

In [41]:
# NOTE(review): this is the same CSV that the preceding TD Canada news section
# loads, so both sections currently model identical tweets -- confirm which
# export file the "TD Canada" account should use.
tweets_tdcanada1 = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDNews_Canada_since_2018_01_01.csv')

texts_tdcanada1 = tweets_tdcanada1['Tweet'].tolist()

# Clean the tweets loaded in this cell. This previously iterated over
# texts_tdcanada, i.e. the other section's list, silently ignoring texts_tdcanada1.
clean_texts_tdcanada1 = [clean_text(text) for text in texts_tdcanada1]

# CountVectorizer: re-fit the shared vectorizer on these tweets
# (this replaces the vocabulary learned for the previous bank).
X_tdcanada1 = vectorizer.fit_transform(clean_texts_tdcanada1)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_tdcanada1 = LatentDirichletAllocation(n_components=10, random_state=42)
topics_tdcanada1 = lda_tdcanada1.fit_transform(X_tdcanada1)

print(lda_tdcanada1.components_)

[[0.1        1.09987438 0.1        ... 0.1        4.87673378 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        1.10000142]
 [0.1        0.1        0.1        ... 1.09999922 0.71148895 0.1       ]
 ...
 [1.09999936 0.10010723 0.1        ... 2.10000397 0.1000092  1.10000117]
 [0.1        0.1        0.1        ... 0.1        1.09999901 0.1       ]
 [1.10000063 0.1        0.1        ... 0.1        0.1        0.1       ]]


In [42]:
# Vocabulary of the shared `vectorizer` as last fitted (for the TD Canada
# section when cells run in order).
feature_names_tdcanada1 = vectorizer.get_feature_names_out()

# Show the ten highest-weighted terms of each TD Canada topic.
print_top_words(lda_tdcanada1, feature_names_tdcanada1, n_top_words=10)

Topic #0:
brucecooper_td tdassetmanagement weeks podcast marketperspectives digital explains td tds markets
Topic #1:
td ceo bharat masrani tds group financial bank new canadian
Topic #2:
financial canadians td report advice money finances ai online dyk
Topic #3:
mortgage td home retirement financial canadians covid tips help common
Topic #4:
td help learn customers read support financial canada communities colleagues
Topic #5:
td community economic canada lgbtq tds covid business economist read
Topic #6:
td help protect loved financial canadians ones fraud insurance learn
Topic #7:
td help new heres financial ways dont canadians weve future
Topic #8:
customers customer td new experiences experience black innovation inclusive digital
Topic #9:
td tds season tdnewsroom heres year canadian news holiday business


In [43]:
# Collect the ten highest-weighted vocabulary terms of every TD Canada topic.
top_words_per_topic = [
    [feature_names_tdcanada1[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_tdcanada1.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_tdcanada1 = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdcanada1 = df_top_words_tdcanada1.T

# Alias of the transposed table, following the other banks' naming.
df_tdcanada1_prefixed = df_top_words_transposed_tdcanada1

# US Company

## Morgan Stanley

In [44]:
# Load the Morgan Stanley tweet export, extract the tweet text and clean it.
tweets_ms = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/MorganStanley_since_2018_01_01.csv')

texts_ms = tweets_ms['Tweet'].tolist()

clean_texts_ms = [clean_text(text) for text in texts_ms]

# CountVectorizer: re-fit the shared vectorizer on the Morgan Stanley tweets
# (this replaces the vocabulary learned for the previous bank).
X_ms = vectorizer.fit_transform(clean_texts_ms)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_ms = LatentDirichletAllocation(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so topics_ms is the same
# object as lda_ms, not a document-topic matrix.
topics_ms = lda_ms.fit(X_ms)

print(lda_ms.components_)

[[0.1        0.1        0.10000169 ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        2.09998567 ... 0.1        2.09989402 0.1       ]
 [0.10005864 4.09994799 0.1        ... 2.1        0.1        0.10000675]
 [0.1        0.10003885 0.1        ... 0.1        0.1        0.10000966]]


In [45]:
# Vocabulary of the shared `vectorizer` as last fitted (on the Morgan Stanley
# tweets when cells run in order), then the ten top terms of each topic.
feature_names_ms = vectorizer.get_feature_names_out()
print_top_words(lda_ms, feature_names_ms, n_top_words=10)

Topic #0:
markets chief investors global market equity read investment episode policy
Topic #1:
morgan stanley health mental msgivesback children head childrens management support
Topic #2:
help learn morgan market stanley sichallenge fund make money financial
Topic #3:
ms earnings net billion management multicultural investment revenues year innovation
Topic #4:
eagleup wealth lisa shalett morgan management investment year eagles booktrustusa
Topic #5:
investing sustainable morgan stanley change investors learn climate sustainability new
Topic #6:
morgan podcast year opportunity new access episode carlaannharris team stanley
Topic #7:
morgan new stanley asia firm annual opportunities summit pride lgbt
Topic #8:
morgan stanley learn msgivesback employees women program day career london
Topic #9:
morgan stanley learn advisors veterans named clients congratulations make wealth


In [46]:
# Collect the ten highest-weighted vocabulary terms of every Morgan Stanley topic.
top_words_per_topic = [
    [feature_names_ms[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_ms.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_ms = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_ms = df_top_words_ms.T

# Alias of the transposed table, following the other banks' naming.
df_ms_prefixed = df_top_words_transposed_ms

## UBS

In [47]:
# Load the UBS tweet export, extract the tweet text and clean it.
tweets_ubs = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/UBS_since_2018_01_01.csv')

texts_ubs = tweets_ubs['Tweet'].tolist()

clean_texts_ubs = [clean_text(text) for text in texts_ubs]

# CountVectorizer: re-fit the shared vectorizer on the UBS tweets
# (this replaces the vocabulary learned for the previous bank).
X_ubs = vectorizer.fit_transform(clean_texts_ubs)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_ubs = LatentDirichletAllocation(n_components=10, random_state=42)
topics_ubs = lda_ubs.fit_transform(X_ubs)

print(lda_ubs.components_)

[[ 0.1         0.1         0.1        ...  0.1         0.1
  19.47331979]
 [ 0.1         0.1         0.1        ...  0.1         0.1
   2.21210432]
 [ 0.1         0.10002408  0.10006523 ...  0.1         1.09984244
   0.10000936]
 ...
 [ 0.10001673  2.09996231  0.10000357 ...  0.1         0.1
   0.1       ]
 [ 0.1         0.1         0.1        ...  0.10000353  0.1
   3.08937569]
 [ 0.1         0.1         0.1        ...  2.09999647  0.10015356
   1.92865036]]


In [48]:
# Vocabulary of the shared `vectorizer` as last fitted (on the UBS tweets when
# cells run in order), then the ten top terms of each topic.
feature_names_ubs = vectorizer.get_feature_names_out()
print_top_words(lda_ubs, feature_names_ubs, n_top_words=10)

Topic #0:
shareubs togetherband thetogetherband iwd sdg support awareness help goal choosetochallenge
Topic #1:
shareubs ubs chief economist paul donovan global gwm year video
Topic #2:
shareubs wef paper learn white world read sdg education ubs
Topic #3:
women shareubs financial ownyourworth ubss money learn decisions finances watch
Topic #4:
results business ubs ceo shareubs group ermotti sergio owners quarter
Topic #5:
shareubs ubs finance impact sustainability future china best bank ubsgcc
Topic #6:
shareubs ubsresearch investors report latest global market explore new ubs
Topic #7:
shareubs ubs head ubsconf china conference global ubsevidencelab watch new
Topic #8:
shareubs ubs join watch episode trending learn conversation tune today
Topic #9:
shareubs live results starts stay tomorrow presentation sustainable tuned cest


In [49]:
# Collect the ten highest-weighted vocabulary terms of every UBS topic.
top_words_per_topic = [
    [feature_names_ubs[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_ubs.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_ubs = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_ubs = df_top_words_ubs.T

# Alias of the transposed table, following the other banks' naming.
df_ubs_prefixed = df_top_words_transposed_ubs

## Citi

In [50]:
# Load the Citi tweet export, extract the tweet text and clean it.
tweets_citi = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/Citi_since_2018_01_01.csv')

texts_citi = tweets_citi['Tweet'].tolist()

clean_texts_citi = [clean_text(text) for text in texts_citi]

# CountVectorizer: re-fit the shared vectorizer on the Citi tweets
# (this replaces the vocabulary learned for the previous bank).
X_citi = vectorizer.fit_transform(clean_texts_citi)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_citi = LatentDirichletAllocation(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so topics_citi is the same
# object as lda_citi, not a document-topic matrix.
topics_citi = lda_citi.fit(X_citi)

print(lda_citi.components_)

[[0.1        3.09998441 1.37222363 ... 0.1        0.10002176 0.1       ]
 [0.1        0.1        1.82764451 ... 1.09994834 0.1        0.10001872]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.10009384 1.10000406 ... 0.1        0.10012225 0.10000249]
 [0.10006412 0.10005269 0.10005344 ... 0.10004853 0.10008005 0.1       ]
 [0.1        0.1        0.10000367 ... 1.09994772 0.1        2.09997068]]


In [51]:
# Vocabulary of the shared `vectorizer` as last fitted (on the Citi tweets when
# cells run in order).
feature_names_citi = vectorizer.get_feature_names_out()

# Show the ten highest-weighted terms of each Citi topic.
print_top_words(lda_citi, feature_names_citi, n_top_words=10)

Topic #0:
citi bank global best private day head citivolunteers awards services
Topic #1:
citi learn clients treasury new digital citis payments solutions help
Topic #2:
citi teamciti citis athletes tomorrow news et results stareatgreatness disabilities
Topic #3:
ceo citi jane fraser global information year financial discusses citis
Topic #4:
citi learn pathwaysprogress today proud youth support foundation social impact
Topic #5:
citi digital home money th join citidigimoney proud forward people
Topic #6:
ceo corbat mike watch citis citi global growth live tune
Topic #7:
citi learn help communities support housing proud racial cities foundation
Topic #8:
citi standforprogress women citis diversity proud learn community support colleagues
Topic #9:
report read citi new gps latest payments opportunities global supply


In [52]:
# Collect the ten highest-weighted vocabulary terms of every Citi topic.
top_words_per_topic = [
    [feature_names_citi[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_citi.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_citi = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_citi = df_top_words_citi.T

# Alias of the transposed table, following the other banks' naming.
df_citi_prefixed = df_top_words_transposed_citi

## Wells Fargo

In [53]:
# Load the Wells Fargo tweet export, extract the tweet text and clean it.
tweets_wf = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/WellsFargo_since_2018_01_01.csv')

texts_wf = tweets_wf['Tweet'].tolist()

clean_texts_wf = [clean_text(text) for text in texts_wf]

# CountVectorizer: re-fit the shared vectorizer on the Wells Fargo tweets
# (this replaces the vocabulary learned for the previous bank).
X_wf = vectorizer.fit_transform(clean_texts_wf)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_wf = LatentDirichletAllocation(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so topics_wf is the same
# object as lda_wf, not a document-topic matrix.
topics_wf = lda_wf.fit(X_wf)

print(lda_wf.components_)

[[ 0.1         0.1         6.09986492 ...  0.16606249  0.10001067
   0.10000342]
 [ 7.09983689  0.1         0.10007202 ...  0.10000016  0.10000058
   0.10000427]
 [ 0.1         0.1         0.1        ...  0.1000047   0.10000349
   0.100015  ]
 ...
 [ 0.1         0.1         0.1        ...  0.10000059  0.1
   0.10001733]
 [ 0.1         0.1         0.1        ...  0.10000021  0.1
  25.47519338]
 [ 0.10000118  0.1         0.10000037 ...  1.74198748  0.10000318
  27.42453022]]


In [54]:
# Vocabulary of the shared `vectorizer` as last fitted (on the Wells Fargo tweets
# when cells run in order), then the ten top terms of each topic.
feature_names_wf = vectorizer.get_feature_names_out()
print_top_words(lda_wf, feature_names_wf, n_top_words=10)

Topic #0:
deposit inconvenience caused apologize wells fargo account direct fees issue
Topic #1:
thank happy support great chris glad ddg opportunity thanks help
Topic #2:
assistance sorry need account hello banker speak thank hear alex
Topic #3:
fargo wells issue apologize branch working visit thank hi technical
Topic #4:
youre team support help online jules try feedback able sorry
Topic #5:
thanks dm thank message hi review email forward received information
Topic #6:
thank proud hi support communities customers time help work pm
Topic #7:
account numbers details dm help tweet situation send hi like
Topic #8:
account numbers dm make sorry provide help details like sure
Topic #9:
number dm phone account numbers send like address details sorry


In [55]:
# Collect the ten highest-weighted vocabulary terms of every Wells Fargo topic.
top_words_per_topic = [
    [feature_names_wf[term_idx] for term_idx in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda_wf.components_
]

# One row per topic, then transpose so that each column holds one topic.
df_top_words_wf = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_wf = df_top_words_wf.T

# Alias of the transposed table, following the other banks' naming.
df_wf_prefixed = df_top_words_transposed_wf

## BOA (Bank of America)

In [56]:
# Load the Bank of America tweet export, extract the tweet text and clean it.
tweets_boa = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BankofAmerica_since_2018_01_01.csv')

texts_boa = tweets_boa['Tweet'].tolist()

clean_texts_boa = [clean_text(text) for text in texts_boa]

# CountVectorizer: re-fit the shared vectorizer on the Bank of America tweets
# (this replaces the vocabulary learned for the previous bank).
X_boa = vectorizer.fit_transform(clean_texts_boa)

# Apply LDA with 10 topics. A fixed random_state makes the topics reproducible
# across kernel restarts; without it every run yields different topics.
lda_boa = LatentDirichletAllocation(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator itself, so topics_boa is the same
# object as lda_boa, not a document-topic matrix.
topics_boa = lda_boa.fit(X_boa)

print(lda_boa.components_)

[[0.1        0.1        0.1        ... 0.1        0.1        0.10014002]
 [0.1        0.10000257 0.1        ... 9.09999328 0.1        0.1       ]
 [6.06619449 0.10004111 0.10002638 ... 0.1        0.1        0.1       ]
 ...
 [0.10000199 0.1        0.1        ... 0.10000022 0.1        0.1       ]
 [0.13353118 2.09998966 4.09996893 ... 0.1        0.1000993  0.1       ]
 [0.1        1.09993734 0.1        ... 0.10000119 0.1000006  0.1       ]]


In [57]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_boa.components_.
feature_names_boa = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_boa) == lda_boa.components_.shape[1], (
    "vectorizer was refitted since lda_boa was trained; re-run the cell that fits lda_boa"
)
print_top_words(lda_boa, feature_names_boa, n_top_words=10)

Topic #0:
proud help support bettermoneyhabits partnership year partner prices red fight
Topic #1:
link help like connect hi hello account use click dm
Topic #2:
business small help money app owners banking businesses resources bank
Topic #3:
women program change positive thanks partnership creating conversation leaders globalambassadors
Topic #4:
mobile banking app love pay card month deposit free let
Topic #5:
weve help supporting important young thats organizations women provide thanks
Topic #6:
bank america cash rewards earn shopping way holiday chicagomarathon credit
Topic #7:
help financial bettermoneyhabits erica tips youre life new plan bank
Topic #8:
digital tools using fans impressive cantstopbanking proud debit like wallet
Topic #9:
thank communities work thanks appreciate great glad shout sharing support


In [58]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_boa[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_boa.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_boa = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_boa = df_top_words_boa.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_boa_prefixed = df_top_words_transposed_boa

## JP Morgan

In [59]:
# Load the JP Morgan tweets and pull out the raw text column.
tweets_jp = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/jpmorgan_since_2018_01_01.csv')
texts_jp = tweets_jp['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_jp = [clean_text(text) for text in texts_jp]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_jp = vectorizer.fit_transform(clean_texts_jp)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics. topics_jp is the per-tweet topic distribution.
lda_jp = LatentDirichletAllocation(n_components=10, random_state=42)
topics_jp = lda_jp.fit_transform(X_jp)

print(lda_jp.components_)

[[2.1        0.1        0.1        ... 0.1        2.09999669 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.10001658 ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        0.10000353 ... 0.10001258 0.1        0.1       ]
 [0.1        0.1        0.10001947 ... 2.09998742 0.1        0.1       ]
 [0.1        0.1        1.09996306 ... 0.1        0.1        2.1       ]]


In [60]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_jp.components_.
feature_names_jp = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_jp) == lda_jp.components_.shape[1], (
    "vectorizer was refitted since lda_jp was trained; re-run the cell that fits lda_jp"
)
print_top_words(lda_jp, feature_names_jp, n_top_words=10)

Topic #0:
tech conference investors better career jpm companies robinhoodnyc hear beer
Topic #1:
jp morgan summer list tech reading new solutions business challenge
Topic #2:
business new leaders women businesses jpms future techtrends podcast entrepreneurs
Topic #3:
jpmcc learn technology energy curators participants jpmcartcollection featured look choice
Topic #4:
jpmorgan chase jpm commercial consumers collection banking art business jpms
Topic #5:
jpm payments dimon jamie income ceo net eps reports chairman
Topic #6:
investment company women false forex asj leadership industry leaders day
Topic #7:
jp global morgans market morgan head outlook shares markets discusses
Topic #8:
jp morgan global research visit markets jpm thank team blockchain
Topic #9:
new year jpms erdoes asset management jp growth mary work


In [61]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_jp[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_jp.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_jp = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_jp = df_top_words_jp.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_jp_prefixed = df_top_words_transposed_jp

## Raymond James

In [62]:
# Load the Raymond James tweets and pull out the raw text column.
tweets_rj = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/RaymondJames_since_2018_01_01.csv')
texts_rj = tweets_rj['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_rj = [clean_text(text) for text in texts_rj]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_rj = vectorizer.fit_transform(clean_texts_rj)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics. topics_rj is the per-tweet topic distribution.
lda_rj = LatentDirichletAllocation(n_components=10, random_state=42)
topics_rj = lda_rj.fit_transform(X_rj)

print(lda_rj.components_)

[[0.1        0.1        0.1        ... 3.92364415 0.1        0.1       ]
 [0.1        0.10000288 2.46068976 ... 0.1        0.1        0.10001084]
 [0.1        0.1        0.10000151 ... 0.1        1.09994368 0.1       ]
 ...
 [0.1        0.1        0.10000013 ... 0.10000154 0.10000525 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.10004062 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]]


In [63]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_rj.components_.
feature_names_rj = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_rj) == lda_rj.components_.shape[1], (
    "vectorizer was refitted since lda_rj was trained; re-run the cell that fits lda_rj"
)
print_top_words(lda_rj, feature_names_rj, n_top_words=10)

Topic #0:
learn james raymond community career important firm inclusion tips honor
Topic #1:
discuss tune et policy mills ed markets pm analyst raymond
Topic #2:
financial learn help business james giving raymond discover rjwomen students
Topic #3:
markets heres market larryadamrj economy cio year investors experts look
Topic #4:
tax heres learn highlights financial jobsreport new sustainability consider yearend
Topic #5:
discuss tune et chris raymond james change paul subject ceo
Topic #6:
pavel molchanov energy financial art oil like important celebrate life
Topic #7:
planning plan help retirement important financial family consider make youre
Topic #8:
james raymond financial year new home rjf nyse data learn
Topic #9:
rjcares associates james raymond advisors learn scott brown economist month


In [64]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_rj[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_rj.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_rj = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_rj = df_top_words_rj.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_rj_prefixed = df_top_words_transposed_rj

# Africa

## Quant Africa

In [65]:
# Load the Quant Africa tweets and pull out the raw text column.
tweets_qa = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/quantafrica_since_2018_01_01.csv')
texts_qa = tweets_qa['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_qa = [clean_text(text) for text in texts_qa]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_qa = vectorizer.fit_transform(clean_texts_qa)

# Fit a 10-topic LDA. random_state pins the random initialisation so reruns
# reproduce the same topics. fit_transform (rather than fit, which returns the
# estimator itself) makes topics_qa the per-tweet topic distribution.
lda_qa = LatentDirichletAllocation(n_components=10, random_state=42)
topics_qa = lda_qa.fit_transform(X_qa)

print(lda_qa.components_)

[[0.1        2.10001212 1.10000064 ... 0.1        0.1        0.1       ]
 [0.1        0.10000307 0.10000494 ... 0.1        0.1        2.09997195]
 [0.1        0.1        0.10000319 ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        0.1        ... 0.1        6.10001962 1.10000319]
 [0.1        0.10003166 8.09998964 ... 1.09999703 0.10000212 0.10000804]
 [2.1        1.09995315 0.1        ... 0.1        0.1        0.1       ]]


In [66]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_qa.components_.
feature_names_qa = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_qa) == lda_qa.components_.shape[1], (
    "vectorizer was refitted since lda_qa was trained; re-run the cell that fits lda_qa"
)
print_top_words(lda_qa, feature_names_qa, n_top_words=10)

Topic #0:
javascript community learning learn tech great resource covers provides design
Topic #1:
quanta tech new africa filled innovation growth happy month link
Topic #2:
tech join tomorrow pm upskilling day guest today google sharing
Topic #3:
tech session career pm learning opportunities learn time join hangout
Topic #4:
startup idea pitch session tuesday product join moving implementing market
Topic #5:
sir love happy olorunsheyi cheers birthday happybirthday dr sunday new
Topic #6:
today web language page miss pm used css create shouldnt
Topic #7:
yes talentdev learn frontend developer months dont create javascript online
Topic #8:
tech quanta free alimosho ideation community quantaafrica lagos techbros technology
Topic #9:
data analysis book html provides visualization official visualizations website powerful


In [67]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_qa[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_qa.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_qa = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_qa = df_top_words_qa.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_qa_prefixed = df_top_words_transposed_qa

## Standard Bank

In [68]:
# Load the Standard Bank tweets and pull out the raw text column.
tweets_sb = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/StandardBankZA_since_2018_01_01.csv')
texts_sb = tweets_sb['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_sb = [clean_text(text) for text in texts_sb]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_sb = vectorizer.fit_transform(clean_texts_sb)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics. topics_sb is the per-tweet topic distribution.
lda_sb = LatentDirichletAllocation(n_components=10, random_state=42)
topics_sb = lda_sb.fit_transform(X_sb)

print(lda_sb.components_)

[[0.1000158  0.1        0.1        ... 0.1        0.10001266 0.1       ]
 [0.1        0.1        0.1        ... 1.09999438 0.1        0.1       ]
 [0.10002195 2.09988241 0.10001399 ... 0.1        0.10001533 0.10000414]
 ...
 [0.1        0.1        0.1        ... 1.10000032 0.1        0.1       ]
 [0.10000554 0.1        2.09995008 ... 0.1        0.1        2.09999586]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]]


In [69]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_sb.components_.
feature_names_sb = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_sb) == lda_sb.components_.shape[1], (
    "vectorizer was refitted since lda_sb was trained; re-run the cell that fits lda_sb"
)
print_top_words(lda_sb, feature_names_sb, n_top_words=10)

Topic #0:
account bank standard hi funds link card hey transaction need
Topic #1:
sblovessummer sblove financial beatthescam thats goals great year budget youre
Topic #2:
hi know let getting dm need message thank error screenshot
Topic #3:
hi experience service details dm branch inconvenience contact like sorry
Topic #4:
app banking hi try issues know let device thank experiencing
Topic #5:
thank love sblove appreciate thanks glad feedback happy taking day
Topic #6:
dm team hi assist number contact details thank send id
Topic #7:
money instantmoneymondays sblove instant wallet using sblovessummer right welcome thing
Topic #8:
like dm hi look help send details assist want way
Topic #9:
sblove instantmoneymondays thats itcanbe answer sure correct got right challenge


In [70]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_sb[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_sb.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_sb = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_sb = df_top_words_sb.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_sb_prefixed = df_top_words_transposed_sb

## Northern Trust

In [71]:
# Load the Northern Trust (NTWealth) tweets and pull out the raw text column.
tweets_nt = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/NTWealth_since_2018_01_01.csv')
texts_nt = tweets_nt['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_nt = [clean_text(text) for text in texts_nt]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_nt = vectorizer.fit_transform(clean_texts_nt)

# Fit a 10-topic LDA. random_state pins the random initialisation so reruns
# reproduce the same topics. fit_transform (rather than fit, which returns the
# estimator itself) makes topics_nt the per-tweet topic distribution.
lda_nt = LatentDirichletAllocation(n_components=10, random_state=42)
topics_nt = lda_nt.fit_transform(X_nt)

print(lda_nt.components_)

[[0.1        0.1        0.1        ... 0.1        3.10002792 0.1       ]
 [0.1        0.1        0.1        ... 0.1        1.09999223 2.84859821]
 [2.1        1.09996982 1.09999652 ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        0.1        ... 4.09971562 1.34055227 0.10001667]
 [0.1        0.1        0.1        ... 3.09981027 0.1        1.99849218]
 [0.1        0.1        0.1        ... 0.10044582 1.09998962 0.1       ]]


In [72]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_nt.components_.
feature_names_nt = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_nt) == lda_nt.components_.shape[1], (
    "vectorizer was refitted since lda_nt was trained; re-run the cell that fits lda_nt"
)
print_top_words(lda_nt, feature_names_nt, n_top_words=10)

Topic #0:
wealth plan help learn planning estate retirement market attractive artists
Topic #1:
tax expochicago suzanne shier art learn exchangebynortherntrust digital arts changes
Topic #2:
trust northern expochicago learn sponsor management wealth proud family presenting
Topic #3:
officer fiduciary chief president northerntrust lucina wealth read women named
Topic #4:
katie nixon inflation investors cio management fed outlook wealth market
Topic #5:
policy investment tax view business strategies future bank wealth private
Topic #6:
wealth management nixon katie cio discusses market tax northerntrust investors
Topic #7:
northern trust wealth family team families planning trusts art executive
Topic #8:
family new business learn planning tax wealth strategies philanthropy read
Topic #9:
trust northern proud advice event president experts texas northerntrust wealth


In [73]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_nt[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_nt.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_nt = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_nt = df_top_words_nt.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_nt_prefixed = df_top_words_transposed_nt

## UBA

In [74]:
# Load the UBA tweets and pull out the raw text column.
tweets_uba = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/UBAGroup_since_2018_01_01.csv')
texts_uba = tweets_uba['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_uba = [clean_text(text) for text in texts_uba]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_uba = vectorizer.fit_transform(clean_texts_uba)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics.
lda_uba = LatentDirichletAllocation(n_components=10, random_state=42)
# topics_uba holds the per-tweet topic distribution under a bank-specific name;
# the generic `topics` is kept for compatibility but other cells overwrite it.
topics_uba = topics = lda_uba.fit_transform(X_uba)

print(lda_uba.components_)

[[1.0999921  0.1        0.10004868 ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.10001628 0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        2.09983807]
 ...
 [0.1        0.1        0.1        ... 0.10002886 2.09997987 0.1       ]
 [0.10005046 0.10035278 2.09995132 ... 2.09978082 0.10001164 0.1000032 ]
 [0.1        2.09929672 0.1        ... 0.10005318 0.1        0.10003174]]


In [75]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_uba.components_.
feature_names_uba = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_uba) == lda_uba.components_.shape[1], (
    "vectorizer was refitted since lda_uba was trained; re-run the cell that fits lda_uba"
)
print_top_words(lda_uba, feature_names_uba, n_top_words=10)

Topic #0:
thank hello kindly dm response informed provided bitlymlzbnp number enable
Topic #1:
leo chat uba account open mmeubachatbanking visit whatsapp using hello
Topic #2:
africasglobalbank join ubaat ubamarketplace live register session ubaafricanentrepreneurs pm click
Topic #3:
hello account thank ubacares open dial visit migrate ubabumperaccount bumper
Topic #4:
uba informed hello thank emanate response number kindly whatsapp dm
Topic #5:
africasglobalbank ubaat new week stay day like help make work
Topic #6:
win africasglobalbank winners love better uba airtime million retweet draw
Topic #7:
africa africasglobalbank african uba ubaafricaday happy world ubaafricaconversations tonyoelumelu africaday
Topic #8:
link click dm thank sharing hello avoid public bitlymlzbnp information
Topic #9:
uba africasglobalbank banking group chairman bank tonyoelumelu app today director


In [76]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_uba[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_uba.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_uba = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_uba = df_top_words_uba.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_uba_prefixed = df_top_words_transposed_uba

# Asia

## HSBC

In [77]:
# Load the HSBC tweets and pull out the raw text column.
tweets_hsbc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/HSBC_since_2018_01_01.csv')
texts_hsbc = tweets_hsbc['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_hsbc = [clean_text(text) for text in texts_hsbc]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_hsbc = vectorizer.fit_transform(clean_texts_hsbc)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics.
lda_hsbc = LatentDirichletAllocation(n_components=10, random_state=42)
# fit() returns the estimator itself, so use fit_transform to obtain the
# per-tweet topic distribution. The generic `topics` name is kept for
# compatibility, but other cells overwrite it; prefer topics_hsbc.
topics_hsbc = topics = lda_hsbc.fit_transform(X_hsbc)

print(lda_hsbc.components_)

[[0.1        0.1000038  0.1        ... 1.09999882 0.10001795 0.1       ]
 [1.10000044 0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        1.09999401 ... 0.1        1.10000273 0.1       ]
 ...
 [0.1        0.1        1.09998613 ... 0.1        0.1        1.09999782]
 [0.1000039  0.1        1.10000066 ... 0.1        0.1        0.1       ]
 [0.1        0.1        2.1000084  ... 1.10000117 2.09997932 0.1       ]]


In [78]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_hsbc.components_.
feature_names_hsbc = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_hsbc) == lda_hsbc.components_.shape[1], (
    "vectorizer was refitted since lda_hsbc was trained; re-run the cell that fits lda_hsbc"
)
print_top_words(lda_hsbc, feature_names_hsbc, n_top_words=10)

Topic #0:
global finance transition trade sustainable hsbc world hsbcs bank netzero
Topic #1:
hsbc new world people support action group hong technology partnership
Topic #2:
hsbc global business climate solutions world help customers passed new
Topic #3:
hsbc future read trade global sustainable hsbcresults banking businesses growth
Topic #4:
today results hsbcresults learn weve announced set trade hsbc world
Topic #5:
hsbc financial business year partnership people iwd businesses looking work
Topic #6:
entrepreneurs latest report podcast business read hsbc entrepreneur listen founder
Topic #7:
hsbc business global proud bank customers weve world new banking
Topic #8:
hsbcnavigator businesses hsbc china companies supply ciie sustainability business chinas
Topic #9:
digital health mental customers group support hsbc help covid year


In [79]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_hsbc[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_hsbc.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_hsbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_hsbc = df_top_words_hsbc.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_hsbc_prefixed = df_top_words_transposed_hsbc

## OCBC (Singapore)

In [80]:
# Load the OCBC tweets and pull out the raw text column.
tweets_ocbc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/OCBCBank_since_2018_01_01.csv')
texts_ocbc = tweets_ocbc['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_ocbc = [clean_text(text) for text in texts_ocbc]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_ocbc = vectorizer.fit_transform(clean_texts_ocbc)

# Fit a 10-topic LDA. random_state pins the random initialisation so reruns
# reproduce the same topics. fit_transform (rather than fit, which returns the
# estimator itself) makes topics_ocbc the per-tweet topic distribution.
lda_ocbc = LatentDirichletAllocation(n_components=10, random_state=42)
topics_ocbc = lda_ocbc.fit_transform(X_ocbc)

print(lda_ocbc.components_)

[[0.1        1.09996532 0.1        ... 1.00154074 0.10000694 0.1       ]
 [2.38407348 0.1        0.10000242 ... 0.1        0.1        0.10010147]
 [1.5904087  0.10000799 0.10002953 ... 1.19845925 0.1        0.10016341]
 ...
 [0.10000772 0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.10002676 ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.10002146]]


In [81]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_ocbc.components_.
feature_names_ocbc = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_ocbc) == lda_ocbc.components_.shape[1], (
    "vectorizer was refitted since lda_ocbc was trained; re-run the cell that fits lda_ocbc"
)
print_top_words(lda_ocbc, feature_names_ocbc, n_top_words=10)

Topic #0:
ocbc account hi singapore sh touch malaysia looking dg reached
Topic #1:
app hi try dg banking mobile reinstalling deleting delete issues
Topic #2:
dg hi able number service mobile drop dm executive assist
Topic #3:
sh card xf form update hi thank new youre afraid
Topic #4:
know hi xf transfer let cash ocbc thank inconvenience caused
Topic #5:
app version mobile banking phone ocbc following bank model details
Topic #6:
dm thank hi contact xf share number details sorry drop
Topic #7:
hi access dg banking online pin sh token sorry look
Topic #8:
thank team feedback working xf hi inconvenience relevant dg soon
Topic #9:
sh thanks email sorry dm free feel glad good hi


In [82]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_ocbc[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_ocbc.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_ocbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_ocbc = df_top_words_ocbc.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_ocbc_prefixed = df_top_words_transposed_ocbc

## Bank of Singapore

In [83]:
# Load the Bank of Singapore tweets and pull out the raw text column.
tweets_bos = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/bankofSG_since_2018_01_01.csv')
texts_bos = tweets_bos['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_bos = [clean_text(text) for text in texts_bos]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_bos = vectorizer.fit_transform(clean_texts_bos)

# Fit a 10-topic LDA. random_state pins the random initialisation so reruns
# reproduce the same topics. fit_transform (rather than fit, which returns the
# estimator itself) makes topics_bos the per-tweet topic distribution.
lda_bos = LatentDirichletAllocation(n_components=10, random_state=42)
topics_bos = lda_bos.fit_transform(X_bos)

print(lda_bos.components_)

[[0.10000537 0.10003381 0.1        ... 0.1        0.10001324 4.91601106]
 [0.1        1.09996214 0.10005    ... 0.1        0.10007186 0.33883278]
 [2.10000006 1.09996103 1.09994606 ... 0.1        5.16034967 0.1       ]
 ...
 [0.1        0.1        0.1        ... 0.1        0.10000527 0.10000181]
 [0.1        1.09997473 0.1        ... 0.1        0.10004081 1.04514537]
 [0.10002537 0.1        0.1        ... 1.10000166 0.1        0.10000898]]


In [84]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_bos.components_.
feature_names_bos = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_bos) == lda_bos.components_.shape[1], (
    "vectorizer was refitted since lda_bos was trained; re-run the cell that fits lda_bos"
)
print_top_words(lda_bos, feature_names_bos, n_top_words=10)

Topic #0:
chief mansoor inflation mohiuddin economist federal rate investment central reserve
Topic #1:
strategist moh siong sim currency says usd likely fx outlook
Topic #2:
chief economist mohiuddin mansoor fed policy inflation says assets risk
Topic #3:
chief economist fed says jerram richard mansoor mohiuddin rate bank
Topic #4:
china market global head greater markets asia singapore north research
Topic #5:
private wealth investment senior bank cheo james strategist global market
Topic #6:
management portfolio global research jean head chia office year investment
Topic #7:
investment head lee market strategy eli global says months watch
Topic #8:
head fixed income global expected says products van walle marc
Topic #9:
investment year singapore bank guests outlook research change world term


In [85]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_bos[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_bos.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_bos = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bos = df_top_words_bos.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_bos_prefixed = df_top_words_transposed_bos

## Hana Bank (South Korea)

In [86]:
# Load the Hana Bank tweets and pull out the raw text column.
tweets_hana = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/HanaBank4expats_since_2018_01_01.csv')
texts_hana = tweets_hana['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_hana = [clean_text(text) for text in texts_hana]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_hana = vectorizer.fit_transform(clean_texts_hana)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics.
lda_hana = LatentDirichletAllocation(n_components=10, random_state=42)
# fit() returns the estimator itself, so use fit_transform to obtain the
# per-tweet topic distribution. The generic `topics` name is kept for
# compatibility, but other cells overwrite it; prefer topics_hana.
topics_hana = topics = lda_hana.fit_transform(X_hana)

print(lda_hana.components_)

[[ 0.1         0.1         0.1         0.1         0.10000445  0.1
   0.1         0.1         0.1         2.10000658  0.1         0.1
   0.10000686  0.1         1.09996357  0.1         0.1         2.09996948
   0.1         0.10000532  0.1         0.1         0.1         0.10001433
   0.1         0.1         0.1         0.1         0.1000528   0.1
   0.10000441  0.1         0.1         2.09998477  1.09995771  0.1
   1.36533975  0.1         0.1         1.09997508  0.10001972  0.1
   1.09999166  0.1         0.1         0.1         0.1         2.10001903
   0.1         5.10002669  0.10000549  0.1         0.1         0.10000901
   1.09999627  0.1         0.10002832  0.1         0.1         0.10000532
   0.10000675  0.1         0.1         2.10004161  1.09996406  0.1
   0.10000882  0.1         1.10000405  0.1         1.0999726   0.1
   0.1         0.10000328  0.10001322  1.09996304  0.10000315  0.1
   2.34739198  0.1         0.1         1.09997314  0.1         0.1
   0.1         3.09998083  

In [87]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_hana.components_.
feature_names_hana = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_hana) == lda_hana.components_.shape[1], (
    "vectorizer was refitted since lda_hana was trained; re-run the cell that fits lda_hana"
)
print_top_words(lda_hana, feature_names_hana, n_top_words=10)

Topic #0:
korea tips seoul new kebhana card travel fx day good
Topic #1:
parent local report like year gives banks news change starting
Topic #2:
koreas coast expats banks bank new sunday tuesday cool online
Topic #3:
hana korean keb st bank exchange hope pyeongchang krw rates
Topic #4:
krwusd rate report weeks outlook check forecast weekly complete exchange
Topic #5:
seoul easy money banking trip arent new fly overseas locations
Topic #6:
hana travel home send journey app ez abroad win prizes
Topic #7:
hana money app foreigners ez cheaper abroad july english heres
Topic #8:
rate krwusd week exchange range forecasts bank forecast outlook monday
Topic #9:
know beaches new hana banking expats tell app mobile ez


In [88]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_hana[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_hana.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_hana = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_hana = df_top_words_hana.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_hana_prefixed = df_top_words_transposed_hana

## UFJ (Japan)

## Mizuho Bank

# Oceania

## The Australia and New Zealand Banking Group Limited (ANZ)

In [89]:
# Load the ANZ tweets and pull out the raw text column.
tweets_anz = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ANZ_AU_since_2018_01_01.csv')
texts_anz = tweets_anz['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_anz = [clean_text(text) for text in texts_anz]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_anz = vectorizer.fit_transform(clean_texts_anz)

# Fit a 10-topic LDA. random_state pins the random initialisation so reruns
# reproduce the same topics. fit_transform (rather than fit, which returns the
# estimator itself) makes topics_anz the per-tweet topic distribution.
lda_anz = LatentDirichletAllocation(n_components=10, random_state=42)
topics_anz = lda_anz.fit_transform(X_anz)

print(lda_anz.components_)

[[1.00000000e-01 1.57288192e+02 1.00000306e-01 ... 1.33508027e+00
  1.00000000e-01 1.00000000e-01]
 [1.00000001e-01 4.65041634e+00 1.20641942e+00 ... 1.00000001e-01
  1.00015900e-01 1.00000001e-01]
 [1.00000001e-01 1.38001472e-01 1.00000000e-01 ... 1.00071767e-01
  1.00000000e-01 1.00103423e-01]
 ...
 [1.00165812e-01 5.20725220e+01 1.00032045e-01 ... 1.09983720e+00
  3.00930633e+00 1.00026929e-01]
 [1.00000001e-01 1.00018668e-01 1.00000000e-01 ... 5.55890794e+00
  1.00000000e-01 2.09986697e+00]
 [1.00000001e-01 1.00014294e-01 1.00051793e-01 ... 1.00008386e-01
  1.00000000e-01 1.00030829e-01]]


In [90]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_anz.components_.
feature_names_anz = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_anz) == lda_anz.components_.shape[1], (
    "vectorizer was refitted since lda_anz was trained; re-run the cell that fits lda_anz"
)
print_top_words(lda_anz, feature_names_anz, n_top_words=10)

Topic #0:
hi inconvenience anz issues sorry app issue banking caused internet
Topic #1:
hi wait long thanks sorry times team message hoax email
Topic #2:
card hi team ampm credit aest account contact cards like
Topic #3:
hi thanks sorry hear feedback dm help team know message
Topic #4:
banking internet anz pm app hi team aest contact try
Topic #5:
hi dm branch contact number postcode sorry send details like
Topic #6:
link dont thanks hi click delete anz sure sms details
Topic #7:
anz hi account payments funds business pay transfer payment card
Topic #8:
dm number post code send hi contact phone assist sorry
Topic #9:
hi thanks darren payments team sorry kindly customer days pm


In [91]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_anz[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_anz.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_anz = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_anz = df_top_words_anz.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_anz_prefixed = df_top_words_transposed_anz

## Commonwealth Bank of Australia (CBA)

In [92]:
# Load the Commonwealth Bank tweets and pull out the raw text column.
tweets_cba = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/CommBank_since_2018_01_01.csv')
texts_cba = tweets_cba['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_cba = [clean_text(text) for text in texts_cba]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_cba = vectorizer.fit_transform(clean_texts_cba)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics. topics_cba is the per-tweet topic distribution.
lda_cba = LatentDirichletAllocation(n_components=10, random_state=42)
topics_cba = lda_cba.fit_transform(X_cba)

print(lda_cba.components_)

[[2.05159575 0.1        1.11780792 ... 1.09999723 0.10000173 4.10001254]
 [0.10000714 1.1000023  0.1        ... 0.1        1.88172015 0.10001116]
 [0.1        0.1        0.1        ... 0.1        1.28875754 0.1       ]
 ...
 [0.1        0.1        1.14729049 ... 1.10000277 0.10000108 1.09999188]
 [0.1        0.1        0.1        ... 0.1        3.58485722 0.1       ]
 [0.10000499 1.0999977  0.1        ... 0.1        0.10002469 1.11299113]]


In [93]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_cba.components_.
feature_names_cba = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_cba) == lda_cba.components_.shape[1], (
    "vectorizer was refitted since lda_cba was trained; re-run the cell that fits lda_cba"
)
print_top_words(lda_cba, feature_names_cba, n_top_words=10)

Topic #0:
hi know reach card nat team help information dm concerns
Topic #1:
dm hi help send enquiry provide assist understand like information
Topic #2:
footprint hi carbon accounts data scam business based industry calculated
Topic #3:
scams information report need security account forwarding future hoaxcbacomau customers
Topic #4:
message send hi dm commbank app details help thank assist
Topic #5:
dm hi like send details sorry best discuss assistance reach
Topic #6:
send like hi message number contact private hear understand sorry
Topic #7:
hi customers information cryptocurrency app commbank thanks personal team scams
Topic #8:
message thanks hi team scam text forward card security details
Topic #9:
team christine visit sorry check hello hear know suspicious hi


In [94]:
# Collect the ten highest-weight vocabulary terms of every topic.
# argsort() sorts ascending, so reverse it and keep the first ten indices.
top_words_per_topic = [
    [feature_names_cba[idx] for idx in weights.argsort()[::-1][:10]]
    for weights in lda_cba.components_
]

# One row per topic; transpose so each column is a topic and each row a rank.
df_top_words_cba = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_cba = df_top_words_cba.T

# NOTE(review): this is an alias of the same DataFrame object; no prefix is
# added in this cell despite the name.
df_cba_prefixed = df_top_words_transposed_cba

## National Australia Bank (NAB)

In [95]:
# Load the NAB tweets and pull out the raw text column.
tweets_nab = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/NAB_since_2018_01_01.csv')
texts_nab = tweets_nab['Tweet'].tolist()

# Normalise every tweet with the notebook's clean_text helper.
clean_texts_nab = [clean_text(text) for text in texts_nab]

# Bag-of-words counts. This refits the shared vectorizer, so the next cell must
# read vectorizer.get_feature_names_out() before another bank refits it.
X_nab = vectorizer.fit_transform(clean_texts_nab)

# Fit a 10-topic LDA; random_state pins the random initialisation so reruns
# reproduce the same topics. topics_nab is the per-tweet topic distribution.
lda_nab = LatentDirichletAllocation(n_components=10, random_state=42)
topics_nab = lda_nab.fit_transform(X_nab)

print(lda_nab.components_)

[[0.10001587 0.1        0.1        ... 0.1        0.10010085 0.1       ]
 [0.10000956 4.07950768 0.1        ... 3.09995381 3.09980832 0.10013924]
 [0.1        0.10000701 0.1        ... 0.10000651 0.1        0.1       ]
 ...
 [0.1        0.10003732 2.09999911 ... 0.1        0.1        0.1       ]
 [1.09997821 0.10000616 0.1        ... 0.10001724 0.1        0.1       ]
 [1.09996565 5.33766783 0.10000089 ... 0.1        0.1        1.09991184]]


In [96]:
# Vocabulary of the shared vectorizer as last fitted; entry i names column i of
# lda_nab.components_.
feature_names_nab = vectorizer.get_feature_names_out()
# The shared vectorizer is refitted for every bank, so fail loudly if it no
# longer lines up with this model (e.g. cells were run out of order).
assert len(feature_names_nab) == lda_nab.components_.shape[1], (
    "vectorizer was refitted since lda_nab was trained; re-run the cell that fits lda_nab"
)
print_top_words(lda_nab, feature_names_nab, n_top_words=10)

Topic #0:
pay nab apple know hi let make payments contactless customers
Topic #1:
dm send hi chat im help sorry hear rg like
Topic #2:
hi sorry working inconvenience issues banking app internet aware issue
Topic #3:
nab read australia atm systems new nabs fee cash scams
Topic #4:
rl card thats hi able cj youre message account lh
Topic #5:
hear feedback sorry thanks hi team need im glad know
Topic #6:
hi ive like team lodge pm time feedback replied youd
Topic #7:
hi nab pm rate customers payments home loan understand aestaedt
Topic #8:
nab business hi customers support accounts need banking customer branch
Topic #9:
thanks message delete team hi messages security scam letting know


In [97]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_nab[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_nab.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_nab = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_nab = df_top_words_nab.T

# Name consumed by the Excel export cell.
df_nab_prefixed = df_top_words_transposed_nab

## Westpac Banking Corporation (WBC)

In [98]:
tweets_wbc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/Westpac_since_2018_01_01.csv')

# Raw tweet bodies -> cleaned strings (clean_text lower-cases, strips digits,
# URLs and punctuation, and drops English stop words).
texts_wbc = tweets_wbc['Tweet'].tolist()
clean_texts_wbc = [clean_text(text) for text in texts_wbc]

# Bag-of-words counts. The shared `vectorizer` is refitted here, so from this
# point on its vocabulary describes the Westpac corpus only.
X_wbc = vectorizer.fit_transform(clean_texts_wbc)

# Fit a 10-topic LDA. random_state pins the otherwise random initialisation so
# that re-running the cell reproduces the same topics.
lda_wbc = LatentDirichletAllocation(n_components=10, random_state=42)
# Document-topic matrix: one row per tweet, one column per topic.
topics_wbc = lda_wbc.fit_transform(X_wbc)

# Topic-word weight matrix: one row per topic, one column per vocabulary term.
print(lda_wbc.components_)

[[0.10001007 0.1        5.32199946 ... 0.10002027 0.1000326  0.1       ]
 [0.1000539  0.10000352 2.86758631 ... 0.10022302 0.1        0.1       ]
 [0.1        0.1        2.8346606  ... 0.1        1.28933772 3.09997072]
 ...
 [0.1        0.10000452 8.82877994 ... 0.10013353 0.10000206 0.1       ]
 [0.10000091 2.09998376 0.10004913 ... 0.1        0.10002904 0.1       ]
 [0.10009707 0.1        0.10000349 ... 0.1        0.1        0.1       ]]


In [99]:
# NOTE: `vectorizer` is shared across the bank sections and is refitted on each
# corpus, so this cell must run right after the Westpac fit cell; otherwise the
# vocabulary would belong to whichever bank was vectorized last.
feature_names_wbc = vectorizer.get_feature_names_out()
# List 10 words for each Westpac topic (output below).
print_top_words(lda_wbc, feature_names_wbc, n_top_words=10)

Topic #0:
send dm like hi help sorry hear wed concerns whats
Topic #1:
dm responded know hi weve message send look tweet forward
Topic #2:
westpac sms delete forward banking scam hoax hi text email
Topic #3:
im sorry youre hear hi dm help send youve able
Topic #4:
banking online hi inconvenience caused apologise issues resolved services working
Topic #5:
thanks hi team reaching details share feedback know card attention
Topic #6:
pay apple hi open future customers remain offering options westpac
Topic #7:
help team hi contact know thanks branch happy let reach
Topic #8:
westpac help customers hi australia new helps loan hope support
Topic #9:
complaints information dont process feedback complaint public hesitate link including


In [100]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_wbc[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_wbc.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_wbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_wbc = df_top_words_wbc.T

# Name consumed by the Excel export cell.
df_wbc_prefixed = df_top_words_transposed_wbc

# Europe

## BNP Paribas

In [101]:
tweets_bnp = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BNPParibas_since_2018_01_01.csv')

# Raw tweet bodies -> cleaned strings (clean_text lower-cases, strips digits,
# URLs and punctuation, and drops English stop words).
# NOTE(review): only English stop words are removed, so French function words
# ("la", "le", "et", ...) end up forming whole topics in the output below;
# consider adding French stop words for this account.
texts_bnp = tweets_bnp['Tweet'].tolist()
clean_texts_bnp = [clean_text(text) for text in texts_bnp]

# Bag-of-words counts. The shared `vectorizer` is refitted here, so from this
# point on its vocabulary describes the BNP Paribas corpus only.
X_bnp = vectorizer.fit_transform(clean_texts_bnp)

# Fit a 10-topic LDA. random_state pins the otherwise random initialisation so
# that re-running the cell reproduces the same topics.
lda_bnp = LatentDirichletAllocation(n_components=10, random_state=42)
# fit_transform (not fit) so that topics_bnp holds the document-topic matrix,
# as topics_nab / topics_wbc do; fit() alone returns the estimator itself.
topics_bnp = lda_bnp.fit_transform(X_bnp)

# Topic-word weight matrix: one row per topic, one column per vocabulary term.
print(lda_bnp.components_)

[[ 0.1         0.10000414  0.1        ...  0.1         0.1
   0.10000153]
 [ 0.10019259  0.10000117  0.10000172 ...  0.1         0.10000292
   0.10000143]
 [ 0.10013365  0.10000066  0.1        ...  0.1         0.1
   0.10000152]
 ...
 [ 0.1        33.75441093  0.1        ...  0.1         0.1
   0.1       ]
 [ 0.10015281  0.1         4.0999961  ...  2.09999844 11.0999843
   6.0999574 ]
 [ 2.09935048  0.10000017  0.1        ...  0.10000156  0.10000019
   0.10003508]]


In [102]:
# NOTE: `vectorizer` is shared across the bank sections and is refitted on each
# corpus, so this cell must run right after the BNP Paribas fit cell; otherwise
# the vocabulary would belong to whichever bank was vectorized last.
feature_names_bnp = vectorizer.get_feature_names_out()
# List 10 words for each BNP Paribas topic (output below).
print_top_words(lda_bnp, feature_names_bnp, n_top_words=10)

Topic #0:
clients head sustainable answer order good social day account wish
Topic #1:
tennis young paribas bnp players teamjeunestalents team support fftennis help
Topic #2:
rolandgarros support vrarlesfestival paribas bnp vr young bnppresults know new
Topic #3:
la et le pour les du des en qui avec
Topic #4:
bnppcsr greenreflex women bfmbusiness watch business paribas bnp energy data
Topic #5:
positivebanking climate sustainable climatechange bank fondationbnpp research bnp paribas projects
Topic #6:
women heforshe bnppcoalitions jblefevre diversity wfgm bnppgenderequality jbonnel ym genderequality
Topic #7:
vivatech positivebanking bnppadvance startups new discover tech innovation mobility day
Topic #8:
la pour le des les et en nous du sur
Topic #9:
vous nous bonjour dm contacter par afin que bonne des


In [103]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_bnp[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_bnp.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_bnp = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bnp = df_top_words_bnp.T

# Name consumed by the Excel export cell.
df_bnp_prefixed = df_top_words_transposed_bnp

## BNP Paribas Asset Management

In [104]:
tweets_bnp1 = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BNPPAM_COM_since_2018_01_01.csv')

# Raw tweet bodies -> cleaned strings (clean_text lower-cases, strips digits,
# URLs and punctuation, and drops English stop words).
texts_bnp1 = tweets_bnp1['Tweet'].tolist()
clean_texts_bnp1 = [clean_text(text) for text in texts_bnp1]

# Bag-of-words counts. The shared `vectorizer` is refitted here, so from this
# point on its vocabulary describes the BNPPAM_COM corpus only.
X_bnp1 = vectorizer.fit_transform(clean_texts_bnp1)

# Fit a 10-topic LDA. random_state pins the otherwise random initialisation so
# that re-running the cell reproduces the same topics.
lda_bnp1 = LatentDirichletAllocation(n_components=10, random_state=42)
# fit_transform (not fit) so that topics_bnp1 holds the document-topic matrix,
# as topics_nab / topics_wbc do; fit() alone returns the estimator itself.
topics_bnp1 = lda_bnp1.fit_transform(X_bnp1)

# Topic-word weight matrix: one row per topic, one column per vocabulary term.
print(lda_bnp1.components_)

[[0.1        0.1        0.1        ... 2.09997851 0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.10000053 0.1       ]
 [0.1        0.10001162 0.10000161 ... 0.1        0.10000233 0.1       ]
 ...
 [0.1        0.1        2.20238222 ... 0.10001318 0.10001432 0.1       ]
 [0.1        0.1        0.1        ... 2.09940056 0.10000419 2.1       ]
 [0.1        2.09992381 2.09999419 ... 0.10060906 8.09996894 0.1       ]]


In [105]:
# NOTE: `vectorizer` is shared across the bank sections and is refitted on each
# corpus, so this cell must run right after the BNPPAM_COM fit cell; otherwise
# the vocabulary would belong to whichever bank was vectorized last.
feature_names_bnp1 = vectorizer.get_feature_names_out()
# List 10 words for each BNPPAM_COM topic (output below).
print_top_words(lda_bnp1, feature_names_bnp1, n_top_words=10)

Topic #0:
read article outlook latest market inflation markets update investment weekly
Topic #1:
asset investment investors management sustainable paribas bnp new food whats
Topic #2:
bnppam_com china greaterchina head economist chi lo chinas senior global
Topic #3:
sustainability bnppam_com esg global head investing investment investors key strategy
Topic #4:
bnppam_com awards year investment fund esg asset best manager management
Topic #5:
bnppam_com stand podcast discover sustainable markets market week solutions covid
Topic #6:
markets read income fixed financial article energy transition insights economy
Topic #7:
bnppam_com head tells china debt investment investors trade senior emergingmarkets
Topic #8:
learn sri investors asset investing discover equities people read sustainable
Topic #9:
bnppam_com paul apac head asia tells sandhu investors esg client


In [106]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_bnp1[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_bnp1.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_bnp1 = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bnp1 = df_top_words_bnp1.T

# Alias following the df_<bank>_prefixed naming used by the other sections.
df_bnp1_prefixed = df_top_words_transposed_bnp1

## Crédit Agricole Group

In [107]:
tweets_cag = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/Credit_Agricole_since_2018_01_01.csv')

# Raw tweet bodies -> cleaned strings (clean_text lower-cases, strips digits,
# URLs and punctuation, and drops English stop words).
# NOTE(review): only English stop words are removed, so French function words
# ("et", "le", "la", "des", ...) dominate most topics in the output below;
# consider adding French stop words for this account.
texts_cag = tweets_cag['Tweet'].tolist()
clean_texts_cag = [clean_text(text) for text in texts_cag]

# Bag-of-words counts. The shared `vectorizer` is refitted here, so from this
# point on its vocabulary describes the Crédit Agricole corpus only.
X_cag = vectorizer.fit_transform(clean_texts_cag)

# Fit a 10-topic LDA. random_state pins the otherwise random initialisation so
# that re-running the cell reproduces the same topics.
lda_cag = LatentDirichletAllocation(n_components=10, random_state=42)
# fit_transform (not fit) so that topics_cag holds the document-topic matrix,
# as topics_nab / topics_wbc do; fit() alone returns the estimator itself.
topics_cag = lda_cag.fit_transform(X_cag)

# Topic-word weight matrix: one row per topic, one column per vocabulary term.
print(lda_cag.components_)

[[1.09998483 0.1        0.1        ... 0.1        0.1        1.09998893]
 [0.1        0.1        0.1        ... 2.09996272 1.30936212 0.1       ]
 [0.1        0.10000291 0.1        ... 0.1        0.10000292 0.1       ]
 ...
 [0.1        3.09995676 2.09999632 ... 0.1        2.49839579 0.1       ]
 [0.1        0.10003214 0.1        ... 0.10002815 2.49222364 1.09992242]
 [2.09999379 0.10000634 0.10000368 ... 0.1        0.1        0.1000102 ]]


In [108]:
# NOTE: `vectorizer` is shared across the bank sections and is refitted on each
# corpus, so this cell must run right after the Crédit Agricole fit cell;
# otherwise the vocabulary would belong to whichever bank was vectorized last.
feature_names_cag = vectorizer.get_feature_names_out()
# List 10 words for each Crédit Agricole topic (output below).
print_top_words(lda_cag, feature_names_cag, n_top_words=10)

Topic #0:
et le en la des les pour nous du dans
Topic #1:
pour en nous et sia sur des le creditagricole votre
Topic #2:
ukraine companies financing russian agricole russia credit group hi activities
Topic #3:
la le dans analyse pour et du des eco qui
Topic #4:
et du pour les nos la le résultats aca des
Topic #5:
la pour des le les et en sur avec du
Topic #6:
vous en le ce la nous et bonjour les plus
Topic #7:
aca results la plasticodyssey lefebvre time dominique le financial président
Topic #8:
les la des et nous en une nos notre dans
Topic #9:
directeur crédit général agricole du sa suspended cc russia le


In [109]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_cag[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_cag.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_cag = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_cag = df_top_words_cag.T

# Name consumed by the Excel export cell.
df_cag_prefixed = df_top_words_transposed_cag

## Barclays PLC

In [110]:
tweets_barclays = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/Barclays_since_2018_01_01.csv')

# Raw tweet bodies -> cleaned strings (clean_text lower-cases, strips digits,
# URLs and punctuation, and drops English stop words).
texts_barclays = tweets_barclays['Tweet'].tolist()
clean_texts_barclays = [clean_text(text) for text in texts_barclays]

# Bag-of-words counts. The shared `vectorizer` is refitted here, so from this
# point on its vocabulary describes the Barclays corpus only.
X_barclays = vectorizer.fit_transform(clean_texts_barclays)

# Fit a 10-topic LDA. random_state pins the otherwise random initialisation so
# that re-running the cell reproduces the same topics.
lda_barclays = LatentDirichletAllocation(n_components=10, random_state=42)
# fit_transform (not fit) so that topics_barclays holds the document-topic
# matrix, as topics_nab / topics_wbc do; fit() alone returns the estimator.
topics_barclays = lda_barclays.fit_transform(X_barclays)

# Topic-word weight matrix: one row per topic, one column per vocabulary term.
print(lda_barclays.components_)

[[ 5.18690744  0.1         0.1000089  ... 30.37046102  0.1
   1.15796717]
 [ 9.81361676  0.1         0.10000222 ...  0.23624142  0.10001133
   1.10000551]
 [11.75047246  0.1         0.10000113 ... 14.96146138  0.10000306
   0.1       ]
 ...
 [ 0.10000293  0.1         0.1        ...  0.88703341  0.1000072
   0.1       ]
 [ 1.151914    0.1         0.1        ...  0.10000116  2.29027644
   0.1       ]
 [ 5.50614454  0.1         0.10000121 ...  0.10001739  0.1
   2.7919697 ]]


In [111]:
# NOTE: `vectorizer` is shared across the bank sections and is refitted on each
# corpus, so this cell must run right after the Barclays fit cell; otherwise
# the vocabulary would belong to whichever bank was vectorized last.
feature_names_barclays = vectorizer.get_feature_names_out()
# List 10 words for each Barclays topic (output below).
print_top_words(lda_barclays, feature_names_barclays, n_top_words=10)

Topic #0:
youve available pm youre thanks askbarclaysus im team sorry need
Topic #1:
group cs venkatakrishnan results barclaysresults read executive ceo announcement chief
Topic #2:
barclaysukhelp thanks dm hi help tweet hey great weve reaching
Topic #3:
barclays info note left new day read years ive leave
Topic #4:
sorry youre im hi thanks help know hear hope let
Topic #5:
complaint log like youd fully include complaints link concerns investigate
Topic #6:
dm number postcode pop contact help support know ill like
Topic #7:
barclays banking post office local uk theyre know new app
Topic #8:
colleagues barclays customers clients year happy world proud work communities
Topic #9:
customers need rewards reach like app colleagues team youll month


In [112]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_barclays[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_barclays.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_barclays = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_barclays = df_top_words_barclays.T

# Name consumed by the Excel export cell.
df_barclays_prefixed = df_top_words_transposed_barclays

## Banco Santander SA (BSSA)

In [113]:
tweets_bssa = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/bancosantander_since_2018_01_01.csv')

# Raw tweet bodies -> cleaned strings (clean_text lower-cases, strips digits,
# URLs and punctuation, and drops English stop words).
# NOTE(review): only English stop words are removed, so Spanish function words
# ("la", "para", "en", "que", ...) dominate several topics in the output below;
# consider adding Spanish stop words for this account.
texts_bssa = tweets_bssa['Tweet'].tolist()
clean_texts_bssa = [clean_text(text) for text in texts_bssa]

# Bag-of-words counts. The shared `vectorizer` is refitted here, so from this
# point on its vocabulary describes the Banco Santander corpus only.
X_bssa = vectorizer.fit_transform(clean_texts_bssa)

# Fit a 10-topic LDA. random_state pins the otherwise random initialisation so
# that re-running the cell reproduces the same topics.
lda_bssa = LatentDirichletAllocation(n_components=10, random_state=42)
# fit_transform (not fit) so that topics_bssa holds the document-topic matrix,
# as topics_nab / topics_wbc do; fit() alone returns the estimator itself.
topics_bssa = lda_bssa.fit_transform(X_bssa)

# Topic-word weight matrix: one row per topic, one column per vocabulary term.
print(lda_bssa.components_)

[[0.1        1.10000265 1.10000579 ... 2.09999022 3.09999393 0.10006897]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        1.09999734 0.1        ... 0.1        0.1        1.0999961 ]
 ...
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [1.10000402 0.1        0.1        ... 0.1        0.1        0.1       ]]


In [114]:
# NOTE: `vectorizer` is shared across the bank sections and is refitted on each
# corpus, so this cell must run right after the Banco Santander fit cell;
# otherwise the vocabulary would belong to whichever bank was vectorized last.
feature_names_bssa = vectorizer.get_feature_names_out()
# List 10 words for each Banco Santander topic (output below).
print_top_words(lda_bssa, feature_names_bssa, n_top_words=10)

Topic #0:
team know work dont phishinggruposantanderes best tips happy congratulations thank
Topic #1:
la para santander en al las esta resultados por es
Topic #2:
know million bank customers digital want year new global profit
Topic #3:
en la el los para más del que las santander
Topic #4:
san madrid acción cierre bancosantander really mn investors analysts broadcast
Topic #5:
que la por en para te gracias el lo tu
Topic #6:
em santander_br te tu una twitter mucho poder banco uma
Topic #7:
santander digital new financial anabotin today banking international group world
Topic #8:
para en por que el las es gracias la los
Topic #9:
convocatoria nómina tu euros esta santander proyecto la ganadora universia


In [115]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_bssa[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_bssa.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_bssa = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bssa = df_top_words_bssa.T

# Name consumed by the Excel export cell.
df_bssa_prefixed = df_top_words_transposed_bssa

## Groupe BPCE

In [116]:
tweets_bpce = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/GroupeBPCE_since_2018_01_01.csv')

# Raw tweet bodies -> cleaned strings (clean_text lower-cases, strips digits,
# URLs and punctuation, and drops English stop words).
# NOTE(review): only English stop words are removed, so French function words
# ("et", "les", "la", "des", ...) dominate nearly every topic in the output
# below; consider adding French stop words for this account.
texts_bpce = tweets_bpce['Tweet'].tolist()
clean_texts_bpce = [clean_text(text) for text in texts_bpce]

# Bag-of-words counts. The shared `vectorizer` is refitted here, so from this
# point on its vocabulary describes the Groupe BPCE corpus only.
X_bpce = vectorizer.fit_transform(clean_texts_bpce)

# Fit a 10-topic LDA. random_state pins the otherwise random initialisation so
# that re-running the cell reproduces the same topics.
lda_bpce = LatentDirichletAllocation(n_components=10, random_state=42)
# fit_transform (not fit) so that topics_bpce holds the document-topic matrix,
# as topics_nab / topics_wbc do; fit() alone returns the estimator itself.
topics_bpce = lda_bpce.fit_transform(X_bpce)

# Topic-word weight matrix: one row per topic, one column per vocabulary term.
print(lda_bpce.components_)

[[0.10001282 3.74075116 1.09995037 ... 2.54932992 0.1        0.10002715]
 [0.1        0.1        0.1        ... 0.1000042  0.10001    0.1       ]
 [0.1        0.1        0.1        ... 3.13513329 0.1        0.1       ]
 ...
 [0.1        0.1        1.10000665 ... 3.09989623 0.1        0.1       ]
 [3.09998152 2.41609647 0.1        ... 0.10006254 0.1        1.0999737 ]
 [0.1        3.06423958 0.1        ... 1.75492456 0.1        0.1       ]]


In [117]:
# NOTE: `vectorizer` is shared across the bank sections and is refitted on each
# corpus, so this cell must run right after the Groupe BPCE fit cell; otherwise
# the vocabulary would belong to whichever bank was vectorized last.
feature_names_bpce = vectorizer.get_feature_names_out()
# List 10 words for each Groupe BPCE topic (output below).
print_top_words(lda_bpce, feature_names_bpce, n_top_words=10)

Topic #0:
et les la des le en pour du sur avec
Topic #1:
du les des groupe bpce résultats par le et sur
Topic #2:
du et le les pour des sport en la groupe
Topic #3:
la voilebanquepop et avec en le pour les populaire une
Topic #4:
la le et par en des pour une france est
Topic #5:
la et du les le pour groupe bpce des en
Topic #6:
le la et des en les avec bpce groupe pour
Topic #7:
du le groupe bpce la paris et des jeux laurent
Topic #8:
vous le bien bonjour cordialement je dm si nous votre
Topic #9:
et des du le en bpce la groupe sur les


In [118]:
# For every topic, rank the vocabulary by weight and keep the ten heaviest
# terms: argsort() is ascending, so [:-11:-1] walks the last ten backwards.
top_words_per_topic = [
    [feature_names_bpce[word_idx] for word_idx in weights.argsort()[:-11:-1]]
    for weights in lda_bpce.components_
]

# Rows = topics here; transposing gives one column per topic and one row per
# rank (row 0 holds each topic's heaviest word).
df_top_words_bpce = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bpce = df_top_words_bpce.T

# Name consumed by the Excel export cell.
df_bpce_prefixed = df_top_words_transposed_bpce

In [119]:
# Stack every bank's top-words table into a single sheet.
#
# Sheet layout, top to bottom: a one-cell region row, then for each bank in
# that region a one-cell name row followed by the bank's top-words table
# (one column per topic). Label cells live in column 0.
#
# Changes from the earlier copy-pasted version of this cell:
#   * Northern Trust was labelled 'Standard Bank'  -> now 'Northern Trust'.
#   * Groupe BPCE was labelled 'BNP Paribas'       -> now 'Groupe BPCE'.
#   * df_bnp1_prefixed (BNP Paribas Asset Management) was computed but never
#     exported -> now written right after BNP Paribas.
#   * The sheet is built with ONE pd.concat instead of re-concatenating a
#     growing frame (starting from an empty DataFrame) after every bank.
banks_by_region = [
    ('Canada', [
        ('RBC', df_rbc_prefixed),
        ('CIBC', df_cibc_prefixed),
        ('ScotiaBank', df_scotia_prefixed),
        ('TD_Canada_News', df_tdcanada_prefixed),
        ('TD_Canada', df_tdcanada1_prefixed),
    ]),
    ('US', [
        ('TD_US_News', df_tdus_prefixed),
        ('TD_US', df_tdus1_prefixed),
        ('Morgan Stanley', df_ms_prefixed),
        ('UBS', df_ubs_prefixed),
        ('Citi', df_citi_prefixed),
        ('Wells Fargo', df_wf_prefixed),
        ('Bank of America', df_boa_prefixed),
        ('JP Morgan', df_jp_prefixed),
        ('Raymond James', df_rj_prefixed),
    ]),
    ('Africa', [
        ('Quant Africa', df_qa_prefixed),
        ('Standard Bank', df_sb_prefixed),
        # NOTE(review): position kept from the original cell, which listed
        # Northern Trust inside the Africa group; confirm the intended region.
        ('Northern Trust', df_nt_prefixed),
        ('UBA', df_uba_prefixed),
    ]),
    ('Asia', [
        ('HSBC', df_hsbc_prefixed),
        ('OCBC Premier Banking', df_ocbc_prefixed),
        ('Bank of Singapore', df_bos_prefixed),
        ('Hana Bank', df_hana_prefixed),
    ]),
    ('Oceania', [
        ('ANZ', df_anz_prefixed),
        ('CBA', df_cba_prefixed),
        ('NAB', df_nab_prefixed),
        ('WBC', df_wbc_prefixed),
    ]),
    ('Europe', [
        ('BNP Paribas', df_bnp_prefixed),
        ('BNP Paribas Asset Management', df_bnp1_prefixed),
        ('Crédit Agricole Group', df_cag_prefixed),
        ('Barclays PLC', df_barclays_prefixed),
        ('Banco Santander SA', df_bssa_prefixed),
        ('Groupe BPCE', df_bpce_prefixed),
    ]),
]

# Collect the pieces in sheet order, then concatenate once.
export_parts = []
for region_name, region_banks in banks_by_region:
    # One-cell header row carrying the region name.
    export_parts.append(pd.DataFrame([[region_name]], columns=[0]))
    for bank_name, bank_top_words in region_banks:
        # One-cell header row carrying the bank name, then its topics table.
        export_parts.append(pd.DataFrame([[bank_name]], columns=[0]))
        export_parts.append(bank_top_words)

df_final = pd.concat(export_parts, ignore_index=True)

excel_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ResultIntegration.xlsx'
df_final.to_excel(excel_path, sheet_name='Bank_Topics', index=False)