## RBCWealth

In [1]:
import pandas as pd

tweets_rbc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/RBCwealth_since_2018_01_01.csv')

texts_rbc = tweets_rbc['Tweet'].tolist()

In [2]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')

#Define cleaning
def clean_text(text):
    text = str(text)
    text = text.lower()  # convert to lower case
    text = re.sub(r'\d+', '', text)  # remove number
    text = re.sub(r'http\S+', '', text)  # remove url
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation mark
    tokens = word_tokenize(text)  
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # remove stop words
    return ' '.join(tokens)

clean_texts_rbc = [clean_text(text) for text in texts_rbc]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/houhiroshisakai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/houhiroshisakai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Latent Dirichlet Allocation
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# ContextVectorize
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_rbc = vectorizer.fit_transform(clean_texts_rbc)

# Apply LDA
lda_rbc = LatentDirichletAllocation(n_components=10)
lda_rbc.fit(X_rbc)

print(lda_rbc.components_)

[[0.1        4.09998158 1.09999021 ... 0.10005141 2.10003738 0.1       ]
 [0.10007079 0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.10001903 0.10000307 0.1        ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        0.1        ... 0.1        2.09999821 0.1       ]
 [2.0999966  0.10002996 0.10001177 ... 0.1        0.1        0.1       ]
 [3.09996042 0.1        1.09999802 ... 0.1        1.09980963 0.1       ]]


In [4]:
feature_names_rbc = vectorizer.get_feature_names_out()

In [5]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

print_top_words(lda_rbc, feature_names_rbc, n_top_words=10)

Topic #0:
learn rbc art help read communities wealth people canada artists
Topic #1:
learn read rbc market make financial career new clients women
Topic #2:
learn help read financial support impact teamrbc education work wealth
Topic #3:
global insight learn latest investors weekly economic read markets market
Topic #4:
watch wealthy barber chilton david executor learn estate family young
Topic #5:
learn wealth women financial rbc help business impact read investing
Topic #6:
rbc read jersey support wealth partnership management programme proud delighted
Topic #7:
read learn rbc indigenous digital help new world investing difference
Topic #8:
learn investors financial read year market markets protect health earnings
Topic #9:
learn wealth plan important family read financial planning consider future


In [6]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_rbc.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_rbc[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_rbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_rbc = df_top_words_rbc.T 

# Save the DataFrame to an Excel file
#excel_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ResultIntegration.xlsx'
#df_top_words_transposed_rbc.to_excel(excel_path, index=False)

In [7]:
df_rbc_prefixed = df_top_words_transposed_rbc

## CIBC

In [8]:
tweets_cibc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/CIBCwealth_since_2018_01_01.csv')

In [9]:
texts_cibc = tweets_cibc['Tweet'].tolist()

In [10]:
clean_texts_cibc = [clean_text(text) for text in texts_cibc]

In [11]:
# ContextVectorize
X_cibc = vectorizer.fit_transform(clean_texts_cibc)

# Apply LDA
lda_cibc = LatentDirichletAllocation(n_components=10)
topics_cibc = lda_cibc.fit(X_cibc)

print(lda_cibc.components_)

[[0.1        0.1        0.1        ... 0.1        2.10000535 0.1       ]
 [1.10000213 0.1        0.1        ... 2.19702399 0.10000192 2.10002401]
 [0.1        2.1000002  4.73427713 ... 1.19461948 2.37168403 0.1000047 ]
 ...
 [1.09999787 0.1        2.465384   ... 0.10001347 9.82824524 0.10015446]
 [0.1        1.0999998  0.1        ... 0.1        0.10003734 0.1       ]
 [0.1        0.1        0.1        ... 1.09994808 2.1000046  0.1       ]]


In [12]:
feature_names_cibc = vectorizer.get_feature_names_out()

In [13]:
print_top_words(lda_cibc, feature_names_cibc, n_top_words=10)

Topic #0:
canadian canada wishing day happy loved ones investors bank investing
Topic #1:
markets fed icymi benjamin tal tax planning learn experts hear
Topic #2:
tax jamie golombek cibcs federal cibc budget new market explains
Topic #3:
cibc today celebrate communities day inflation canada dyk indigenous learn
Topic #4:
oil recession cibcs perspectives year economy inflation selling markets says
Topic #5:
week investment available economic weekly roundup watch discuss markets reading
Topic #6:
team year market visit investment family ambitions information families leadership
Topic #7:
wealth wood gundy cibc advisors advisor women planning experience years
Topic #8:
inflation news rate rates markets end higher income says durantaye
Topic #9:
rate et pm register interestrates bank join cibcfamilyoffice canada economy


In [14]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_rbc.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_rbc[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_cibc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_cibc = df_top_words_cibc.T 

df_cibc_prefixed = df_top_words_transposed_cibc

In [15]:
df_cibc_prefixed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,learn,learn,learn,global,watch,learn,rbc,read,learn,learn
1,rbc,read,help,insight,wealthy,wealth,read,learn,investors,wealth
2,art,rbc,read,learn,barber,women,jersey,rbc,financial,plan
3,help,market,financial,latest,chilton,financial,support,indigenous,read,important
4,read,make,support,investors,david,rbc,wealth,digital,year,family
5,communities,financial,impact,weekly,executor,help,partnership,help,market,read
6,wealth,career,teamrbc,economic,learn,business,management,new,markets,financial
7,people,new,education,read,estate,impact,programme,world,protect,planning
8,canada,clients,work,markets,family,read,proud,investing,health,consider
9,artists,women,wealth,market,young,investing,delighted,difference,earnings,future


In [16]:
df_final = pd.DataFrame()

In [17]:
#Store RBC to excel file
df_rbc_with_name = pd.DataFrame([['RBC']], columns=[0])  # Only a cell includes "RBC"
df_rbc_with_topics = pd.concat([df_rbc_with_name, df_rbc_prefixed], ignore_index=True)


df_final = pd.concat([df_final, df_rbc_with_topics], ignore_index=True)
#Store CIBC to excel file
df_cibc_with_name = pd.DataFrame([['CIBC']], columns=[0])  # 只有一个单元格包含"CIBC"
df_cibc_with_topics = pd.concat([df_cibc_with_name, df_cibc_prefixed], ignore_index=True)

excel_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ResultIntegration.xlsx'

df_final = pd.concat([df_final, df_cibc_with_topics], ignore_index=True)
df_final.to_excel(excel_path, sheet_name='Bank_Topics', index=False)

## BMO

In [17]:
tweets_bmo = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BMO_since_2018_01_01.csv')

texts_bmo = tweets_bmo['Tweet'].tolist()

In [18]:
clean_texts_bmo = [clean_text(text) for text in texts_bmo]

In [19]:
# ContextVectorize
X_bmo = vectorizer.fit_transform(clean_texts_bmo)

# Apply LDA
lda_bmo = LatentDirichletAllocation(n_components=10)
topics_bmo = lda_bmo.fit_transform(X_bmo)

print(lda_bmo.components_)

[[ 0.1         0.10000756  9.69730682 ...  0.1         0.1
   0.1       ]
 [ 0.1         0.1000031   0.1000004  ...  0.1001267  10.09999501
   0.10012454]
 [ 0.1        16.23299101  9.08592481 ...  0.1         0.10000054
   0.65406565]
 ...
 [ 0.1         0.10004352 14.85448769 ...  0.1000048   0.1
   1.54575797]
 [ 0.10000418  6.8385215   2.30359299 ...  0.1         0.10000162
   0.1       ]
 [ 0.10000576  0.10000814  0.10000853 ...  0.1         0.10000269
   0.10003865]]


In [20]:
feature_names_bmo = vectorizer.get_feature_names_out()

In [21]:
print_top_words(lda_bmo, feature_names_bmo, n_top_words=10)

Topic #0:
send dm sorry learn hi thank hello like help message
Topic #1:
bmo financial learn business bmoforwomen join pm today new women
Topic #2:
working apologize possible patience wait times soon banking online hi
Topic #3:
rl im hello send message sorry help youre ryan private
Topic #4:
nc know thank investsmart wethenorth let help great ns northovereverything
Topic #5:
hi mf phone thanks number nc reach write review forward
Topic #6:
thank dm hi assist help hello reaching send ds gladly
Topic #7:
feedback thanks app bmo appreciate message good mobile hi thank
Topic #8:
wait send hi times apologize dm experienced feel message free
Topic #9:
branch bmo nous vous visit et bank bds branches je


In [27]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_bmo.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_bmo[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_bmo = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bmo = df_top_words_bmo.T 

df_bmo_prefixed = df_top_words_transposed_bmo

## Scotiabank 

In [24]:
tweets_scotia = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/scotiabank_since_2018_01_01.csv')

texts_scotia = tweets_scotia['Tweet'].tolist()

clean_texts_scotia = [clean_text(text) for text in texts_scotia]

# ContextVectorize
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_scotia = vectorizer.fit_transform(clean_texts_scotia)

# Apply LDA
lda_scotia = LatentDirichletAllocation(n_components=10)
topics_scotia = lda_scotia.fit(X_scotia)

print(lda_scotia.components_)


[[ 0.1         0.1         0.10001233 ...  0.10000098  0.1
   0.1       ]
 [ 0.1         0.1        14.0999311  ...  0.1         0.1
   0.1       ]
 [ 0.1         4.10001711  0.10001351 ...  0.1         1.09988117
   0.1       ]
 ...
 [ 0.1         0.1         0.10000117 ...  0.1         0.1
   0.1       ]
 [ 2.10000161  1.09998782  0.1        ...  0.1         0.1
   1.09993987]
 [ 1.09999212  0.1         0.10000232 ...  0.10000101  0.1
   0.1       ]]


In [25]:
feature_names_scotia = vectorizer.get_feature_names_out()

In [26]:
print_top_words(lda_scotia, feature_names_scotia, n_top_words=10)

Topic #0:
financial good year new canada education fraud luck tonight edmontonoilers
Topic #1:
hockey hockeyforall game scotiabank accessible inclusive year canada diverse help
Topic #2:
scotiabank scene latest episode money earn support points read credit
Topic #3:
la les et des le que taux pour du en
Topic #4:
help learn work scotiabank best canadian canadians canada new like
Topic #5:
holiday financial season spending canada time advice plan growth according
Topic #6:
credit score money canada ces fund way scores macklem canadian
Topic #7:
rate inflation canada bank rates economy boc hikes scotiabank canadians
Topic #8:
home financial scotiabank scotia savings help account save advisor fhsa
Topic #9:
listen follow spotify podcasts apple latest rate indigenous chief economist


In [28]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_scotia.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_scotia[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_scotia = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_scotia = df_top_words_scotia.T 

df_scotia_prefixed = df_top_words_transposed_scotia

## TD_US_News

In [29]:
tweets_tdus = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDNews_US_since_2018_01_01.csv')

texts_tdus = tweets_tdus['Tweet'].tolist()

clean_texts_tdus = [clean_text(text) for text in texts_tdus]

# ContextVectorize
X_tdus = vectorizer.fit_transform(clean_texts_tdus)

# Apply LDA
lda_tdus = LatentDirichletAllocation(n_components=10)
topics_tdus = lda_tdus.fit(X_tdus)

print(lda_tdus.components_)

[[0.10000569 0.1        0.1        ... 0.89393282 0.1        0.1       ]
 [3.09996456 0.1        0.1        ... 4.99788573 2.09998919 0.10000275]
 [0.10002963 0.1        0.1        ... 1.20304618 0.1        0.10000084]
 ...
 [0.1        1.09999803 0.1        ... 6.142038   0.10001225 0.10000967]
 [0.1        0.1        0.1        ... 3.74701698 0.1        0.1       ]
 [0.1        1.10000197 1.09999885 ... 0.10000988 0.1        0.1       ]]


In [30]:
feature_names_tdus = vectorizer.get_feature_names_out()

print_top_words(lda_tdus, feature_names_tdus, n_top_words=10)

Topic #0:
td learn new year tds check survey taking look results
Topic #1:
td read covid colleagues story day bank financial head diversity
Topic #2:
tdbank_us head wealth financial survey provides td strategist investment chief
Topic #3:
money tdbank_us couples talk learn tdbank_uss survey benefits business role
Topic #4:
td ready new year challenge learn check help grant organizations
Topic #5:
tips check holiday moneymattersmonday tdbank_us retail spending card save make
Topic #6:
tdbank_us financial survey learn read best year data cited credit
Topic #7:
business td learn small community help tdbank_us owners smallbusiness home
Topic #8:
td colleagues customers learn bank communities president check new ceo
Topic #9:
td read tds business learn finance experience proud businesses community


In [31]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_tdus.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_tdus[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_tdus = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdus = df_top_words_tdus.T 

df_tdus_prefixed = df_top_words_transposed_tdus

## TD US

In [33]:
tweets_tdus1 = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDBank_US_since_2018_01_01.csv')

texts_tdus1 = tweets_tdus1['Tweet'].tolist()

In [34]:
clean_texts_tdus1 = [clean_text(text) for text in texts_tdus1]

In [35]:
# ContextVectorize
X_tdus1 = vectorizer.fit_transform(clean_texts_tdus1)

# Apply LDA
lda_tdus1 = LatentDirichletAllocation(n_components=10)
topics_tdus1 = lda_tdus1.fit_transform(X_tdus1)

print(lda_tdus1.components_)

[[1.00636298e-01 1.01299926e-01 1.00004868e-01 ... 1.00067875e-01
  1.00000000e-01 1.00000000e-01]
 [1.00000001e-01 2.09818497e+00 5.27647957e+02 ... 1.22338687e+00
  1.00000000e-01 1.00000000e-01]
 [1.00000001e-01 1.00000002e-01 1.10102802e+02 ... 1.00000000e-01
  1.00026369e-01 1.00026369e-01]
 ...
 [1.00000001e-01 1.00000002e-01 2.69477796e+02 ... 1.00000001e-01
  1.00000000e-01 1.00000000e-01]
 [1.00000001e-01 1.00000003e-01 1.00008747e-01 ... 1.97648524e+00
  1.00000000e-01 1.00000000e-01]
 [2.09936369e+00 1.00451132e-01 9.57102180e+01 ... 1.00000001e-01
  2.09989523e+00 2.09989523e+00]]


In [36]:
feature_names_tdus1 = vectorizer.get_feature_names_out()

print_top_words(lda_tdus1, feature_names_tdus1, n_top_words=10)

Topic #0:
like dm hey account numbers wed learn chat send details
Topic #1:
dm send account feel like saw numbers tweet learn details
Topic #2:
dm account send numbers details ask like kindly good thank
Topic #3:
apologize working experiencing time patience online hold issues inconvenience issue
Topic #4:
card td feedback store know service customer debit business bank
Topic #5:
td bank information account hope visit helps contact reach online
Topic #6:
dm account send numbers happy help hear feel understand hey
Topic #7:
dm like noaccts details send wed learn additional feel concerns
Topic #8:
dm accts hi sorry lw help plz hear assistance make
Topic #9:
hope great day thanks happy glad thank welcome hear weekend


In [37]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_tdus1.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_bmo[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_tdus1 = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdus1 = df_top_words_tdus1.T 

df_tdus1_prefixed = df_top_words_transposed_tdus1

## TD_Canada_New

In [38]:
tweets_tdcanada = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDNews_Canada_since_2018_01_01.csv')

texts_tdcanada = tweets_tdcanada['Tweet'].tolist()

In [39]:
clean_texts_tdcanada = [clean_text(text) for text in texts_tdcanada]

In [40]:
# ContextVectorize
X_tdcanada = vectorizer.fit_transform(clean_texts_tdcanada)

# Apply LDA
lda_tdcanada = LatentDirichletAllocation(n_components=10)
topics = lda_tdcanada.fit_transform(X_tdcanada)

print(lda_tdcanada.components_)

[[0.1        0.1        0.1        ... 0.1        3.10001432 0.1       ]
 [0.1        3.1        0.1        ... 2.1000004  5.10006016 0.1       ]
 [0.100001   0.1        0.10000202 ... 0.10001121 1.09997297 0.1       ]
 ...
 [0.1        0.1        0.1        ... 0.1        0.1        1.10000095]
 [0.1        0.1        0.1        ... 0.1        1.09990886 1.09999726]
 [0.10000101 0.1        1.09992148 ... 0.1        0.1        1.10000179]]


In [41]:
feature_names_tdcanada = vectorizer.get_feature_names_out()

print_top_words(lda_tdcanada, feature_names_tdcanada, n_top_words=10)

Topic #0:
help td canadians canada investing retirement tax new consider savings
Topic #1:
help tips fraud learn canadians canadian financial protect insurance bank
Topic #2:
td new support help colleagues customers communities lgbtq heres proud
Topic #3:
brucecooper_td tdassetmanagement weeks podcast marketperspectives mortgage td economy economic explains
Topic #4:
financial td tds canada learn help home indigenous things canadians
Topic #5:
td community heres help learn canadians scam money make protect
Topic #6:
financial tds planning help bank shares holiday td manager helping
Topic #7:
digital td customers experiences banking customer experience officer intelligence weve
Topic #8:
td ceo bharat masrani tds learn group read canadian black
Topic #9:
financial td canadians covid canadian new money youre tips finances


In [42]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_tdcanada.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_tdcanada[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_tdcanada = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdcanada = df_top_words_tdcanada.T 

df_tdcanada_prefixed = df_top_words_transposed_tdcanada

## TD Canada

In [43]:
tweets_tdcanada1 = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/TDNews_Canada_since_2018_01_01.csv')

texts_tdcanada1 = tweets_tdcanada1['Tweet'].tolist()

clean_texts_tdcanada1 = [clean_text(text) for text in texts_tdcanada]

# ContextVectorize
X_tdcanada1 = vectorizer.fit_transform(clean_texts_tdcanada1)

# Apply LDA
lda_tdcanada1 = LatentDirichletAllocation(n_components=10)
topics_tdcanada1 = lda_tdcanada1.fit_transform(X_tdcanada1)

print(lda_tdcanada1.components_)

[[0.10001113 1.09999602 0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 1.09999496 0.1        3.09999845]
 [0.1        0.1        3.1        ... 0.1        0.1        0.1       ]
 ...
 [2.09998529 0.1        0.1        ... 0.1        7.94231811 0.1       ]
 [0.1        0.1        0.1        ... 3.10000504 0.10000579 0.1       ]
 [0.10000358 0.1        0.1        ... 0.1        0.1        0.1       ]]


In [44]:
feature_names_tdcanada1 = vectorizer.get_feature_names_out()

print_top_words(lda_tdcanada1, feature_names_tdcanada1, n_top_words=10)

Topic #0:
td shares financial inclusive innovation help covid work meet colleagues
Topic #1:
td news tds month new dont community weve got worry
Topic #2:
brucecooper_td tdassetmanagement weeks podcast marketperspectives protect help canadians fraud learn
Topic #3:
td investing experience tips energy tds season canada direct holiday
Topic #4:
td financial help ai new bank canadian canadians app customers
Topic #5:
td help work canadian tds banking learn thereadycommitment future community
Topic #6:
td bharat ceo masrani canada tds financial list group economic
Topic #7:
td help financial canadians new retirement planning ways mortgage budget
Topic #8:
digital td money financial mortgage customers tds best bank learn
Topic #9:
td customers canadian financial support read indigenous banking help heres


In [45]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_tdcanada1.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_tdcanada1[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_tdcanada1 = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_tdcanada1 = df_top_words_tdcanada1.T 

df_tdcanada1_prefixed = df_top_words_transposed_tdcanada1

# US Company

## Morgan Stanley

In [47]:
tweets_ms = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/MorganStanley_since_2018_01_01.csv')

texts_ms = tweets_ms['Tweet'].tolist()

clean_texts_ms = [clean_text(text) for text in texts_ms]

# ContextVectorize
X_ms = vectorizer.fit_transform(clean_texts_ms)

# Apply LDA
lda_ms = LatentDirichletAllocation(n_components=10)
topics_ms = lda_ms.fit(X_ms)

print(lda_ms.components_)

[[0.10028971 0.1        1.09999759 ... 0.1        0.1        0.1       ]
 [0.10002185 0.10000609 0.1        ... 0.1        0.1        0.10001165]
 [2.09923498 0.1        0.1        ... 0.1        0.1        0.1       ]
 ...
 [0.10003105 0.10001429 0.1        ... 0.1        0.10000667 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.10000108 0.1       ]
 [0.10008442 0.1        0.1        ... 0.1        0.1        0.10000341]]


In [48]:
feature_names_ms = vectorizer.get_feature_names_out()
print_top_words(lda_ms, feature_names_ms, n_top_words=10)

Topic #0:
morgan msgivesback stanley employees program work london support learn summer
Topic #1:
sustainable investing learn help make financial best challenge change says
Topic #2:
investors investment markets growth market chief officer morgan wealth management
Topic #3:
sichallenge ceo team fund morgan stanley james opportunity access global
Topic #4:
morgan stanley billion net year ms management earnings revenues director
Topic #5:
multicultural innovation morgan learn stanley lab sustainability women entrepreneurs mcil
Topic #6:
policy eagleup public eagles year booktrustusa market michael plan zezas
Topic #7:
morgan stanley learn head financial help make diversity global stanleys
Topic #8:
global chief market markets equity strategist discusses team investors andrew
Topic #9:
morgan stanley new health learn mental investors investment global stanleys


In [49]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_ms.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_ms[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_ms = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_ms = df_top_words_ms.T 

df_ms_prefixed = df_top_words_transposed_ms

## UBS

In [51]:
tweets_ubs = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/UBS_since_2018_01_01.csv')

texts_ubs = tweets_ubs['Tweet'].tolist()

clean_texts_ubs = [clean_text(text) for text in texts_ubs]

# ContextVectorize
X_ubs = vectorizer.fit_transform(clean_texts_ubs)

# Apply LDA
lda_ubs = LatentDirichletAllocation(n_components=10)
topics_ubs = lda_ubs.fit_transform(X_ubs)

print(lda_ubs.components_)

[[0.1        0.1        0.1        ... 0.1        0.1        7.99126146]
 [0.1        0.1        0.1        ... 0.1        1.09999024 0.10000456]
 [0.1        0.1        0.1        ... 0.10000935 0.1        0.1000147 ]
 ...
 [0.1000013  0.1        0.1        ... 0.1        0.10000963 5.44433962]
 [0.1        0.1        0.1        ... 2.09998981 1.10000014 1.21417386]
 [0.1        0.1        0.1000278  ... 0.1        0.1        5.13154582]]


In [52]:
feature_names_ubs = vectorizer.get_feature_names_out()
print_top_words(lda_ubs, feature_names_ubs, n_top_words=10)

Topic #0:
shareubs best read private bank proud awards industry years ubsresearch
Topic #1:
shareubs investors ubsresearch survey research investor inflation ubsinvestorsentiment insights team
Topic #2:
results ubs ceo quarter live shareubs ermotti sergio starts group
Topic #3:
shareubs ubsresearch report ai latest business companies world billionaires data
Topic #4:
shareubs ubs future finance learn challenge foundation ubsinnovate ubsresearch ubss
Topic #5:
shareubs ubs global investment ubsresearch china market investors head ubsconf
Topic #6:
shareubs paul economist chief women ubs donovan report ownyourworth financial
Topic #7:
shareubs togetherband ubs thetogetherband conversation support iwd join sdg learn
Topic #8:
shareubs report family investors learn nobel nobelperspectives global new office
Topic #9:
shareubs impact sustainable future help make wef sustainability learn togetherband


In [53]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_ubs.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_ubs[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_ubs = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_ubs = df_top_words_ubs.T 

df_ubs_prefixed = df_top_words_transposed_ubs

## Citi

In [54]:
tweets_citi = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/Citi_since_2018_01_01.csv')

texts_citi = tweets_citi['Tweet'].tolist()

clean_texts_citi = [clean_text(text) for text in texts_citi]

# ContextVectorize
X_citi = vectorizer.fit_transform(clean_texts_citi)

# Apply LDA
lda_citi = LatentDirichletAllocation(n_components=10)
topics_citi = lda_citi.fit(X_citi)

print(lda_citi.components_)

[[3.09999846 0.10003112 3.10003045 ... 0.10001294 0.1        0.1       ]
 [0.1        2.10000982 0.10000595 ... 0.1        0.10000921 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.10001859 0.10001769]
 ...
 [0.1        0.1        0.10018225 ... 0.1        0.10003095 0.1       ]
 [0.1        0.1        0.10000197 ... 3.09997332 0.10000787 0.1       ]
 [0.10000154 0.10001087 0.1        ... 0.1        2.09993338 0.1       ]]


In [55]:
feature_names_citi = vectorizer.get_feature_names_out()

print_top_words(lda_citi, feature_names_citi, n_top_words=10)

Topic #0:
citi proud standforprogress support learn work diversity colleagues continue community
Topic #1:
citi community learn veterans new today day support committed families
Topic #2:
citi report read gps latest new global sustainability digital chief
Topic #3:
best teamciti bank proud named services people awards disabilities private
Topic #4:
ceo jane citi fraser corbat mike global information citis financial
Topic #5:
citi digital citis future banking innovation global money citidigimoney home
Topic #6:
experience citis new clients watch citi solutions digital global treasury
Topic #7:
citi learn global payments treasury clients new citis digital companies
Topic #8:
pathwaysprogress citi youth young foundation youthcolab entrepreneurs learn looking people
Topic #9:
citi support help learn community global foundation communities women proud


In [56]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_citi.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_citi[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_citi = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_citi = df_top_words_citi.T 

df_citi_prefixed = df_top_words_transposed_citi

## Wells Fargo

In [63]:
tweets_wf = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/WellsFargo_since_2018_01_01.csv')

texts_wf = tweets_wf['Tweet'].tolist()

clean_texts_wf = [clean_text(text) for text in texts_wf]

# ContextVectorize
X_wf = vectorizer.fit_transform(clean_texts_wf)

# Apply LDA
lda_wf = LatentDirichletAllocation(n_components=10)
topics_wf = lda_wf.fit(X_wf)

print(lda_wf.components_)

[[ 0.1         0.1         0.1000003  ...  0.10001227  0.1
   0.10001452]
 [ 0.10000304  0.1         0.1        ...  0.10000049  0.1
   0.10000279]
 [ 0.10003722  0.1         0.10000962 ...  0.10000369  0.1
   0.1000005 ]
 ...
 [ 7.09981284  2.099915    6.09996    ...  0.10001761  0.10000974
   8.38210148]
 [ 0.10001258  0.1         0.10000028 ...  0.10001093  0.10000223
  25.51461551]
 [ 0.10012862  0.10007821  0.10002561 ...  0.10000136  6.09997831
   0.10543643]]


In [64]:
feature_names_wf = vectorizer.get_feature_names_out()
print_top_words(lda_wf, feature_names_wf, n_top_words=10)

Topic #0:
account numbers help tell need tweet know specific like happened
Topic #1:
account make sure dm like numbers issues mind experience address
Topic #2:
working issue support happy technical gift apologize ddg dm wed
Topic #3:
speak banker assistance hi contact need visit information card thanks
Topic #4:
account dm details numbers help sorry best hello security thank
Topic #5:
wells fargo experience information address sure make provide account sorry
Topic #6:
online youre banking access able hi issues dm try inconvenience
Topic #7:
email learn visit thank hi receive information message customers forward
Topic #8:
dm number account phone send numbers like details address experience
Topic #9:
thank support thanks glad reach great chris time opportunity help


In [65]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_wf.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_wf[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_wf = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_wf = df_top_words_wf.T 

df_wf_prefixed = df_top_words_transposed_wf

## BOA (Bank of America)

In [66]:
tweets_boa = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BankofAmerica_since_2018_01_01.csv')

texts_boa = tweets_boa['Tweet'].tolist()

clean_texts_boa = [clean_text(text) for text in texts_boa]

# ContextVectorize
X_boa = vectorizer.fit_transform(clean_texts_boa)

# Apply LDA
lda_boa = LatentDirichletAllocation(n_components=10)
topics_boa = lda_boa.fit(X_boa)

print(lda_boa.components_)

[[0.1        0.1        2.80826351 ... 0.1        1.12077359 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.10010396]
 [1.09998774 0.1        0.1        ... 0.10000065 0.1        0.1       ]
 ...
 [0.1        0.10000318 0.10000131 ... 0.10000033 2.07923594 0.1       ]
 [5.10001226 0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        1.09997011 1.39172122 ... 0.1        0.1        0.1       ]]


In [67]:
feature_names_boa = vectorizer.get_feature_names_out()
print_top_words(lda_boa, feature_names_boa, n_top_words=10)

Topic #0:
tools digital cantstopbanking fans bank america impressive learn weekend love
Topic #1:
bettermoneyhabits help financial life tips youre plan new spending money
Topic #2:
help thats weve thanks important young provide supporting women conversation
Topic #3:
business help like link know opportunity let small economic connect
Topic #4:
great time bofavolunteers like make congrats bofastudentleaders pay bofapride payments
Topic #5:
banking app mobile help link account connect send use hi
Topic #6:
thank communities work appreciate shout support rewards proud cash help
Topic #7:
women program thanks change positive creating partnership leaders conversation bitlyjevmow
Topic #8:
hear glad sharing thanks erica financial business student help virtual
Topic #9:
support happy help learn communities day year season proud need


In [68]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_boa.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_boa[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_boa = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_boa = df_top_words_boa.T 

df_boa_prefixed = df_top_words_transposed_boa

## JP Morgan

In [69]:
tweets_jp = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/jpmorgan_since_2018_01_01.csv')

texts_jp = tweets_jp['Tweet'].tolist()

clean_texts_jp = [clean_text(text) for text in texts_jp]

# ContextVectorize
X_jp = vectorizer.fit_transform(clean_texts_jp)

# Apply LDA
lda_jp = LatentDirichletAllocation(n_components=10)
topics_jp = lda_jp.fit_transform(X_jp)

print(lda_jp.components_)

[[0.10001151 0.1        0.1        ... 0.1        1.09999945 0.1       ]
 [2.09997044 0.10004384 1.09990422 ... 0.10000647 0.1        0.1       ]
 [0.1        0.1        1.61737598 ... 0.1        0.1        0.10000845]
 ...
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        2.09995616 0.10005822 ... 0.1        0.1        0.1       ]
 [0.1        0.1        2.10000186 ... 0.1        0.1        2.09999155]]


In [70]:
feature_names_jp = vectorizer.get_feature_names_out()
print_top_words(lda_jp, feature_names_jp, n_top_words=10)

Topic #0:
global business payments market beer lori tech future jp morgans
Topic #1:
women business leadership leaders day career jpm advice share togetherwe
Topic #2:
payments business digital jpm erdoes management businesses years mary art
Topic #3:
ddftennis jpmcc day tennis clients jp dubai participants morgan congrats
Topic #4:
leaders tech jpm help program students industry business new support
Topic #5:
income jpm net eps usopen team reports curators global womens
Topic #6:
jp morgan new list summer research reading technology year ai
Topic #7:
jpmhc companies market jp tech capital healthcare industry trends investors
Topic #8:
jp global morgans markets outlook ceo economic head morgan market
Topic #9:
company investment asj false forex ai undersummit technology learning machine


In [71]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_jp.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_jp[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_jp = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_jp = df_top_words_jp.T 

df_jp_prefixed = df_top_words_transposed_jp

## Raymond James

In [72]:
tweets_rj = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/RaymondJames_since_2018_01_01.csv')

texts_rj = tweets_rj['Tweet'].tolist()

clean_texts_rj = [clean_text(text) for text in texts_rj]

# ContextVectorize
X_rj = vectorizer.fit_transform(clean_texts_rj)

# Apply LDA
lda_rj = LatentDirichletAllocation(n_components=10)
topics_rj = lda_rj.fit_transform(X_rj)

print(lda_rj.components_)

[[ 0.1         0.10000045  0.1        ...  0.10003177  7.09998969
   0.1       ]
 [ 2.09987612  0.1         0.1        ...  0.1         0.1000071
   0.1       ]
 [ 0.10012063 15.09999655 12.09999008 ...  6.0999386   0.1
   0.10001844]
 ...
 [ 0.1         0.10000123  0.10000992 ...  0.10001123  0.10000026
   0.1       ]
 [ 0.1         0.1         0.1        ...  0.1         0.10000216
   0.1       ]
 [ 0.1         0.10000163  0.1        ...  0.1         0.1
   0.1       ]]


In [73]:
feature_names_rj = vectorizer.get_feature_names_out()
print_top_words(lda_rj, feature_names_rj, n_top_words=10)

Topic #0:
learn ones loved make protect time started years help heres
Topic #1:
time consider start tips new jobsreport tax help highlights home
Topic #2:
discuss tune et policy pm analyst mills ed change subject
Topic #3:
rjcares associates learn james advisors raymond financial month support communities
Topic #4:
financial cio larryadamrj plan future year investors markets consider new
Topic #5:
chief scott brown economist record raymond james says year make
Topic #6:
raymond james today happy retirement legacy life community learn help
Topic #7:
know heres dont market theres lower mean volatility doesnt like
Topic #8:
james raymond financial plan help learn planning rjf nyse data
Topic #9:
markets heres economy experts look whats investment market expect investors


In [74]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_rj.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_rj[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_rj = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_rj = df_top_words_rj.T 

df_rj_prefixed = df_top_words_transposed_rj

# Africa

## Quant Africa

In [132]:
tweets_qa = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/quantafrica_since_2018_01_01.csv')

texts_qa = tweets_qa['Tweet'].tolist()

clean_texts_qa = [clean_text(text) for text in texts_qa]

# ContextVectorize
X_qa = vectorizer.fit_transform(clean_texts_qa)

# Apply LDA
lda_qa = LatentDirichletAllocation(n_components=10)
topics_qa = lda_qa.fit(X_qa)

print(lda_qa.components_)

[[0.1        0.1        1.56835098 ... 2.07757643 0.1        1.09999342]
 [0.1        0.1        1.10000144 ... 0.12242357 5.09992224 0.1       ]
 [0.1        0.1        0.1        ... 0.1        1.09998457 1.10000086]
 ...
 [0.1        0.1        1.09998235 ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        1.10000572]
 [2.09999495 0.1        0.1        ... 0.1        0.1        0.1       ]]


In [133]:
feature_names_qa = vectorizer.get_feature_names_out()
print_top_words(lda_qa, feature_names_qa, n_top_words=10)

Topic #0:
tech join today talentdev career time pm session miss space
Topic #1:
data tools analysis visualization powerful provides yes web software science
Topic #2:
learn used learning web olorunsheyi tech community quantaafrica markup google
Topic #3:
tech startup quanta africa session community idea talentdev offer dr
Topic #4:
quanta technology tech collaborate innovate new free alimosho quantaafrica techbros
Topic #5:
future cofounder tech quanta today innovation yes community way pertinencegroup
Topic #6:
session startup join tomorrow quanta idea day today pitch link
Topic #7:
website create official offers react creating library design allows interactive
Topic #8:
javascript web language programming covers used development google angular dont
Topic #9:
css year html quanta set amazing africa talentdev dont browser


In [134]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_qa.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_qa[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_qa = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_qa = df_top_words_qa.T 

df_qa_prefixed = df_top_words_transposed_qa

## Standard Bank

In [75]:
tweets_sb = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/StandardBankZA_since_2018_01_01.csv')

texts_sb = tweets_sb['Tweet'].tolist()

clean_texts_sb = [clean_text(text) for text in texts_sb]

# ContextVectorize
X_sb = vectorizer.fit_transform(clean_texts_sb)

# Apply LDA
lda_sb = LatentDirichletAllocation(n_components=10)
topics_sb = lda_sb.fit_transform(X_sb)

print(lda_sb.components_)

[[0.1        0.1        0.1        ... 0.10010024 0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.10006001 0.10001029 0.1       ]
 [0.1        2.09998136 1.10000165 ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        1.09998182 ... 0.1        0.1        0.1       ]
 [0.10007801 0.1        0.1        ... 0.1        1.099982   0.1       ]
 [0.1        0.1        0.1        ... 2.09983974 0.1        0.1       ]]


In [76]:
feature_names_sb = vectorizer.get_feature_names_out()
print_top_words(lda_sb, feature_names_sb, n_top_words=10)

Topic #0:
sblove instantmoneymondays money instant sblovessummer thats right wallet itcanbe answer
Topic #1:
account card hi banking funds app use need transaction link
Topic #2:
sblove sblovessummer welcome great youre thats love thanks wow glad
Topic #3:
thank hi getting team touch appreciate matter attention support assistance
Topic #4:
banking hi best hey say forward clients family experience makes
Topic #5:
app know let thank appreciate try love hi device banking
Topic #6:
dm details hi contact assist number send team like look
Topic #7:
hi beatthescam issues provide sblovessummer error app kindly help better
Topic #8:
team hi bank reach fraud standard payment address email provide
Topic #9:
sblove sblovessummer financial goals year savings like budget link youre


In [77]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_sb.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_sb[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_sb = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_sb = df_top_words_sb.T 

df_sb_prefixed = df_top_words_transposed_sb

## North Thern Trust

In [78]:
tweets_nt = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/NTWealth_since_2018_01_01.csv')

texts_nt = tweets_nt['Tweet'].tolist()

clean_texts_nt = [clean_text(text) for text in texts_nt]

# ContextVectorize
X_nt = vectorizer.fit_transform(clean_texts_nt)

# Apply LDA
lda_nt = LatentDirichletAllocation(n_components=10)
topics_nt = lda_nt.fit(X_nt)

print(lda_nt.components_)

[[0.1        0.1        1.1000008  ... 0.1        0.1        0.10000726]
 [0.1        0.1        0.1        ... 1.3130838  4.09999417 0.1       ]
 [1.09967644 0.10002437 0.1        ... 0.10000947 0.1        0.10001047]
 ...
 [0.1        0.1        0.1        ... 2.57112763 1.09998915 5.09995854]
 [0.10030724 0.1        1.0999992  ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        3.10002106 0.1       ]]


In [79]:
feature_names_nt = vectorizer.get_feature_names_out()
print_top_words(lda_nt, feature_names_nt, n_top_words=10)

Topic #0:
business wealth women region president family planning magazine learn elevating
Topic #1:
tax planning trust new northern learn business wealth event proud
Topic #2:
katie nixon inflation investors cio management wealth fed policy outlook
Topic #3:
global trust northern family expochicago dont jim inflation mcdonald paul
Topic #4:
nixon katie cio management wealth market art questions energy trust
Topic #5:
private bank northerntrust best art northern policy community digital expochicago
Topic #6:
northern trust chief officer expochicago fiduciary wealth announce management help
Topic #7:
wealth tax financialeducation learn retirement family charitable financial resources support
Topic #8:
financial plan wealth northern future trust learn american families explore
Topic #9:
wealth business northern learn art strategies hosted common trust explore


In [80]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_nt.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_nt[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_nt = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_nt = df_top_words_nt.T 

df_nt_prefixed = df_top_words_transposed_nt

## UBA

In [81]:
tweets_uba = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/UBAGroup_since_2018_01_01.csv')

texts_uba = tweets_uba['Tweet'].tolist()

clean_texts_uba = [clean_text(text) for text in texts_uba]

# ContextVectorize
X_uba = vectorizer.fit_transform(clean_texts_uba)

# Apply LDA
lda_uba = LatentDirichletAllocation(n_components=10)
topics = lda_uba.fit_transform(X_uba)

print(lda_uba.components_)

[[1.10003267 0.10025843 0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.10009902 0.10005755]
 [0.1        0.1        0.1        ... 0.10000779 0.10002096 0.1001622 ]
 ...
 [0.1        0.1        0.1        ... 0.10000518 0.10008848 0.10001905]
 [1.09981844 0.1        2.09992242 ... 0.10000133 0.1        2.0997612 ]
 [0.1        0.1        0.1        ... 0.10000178 0.1        0.1       ]]


In [82]:
feature_names_uba = vectorizer.get_feature_names_out()
print_top_words(lda_uba, feature_names_uba, n_top_words=10)

Topic #0:
thank uba hello bank contacting africas global send link like
Topic #1:
africa africasglobalbank uba african tonyoelumelu bank group read ubaafricanentrepreneurs business
Topic #2:
africasglobalbank ubamarketplace love happy day today friday ubaat leorewards uba
Topic #3:
number account enable mobile kindly assist thank provide email address
Topic #4:
uba africasglobalbank ubaafricaday africa africaday ubaafricaconversations join group click tonyoelumelu
Topic #5:
informed hello response thank dm kindly provided bitlymlzbnp emanate uba
Topic #6:
africasglobalbank winners win uba draw time way nan ubacares ubaat
Topic #7:
africasglobalbank new ubaat make best ubaceoawards airtime win happy today
Topic #8:
thank hello kindly dm bitlymlzbnp link sharing avoid public click
Topic #9:
account hello leo open chat uba dial visit thank ubacares


In [83]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_uba.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_uba[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_uba = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_uba = df_top_words_uba.T 

df_uba_prefixed = df_top_words_transposed_uba

# Asia

## HSBC

In [84]:
tweets_hsbc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/HSBC_since_2018_01_01.csv')

texts_hsbc = tweets_hsbc['Tweet'].tolist()

clean_texts_hsbc = [clean_text(text) for text in texts_hsbc]

# ContextVectorize
X_hsbc = vectorizer.fit_transform(clean_texts_hsbc)

# Apply LDA
lda_hsbc = LatentDirichletAllocation(n_components=10)
topics = lda_hsbc.fit(X_hsbc)

print(lda_hsbc.components_)

[[0.1        2.09997873 0.1        ... 0.1        3.09997939 0.1       ]
 [0.1        0.10000269 0.1        ... 1.09989396 0.10001897 0.1       ]
 [0.1        0.1        2.10002437 ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        1.13442599 ... 0.10012791 0.1        1.09999601]
 [0.1        0.1        0.1        ... 1.09997813 0.1        0.1       ]
 [0.1        0.10001858 1.09997081 ... 0.1        0.1        1.10000399]]


In [85]:
feature_names_hsbc = vectorizer.get_feature_names_out()
print_top_words(lda_hsbc, feature_names_hsbc, n_top_words=10)

Topic #0:
hsbc results read climate financial sustainable today transition hsbcresults change
Topic #1:
hsbc colleagues proud mental health support inclusive future role business
Topic #2:
hsbc help asset digital support covid management way banking bond
Topic #3:
hsbc john china ceo flint business ciie hsbcnavigator group international
Topic #4:
businesses hsbcnavigator hsbc global report asia read new sustainable research
Topic #5:
hsbc ceo noel quinn global support customers covid future world
Topic #6:
supply hsbc global chain chains asian new hsbcs chinese trade
Topic #7:
bank global hsbc trade finance banking technology customers year best
Topic #8:
businesses business solutions climate partnership future help global world new
Topic #9:
social cities netzero sustainability transition businesses global read green environmental


In [86]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_hsbc.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_hsbc[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_hsbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_hsbc = df_top_words_hsbc.T 

df_hsbc_prefixed = df_top_words_transposed_hsbc

## OCBC (Singapore)

In [87]:
tweets_ocbc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/OCBCBank_since_2018_01_01.csv')

texts_ocbc = tweets_ocbc['Tweet'].tolist()

clean_texts_ocbc = [clean_text(text) for text in texts_ocbc]

# ContextVectorize
X_ocbc = vectorizer.fit_transform(clean_texts_ocbc)

# Apply LDA
lda_ocbc = LatentDirichletAllocation(n_components=10)
topics_ocbc = lda_ocbc.fit(X_ocbc)

print(lda_ocbc.components_)

[[0.10000923 0.1        0.1        ... 0.1        0.1        0.10000349]
 [4.99671433 0.1        2.09996787 ... 0.1        0.10011599 0.10001212]
 [0.1        0.10000119 1.0999994  ... 0.10046125 0.10007097 0.1       ]
 ...
 [0.1        0.1        0.1        ... 0.10001367 0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.10000483 0.10000709 0.1        ... 0.10008208 0.1        0.1       ]]


In [88]:
feature_names_ocbc = vectorizer.get_feature_names_out()
print_top_words(lda_ocbc, feature_names_ocbc, n_top_words=10)

Topic #0:
sh youre tweet hey came welcome hi transactions onetoken transaction
Topic #1:
thank dm hi xf contact share sorry details hear number
Topic #2:
account hi card sh xf access banking code pin clarify
Topic #3:
service banking dg hi executive able internet update online form
Topic #4:
sh app thanks mobile banking version know good phone morning
Topic #5:
ocbc hi singapore dg touch looking malaysia xf reached relevant
Topic #6:
hi try issue dg inconvenience thank app working caused apologise
Topic #7:
email hi secured send sh thank banking sent mail reference
Topic #8:
hi dm dg drop number able sh bank mobile assist
Topic #9:
sh account transfer ocbc hi cheque deposit pm cash overseas


In [89]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_ocbc.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_ocbc[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_ocbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_ocbc = df_top_words_ocbc.T 

df_ocbc_prefixed = df_top_words_transposed_ocbc

## Bank of Singapore

In [90]:
tweets_bos = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/bankofSG_since_2018_01_01.csv')

texts_bos = tweets_bos['Tweet'].tolist()

clean_texts_bos = [clean_text(text) for text in texts_bos]

# ContextVectorize
X_bos = vectorizer.fit_transform(clean_texts_bos)

# Apply LDA
lda_bos = LatentDirichletAllocation(n_components=10)
topics_bos = lda_bos.fit(X_bos)

print(lda_bos.components_)

[[1.10000203 0.1        0.1        ... 1.10000253 0.10000824 0.1       ]
 [0.1        0.1        0.1        ... 1.09999747 1.55721679 0.1       ]
 [0.10005602 0.1000711  0.1        ... 0.1        0.10002339 0.1000418 ]
 ...
 [0.1        1.09996478 1.09999983 ... 0.1        0.1000431  0.10002427]
 [1.09999918 0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        3.73516201 0.1       ]]


In [91]:
feature_names_bos = vectorizer.get_feature_names_out()
print_top_words(lda_bos, feature_names_bos, n_top_words=10)

Topic #0:
research global singapore management head chia jean outlook portfolio office
Topic #1:
market investment watch chief months markets global officer insights rajeev
Topic #2:
economist chief says jerram mansoor richard bank mohiuddin growth economy
Topic #3:
global social need sustainable market development investors says years covid
Topic #4:
investment lee head eli strategy says markets policy strategist ahead
Topic #5:
moh siong sim strategist currency says usd risk likely fx
Topic #6:
global head marc walle products van family new shares investors
Topic #7:
fed economist inflation chief mansoor mohiuddin rate federal rates expected
Topic #8:
says bank chief uk mohiuddin mansoor economist global business ceo
Topic #9:
private china bank market greater best wealth global management head


In [92]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_bos.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_bos[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_bos = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bos = df_top_words_bos.T 

df_bos_prefixed = df_top_words_transposed_bos

## Hana Bank (South Korea)

In [93]:
tweets_hana = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/HanaBank4expats_since_2018_01_01.csv')

texts_hana = tweets_hana['Tweet'].tolist()

clean_texts_hana = [clean_text(text) for text in texts_hana]

# ContextVectorize
X_hana = vectorizer.fit_transform(clean_texts_hana)

# Apply LDA
lda_hana = LatentDirichletAllocation(n_components=10)
topics = lda_hana.fit(X_hana)

print(lda_hana.components_)

[[ 0.1         0.1         0.1         0.1        11.21419964  0.1
   0.10000851  0.1         0.1         0.1         0.1         0.1
   0.1         0.1         1.45714247  0.1         0.1         1.09999113
   0.1        10.09999591  0.1         0.1         0.1        15.25902512
   0.1         0.1         0.1         0.1         6.72208089  0.10002792
  12.09999817  1.09995343  0.1         2.53811121  0.1         0.1
   5.09998636  0.1         0.10000072  0.1         0.1         0.10000822
   0.1         0.1000095   0.1         0.1         0.10000077  2.09993984
   0.1         0.1        12.09996105  0.1         3.09998988 20.80710785
   0.1         0.1         0.1         0.1         0.1        10.0999853
   0.1         0.1         0.10005934  0.1         0.1         0.1
   6.64064604  0.1         0.1         0.1         0.1         0.1
   0.1        16.09997569 22.41458938  1.09995951  0.10001176  0.1
   0.1         0.1         0.1         2.09999213  0.1         0.1
   0.1        

In [94]:
feature_names_hana = vectorizer.get_feature_names_out()
print_top_words(lda_hana, feature_names_hana, n_top_words=10)

Topic #0:
rate krwusd range exchange week forecasts korean bank dollar wonus
Topic #1:
like app koreas annual hana st countries best dont win
Topic #2:
krwusd rate weeks forecast outlook report exchange heres complete check
Topic #3:
bank card hana keb overseas foreign exchange tell youll doesnt
Topic #4:
seoul easy visit koreas arent check coast lines tips korea
Topic #5:
korea new sunday july banking tips hana locations change coast
Topic #6:
money travel abroad home ez app hana new foreigners fast
Topic #7:
hope pyeongchang good hana banks tuesday day cool fly month
Topic #8:
weekly report outlook krwusd check expats koreas parent group english
Topic #9:
korea forecast weekly rate hard cheaper rates hapsmagazine fx picture


In [95]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_hana.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_hana[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_hana = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_hana = df_top_words_hana.T 

df_hana_prefixed = df_top_words_transposed_hana

## UFJ (Japan)

## Mizuho Bank

# Oceania

## The Australia and New Zealand Banking Group Limited  (ANZ)

In [96]:
tweets_anz = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ANZ_AU_since_2018_01_01.csv')

texts_anz = tweets_anz['Tweet'].tolist()

clean_texts_anz = [clean_text(text) for text in texts_anz]

# ContextVectorize
X_anz = vectorizer.fit_transform(clean_texts_anz)

# Apply LDA
lda_anz = LatentDirichletAllocation(n_components=10)
topics_anz = lda_anz.fit(X_anz)

print(lda_anz.components_)

[[1.00067728e-01 4.33370129e+00 1.00000001e-01 ... 1.00020650e-01
  1.00028194e-01 1.00069214e-01]
 [1.00000001e-01 7.53892798e+01 1.00000001e-01 ... 5.64640353e+00
  1.00000000e-01 1.00001775e-01]
 [1.00000001e-01 2.73116440e+01 1.22052456e+00 ... 1.00013281e-01
  9.09993915e+00 1.00041036e-01]
 ...
 [1.00053985e-01 4.97790851e+01 1.00002910e-01 ... 1.00000001e-01
  1.00015751e-01 1.00017712e-01]
 [1.00000001e-01 4.16668725e+01 1.00000001e-01 ... 4.04158777e+00
  1.00000000e-01 1.00000000e-01]
 [1.00000001e-01 1.46629923e+02 1.00000001e-01 ... 2.53832999e+00
  1.00004102e-01 1.00000000e-01]]


In [97]:
feature_names_anz = vectorizer.get_feature_names_out()
print_top_words(lda_anz, feature_names_anz, n_top_words=10)

Topic #0:
card credit anz hi account need access select check youre
Topic #1:
number contact dm post hi like code send postcode phone
Topic #2:
sorry hi team banking kindly internet anz thanks darren ampm
Topic #3:
hi account payments funds branch business sorry payment nishant atm
Topic #4:
thanks email hi hoax delete link sms hoaxcybersecurityanzcom click message
Topic #5:
hi dm sorry send help thanks hear message details look
Topic #6:
team thanks hi feedback pm aest pass customer contact ampm
Topic #7:
anz hi app pay new branch link banking available use
Topic #8:
issue hi app issues try sorry anz working inconvenience banking
Topic #9:
hi inconvenience banking internet caused app anz know let apologise


In [98]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_anz.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_anz[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_anz = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_anz = df_top_words_anz.T 

df_anz_prefixed = df_top_words_transposed_anz

## Commonwealth Bank of Australia (CBA)

In [99]:
tweets_cba = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/CommBank_since_2018_01_01.csv')

texts_cba = tweets_cba['Tweet'].tolist()

clean_texts_cba = [clean_text(text) for text in texts_cba]

# ContextVectorize
X_cba = vectorizer.fit_transform(clean_texts_cba)

# Apply LDA
lda_cba = LatentDirichletAllocation(n_components=10)
topics_cba = lda_cba.fit_transform(X_cba)

print(lda_cba.components_)

[[0.10000215 0.1        0.1        ... 0.1        2.95719437 1.41099343]
 [1.09997963 0.1        3.63515521 ... 0.1        0.1        1.55548059]
 [0.10000143 0.1        1.56482571 ... 0.1        0.1        0.1       ]
 ...
 [1.09998801 0.1        0.10000062 ... 0.1        0.10001849 0.1       ]
 [0.10000783 0.1        0.10000792 ... 1.10000096 1.31117473 0.10002546]
 [0.10000712 2.09999191 0.10000096 ... 0.1        0.10001478 0.1       ]]


In [100]:
feature_names_cba = vectorizer.get_feature_names_out()
print_top_words(lda_cba, feature_names_cba, n_top_words=10)

Topic #0:
know thanks information report scams hoaxcbacomau let future security rina
Topic #1:
commbank app banking personal yello delete eligibility information click thanks
Topic #2:
check remember online look scams calls stop reject scam messages
Topic #3:
dm details hi like reach chris sorry thank help assistance
Topic #4:
customers hi message help scams branches genuine need funds atms
Topic #5:
send like hi message contact private number understand hear feedback
Topic #6:
dm send hi christine siji best sorry chat hello visit
Topic #7:
team need commbank hi account payid scam link thank transfers
Topic #8:
hi information help thanks team assist send enquiry understand dm
Topic #9:
card hi message details commbank anna thank assist credit dm


In [101]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_cba.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_cba[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_cba = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_cba = df_top_words_cba.T 

df_cba_prefixed = df_top_words_transposed_cba

## National Australia Bank (NAB)

In [102]:
tweets_nab = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/NAB_since_2018_01_01.csv')

texts_nab = tweets_nab['Tweet'].tolist()

clean_texts_nab = [clean_text(text) for text in texts_nab]

# ContextVectorize
X_nab = vectorizer.fit_transform(clean_texts_nab)

# Apply LDA
lda_nab = LatentDirichletAllocation(n_components=10)
topics_nab = lda_nab.fit_transform(X_nab)

print(lda_nab.components_)

[[0.1        4.93462299 0.1        ... 2.09998513 0.10000742 0.10016927]
 [0.10000152 0.10002018 0.1        ... 0.100015   0.1        0.1       ]
 [2.09992841 0.10001398 2.09998719 ... 0.1        0.10000107 0.1       ]
 ...
 [0.10000363 0.10003891 0.10000909 ... 0.1        0.1        0.1       ]
 [1.12515841 5.54515423 0.1        ... 0.1        0.1        0.1       ]
 [0.10000613 0.10001053 0.1        ... 0.10000814 3.09999151 0.1       ]]


In [103]:
feature_names_nab = vectorizer.get_feature_names_out()
print_top_words(lda_nab, feature_names_nab, n_top_words=10)

Topic #0:
dm send hi chat help like rg im id keen
Topic #1:
feedback team thanks understand appreciate like hi lodge thats wait
Topic #2:
banking app nab internet hi payments mobile issues working sorry
Topic #3:
need glad help rl hear youre worries pm team branch
Topic #4:
message delete hi thanks nab messages team security aware suspicious
Topic #5:
sorry hi im hear dm send able inconvenience working thanks
Topic #6:
thanks know hi team letting link ive ill rl click
Topic #7:
nab card hi business customers banking account new home loan
Topic #8:
know try pay hi let apple looking tc latest youre
Topic #9:
lh hey hear sorry im card rg ng hi account


In [104]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_nab.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_nab[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_nab = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_nab = df_top_words_nab.T 

df_nab_prefixed = df_top_words_transposed_nab

## Westpac Banking Corporation (WBC)

In [105]:
tweets_wbc = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/Westpac_since_2018_01_01.csv')

texts_wbc = tweets_wbc['Tweet'].tolist()

clean_texts_wbc = [clean_text(text) for text in texts_wbc]

# ContextVectorize
X_wbc = vectorizer.fit_transform(clean_texts_wbc)

# Apply LDA
lda_wbc = LatentDirichletAllocation(n_components=10)
topics_wbc = lda_wbc.fit_transform(X_wbc)

print(lda_wbc.components_)

[[ 5.05368795  0.1         4.0286193  ...  0.99267232  0.1
   0.1       ]
 [ 0.1         0.10000157 16.85344738 ...  1.20725812  0.10001896
   0.10002431]
 [ 0.1         0.1000011   0.10001475 ...  0.1         0.10007906
   0.1       ]
 ...
 [ 0.1         0.10002816  0.10000561 ...  0.1         3.07233027
   0.1       ]
 [ 0.1         0.1         0.10002982 ...  0.1         0.1
   0.1       ]
 [ 0.14618015  0.10000926  2.60879172 ...  0.10006955  0.10005166
   0.1       ]]


In [106]:
feature_names_wbc = vectorizer.get_feature_names_out()
print_top_words(lda_wbc, feature_names_wbc, n_top_words=10)

Topic #0:
dm send like concerns feedback ensure look hi thanks support
Topic #1:
team thanks hi branch know days help contact card pm
Topic #2:
dm send whats like information understanding better help best hi
Topic #3:
westpac hi help customers hope helps australia loan thanks home
Topic #4:
hi future open apple customers pay remain offering banking thanks
Topic #5:
message direct help information send hi complaints private right including
Topic #6:
hi know banking responded online issues inconvenience dm weve caused
Topic #7:
westpac pay help bank use australians apple payments customer cards
Topic #8:
sorry im hear hi help dm youve send way left
Topic #9:
hear dm hi send thanks sorry help im experience like


In [107]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_wbc.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_wbc[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_wbc = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_wbc = df_top_words_wbc.T 

df_wbc_prefixed = df_top_words_transposed_wbc

# Europe

## BNP Paribas

In [109]:
tweets_bnp = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BNPParibas_since_2018_01_01.csv')

texts_bnp = tweets_bnp['Tweet'].tolist()

clean_texts_bnp = [clean_text(text) for text in texts_bnp]

# ContextVectorize
X_bnp = vectorizer.fit_transform(clean_texts_bnp)

# Apply LDA
lda_bnp = LatentDirichletAllocation(n_components=10)
topics_bnp = lda_bnp.fit(X_bnp)

print(lda_bnp.components_)

[[ 0.1         4.36495021  0.1        ...  0.1         0.10000125
   0.1       ]
 [ 0.1        30.53112384  0.1        ...  0.1         0.1
   0.1       ]
 [ 0.1         0.10000065  0.1        ...  0.1         0.1
   0.10000606]
 ...
 [ 0.1         0.100008    0.1        ...  0.1         0.10000034
   0.10000306]
 [ 0.1         1.80231969  0.1        ...  0.1         0.1
   0.1       ]
 [ 0.10010057  0.10000431  4.09999202 ...  2.1        11.09997778
   6.09370476]]


In [110]:
feature_names_bnp = vectorizer.get_feature_names_out()
print_top_words(lda_bnp, feature_names_bnp, n_top_words=10)

Topic #0:
positivebanking ceo bonnafé oyw group bnpparibas bank bnp paribas bnppcoalitions
Topic #1:
vivatech new mobility clients sustainable solutions bnppadvance data business support
Topic #2:
women vivatech wfgm tech positivebanking diversity meet womens_forum gender entrepreneurs
Topic #3:
vivatech startups positivebanking discover lab fondationbnpp innovation program bnppadvance digital
Topic #4:
paribas bnp vivatech support tennis young bnppcsr people impact discover
Topic #5:
clients wish day contact good answer dear account attended order
Topic #6:
jblefevre jbonnel ym bivwak xbond nicochan pierrecappelli fgraillot sebbourguignon atoucinho
Topic #7:
vous nous bonjour contacter dm afin bonne conseiller que par
Topic #8:
head positivebanking pff antoinesire vivatech ceo company engagement talk business
Topic #9:
la pour le et les des en nous du sur


In [111]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_bnp.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_bnp[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_bnp = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bnp = df_top_words_bnp.T 

df_bnp_prefixed = df_top_words_transposed_bnp

## BNP Asset Mgt

In [112]:
tweets_bnp1 = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/BNPPAM_COM_since_2018_01_01.csv')

texts_bnp1 = tweets_bnp1['Tweet'].tolist()

clean_texts_bnp1 = [clean_text(text) for text in texts_bnp1]

# ContextVectorize
X_bnp1 = vectorizer.fit_transform(clean_texts_bnp1)

# Apply LDA
lda_bnp1 = LatentDirichletAllocation(n_components=10)
topics_bnp1 = lda_bnp1.fit(X_bnp1)

print(lda_bnp1.components_)

[[2.09996406 0.1        4.09996341 ... 2.09996159 7.10000161 0.1       ]
 [0.1        0.1        0.1        ... 0.1000202  0.10000039 0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.10000294 1.10000221 ... 6.09960653 1.09994041 0.1       ]
 [0.1        0.1        0.1000108  ... 0.1        0.1000064  0.10000842]
 [0.10000841 0.1        1.09995137 ... 0.10002204 0.1000074  0.1       ]]


In [113]:
feature_names_bnp1 = vectorizer.get_feature_names_out()
print_top_words(lda_bnp1, feature_names_bnp1, n_top_words=10)

Topic #0:
bnppam_com china investment senior equities tells head greaterchina chinas economist
Topic #1:
bnppam_com head plsainvest stand jambachtsheer real global sustainability announce appointment
Topic #2:
video solutions latest learn investors multiasset covid sustainability strategy investing
Topic #3:
market asset read article paribas bnp inflation podcast markets latest
Topic #4:
bnppam_com head investment debt esg apac investors tells stewardship wilsonotto
Topic #5:
bnppam_com paul head sandhu apac client maqs investors advisory tells
Topic #6:
awards bnppam_com year best fund asset esg manager sustainability equity
Topic #7:
climate sri change global investment sustainable discover people research sustainability
Topic #8:
learn thegreatinstability economic inflation investors chinas chinese rates usd markets
Topic #9:
read markets article investing latest equity equities sustainable growth future


In [114]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_bnp1.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_bnp1[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_bnp1 = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bnp1 = df_top_words_bnp1.T 

df_bnp1_prefixed = df_top_words_transposed_bnp1

## Crédit Agricole Group

In [115]:
tweets_cag = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/Credit_Agricole_since_2018_01_01.csv')

texts_cag = tweets_cag['Tweet'].tolist()

clean_texts_cag = [clean_text(text) for text in texts_cag]

# ContextVectorize
X_cag = vectorizer.fit_transform(clean_texts_cag)

# Apply LDA
lda_cag = LatentDirichletAllocation(n_components=10)
topics_cag = lda_cag.fit(X_cag)

print(lda_cag.components_)

[[0.1        0.1        0.10005227 ... 0.1        0.10013138 0.10001695]
 [0.1        0.1        0.1        ... 0.1        2.81311784 0.1       ]
 [0.1        0.1        2.09994773 ... 0.1        1.10001044 0.1       ]
 ...
 [0.10001833 0.1        0.1        ... 1.10001179 0.1        0.1       ]
 [0.1        0.1        0.1        ... 0.1        0.10002414 0.10001977]
 [0.1        0.1        0.1        ... 0.1        0.1        2.09996328]]


In [116]:
feature_names_cag = vectorizer.get_feature_names_out()
print_top_words(lda_cag, feature_names_cag, n_top_words=10)

Topic #0:
vivatech ca_cib cavivatech le nous sur stand notre aux avec
Topic #1:
group ukraine agricole banking credit teams million customer met fund
Topic #2:
russia companies financing russian hi war institutions soon ukraine begun
Topic #3:
pour vous la en nous et plus les des le
Topic #4:
le la les et des en pour dans du sur
Topic #5:
vous nous la le en que une pff et je
Topic #6:
du le crédit agricole et sa en directeur la général
Topic #7:
et du nos aca pour résultats en le est les
Topic #8:
la et les des le en du au pour avec
Topic #9:
la vous sur les dans pour des bonne le creditagricole


In [117]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_cag.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_cag[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_cag = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_cag = df_top_words_cag.T 

df_cag_prefixed = df_top_words_transposed_cag

## Barclays PLC

In [118]:
tweets_barclays = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/Barclays_since_2018_01_01.csv')

texts_barclays = tweets_barclays['Tweet'].tolist()

clean_texts_barclays = [clean_text(text) for text in texts_barclays]

# ContextVectorize
X_barclays = vectorizer.fit_transform(clean_texts_barclays)

# Apply LDA
lda_barclays = LatentDirichletAllocation(n_components=10)
topics_barclays = lda_barclays.fit(X_barclays)

print(lda_barclays.components_)

[[ 1.94436966  0.1         0.10000569 ... 11.51836224  0.1
   0.86748877]
 [ 1.20153368  0.1         0.10000127 ...  0.10000182  6.09998554
   0.1       ]
 [ 1.24283358  1.10001608  9.09994601 ...  0.10001354  0.1
   0.10000943]
 ...
 [39.53882524  0.1         0.10000125 ... 38.12746009  0.1
   0.10002848]
 [ 6.48425319  0.1         0.1        ...  9.35933992  0.10000131
   3.0264945 ]
 [ 2.62484127  0.1         0.1        ...  0.10008565  0.1
   1.173503  ]]


In [120]:
feature_names_barclays = vectorizer.get_feature_names_out()
print_top_words(lda_barclays, feature_names_barclays, n_top_words=10)

Topic #0:
sorry try im whats today happened need help thank app
Topic #1:
colleagues customers happy barclays clients world uk bank new wish
Topic #2:
check app ensure need absolutely account pay youre working update
Topic #3:
complaint like log youd link complaints fully sure include make
Topic #4:
thanks hi available hope youve hey theyre youre help askbarclaysus
Topic #5:
barclays group read new venkatakrishnan cs proud results years colleagues
Topic #6:
banking post local office app community online weve use thank
Topic #7:
youre sorry im know thanks dm let hi help really
Topic #8:
dm number barclaysukhelp contact postcode pop send help chat support
Topic #9:
barclays rewards blue message community wimbledon today offer support day


In [123]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_barclays.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_barclays[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_barclays = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_barclays = df_top_words_barclays.T 

df_barclays_prefixed = df_top_words_transposed_barclays

## Banco Santander SA (BSSA)

In [124]:
tweets_bssa = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/bancosantander_since_2018_01_01.csv')

texts_bssa = tweets_bssa['Tweet'].tolist()

clean_texts_bssa = [clean_text(text) for text in texts_bssa]

# ContextVectorize
X_bssa = vectorizer.fit_transform(clean_texts_bssa)

# Apply LDA
lda_bssa = LatentDirichletAllocation(n_components=10)
topics_bssa = lda_bssa.fit(X_bssa)

print(lda_bssa.components_)

[[0.10001716 0.1        0.10000877 ... 0.1        0.1        0.1       ]
 [0.1        2.09991637 1.1000022  ... 0.1        0.1        0.1       ]
 [2.09999297 0.1        1.09998903 ... 0.1        0.1        0.1       ]
 ...
 [0.1        0.1        0.1        ... 0.1        0.1        0.1       ]
 [0.1        0.10001883 0.1        ... 0.1        0.1        0.1       ]
 [1.09996789 0.1        0.1        ... 0.1        0.1        0.1       ]]


In [125]:
feature_names_bssa = vectorizer.get_feature_names_out()
print_top_words(lda_bssa, feature_names_bssa, n_top_words=10)

Topic #0:
la en el que para los más las por es
Topic #1:
santander start challenge day work make future ready blockchain dont
Topic #2:
en la del los el para santander resultados las accionistas
Topic #3:
santander millones en euros results beneficio el más bancosantander al
Topic #4:
en para una la más que santander tu los el
Topic #5:
help digital new financial know bank work want santander people
Topic #6:
por gracias hola favor en banco que te nuestros compañeros
Topic #7:
santander banking global know digital bank today anabotin best dont
Topic #8:
million profit customers really santander attributable share money santanders pay
Topic #9:
san madrid acción cierre bancosantander public presence dirigiendo contributed outlook


In [126]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_bssa.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_bssa[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_bssa = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bssa = df_top_words_bssa.T 

df_bssa_prefixed = df_top_words_transposed_bssa

## Group BPCE

In [127]:
tweets_bpce = pd.read_csv('/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/more_banks/GroupeBPCE_since_2018_01_01.csv')

texts_bpce = tweets_bpce['Tweet'].tolist()

clean_texts_bpce = [clean_text(text) for text in texts_bpce]

# ContextVectorize
X_bpce = vectorizer.fit_transform(clean_texts_bpce)

# Apply LDA
lda_bpce = LatentDirichletAllocation(n_components=10)
topics_bpce = lda_bpce.fit(X_bpce)

print(lda_bpce.components_)

[[2.02583507 2.02583826 0.1        ... 2.10002695 0.1        0.1       ]
 [0.1        0.1        0.1        ... 3.22075618 0.1        0.1       ]
 [0.10000404 0.10001002 0.1        ... 1.91668401 0.1        0.1       ]
 ...
 [0.1        0.1        2.09980337 ... 0.10000534 2.0999528  0.10001037]
 [0.1        0.1        0.10001947 ... 4.77325404 0.1        0.1       ]
 [1.17416089 1.17413987 0.1        ... 3.76341116 0.1        0.1       ]]


In [128]:
feature_names_bpce = vectorizer.get_feature_names_out()
print_top_words(lda_bpce, feature_names_bpce, n_top_words=10)

Topic #0:
la en et le les par sur des une au
Topic #1:
des le la les par et en sur pour avec
Topic #2:
vous bien bonjour cordialement je dm le si votre souhaitez
Topic #3:
du la les laurent mignon des président directoire bpce groupe
Topic #4:
la et les du des pour le leur en bpce
Topic #5:
du bpce et groupe des les résultats la le sur
Topic #6:
le pour et en la une les voilebanquepop par des
Topic #7:
du le et groupe les bpce des en pour la
Topic #8:
et du les paris la groupe des le bpce pour
Topic #9:
la le et des du groupe les en bpce une


In [129]:
# Get top words per topic
top_words_per_topic = []
for topic_idx, topic in enumerate(lda_bpce.components_):
    top_features_ind = topic.argsort()[:-11:-1]
    top_features = [feature_names_bpce[i] for i in top_features_ind]
    top_words_per_topic.append(top_features)

# Create a DataFrame with the top words for each topic
df_top_words_bpce = pd.DataFrame(top_words_per_topic)
df_top_words_transposed_bpce = df_top_words_bpce.T 

df_bpce_prefixed = df_top_words_transposed_bpce

In [136]:
df_final = pd.DataFrame()
#Canada
df_canada = pd.DataFrame([['Canada']], columns = [0])
#Store RBC to excel file
df_rbc_with_name = pd.DataFrame([['RBC']], columns=[0])  # Only a cell includes "RBC"
df_rbc_with_topics = pd.concat([df_canada, df_rbc_with_name, df_rbc_prefixed], ignore_index=True)


df_final = pd.concat([df_final, df_rbc_with_topics], ignore_index=True)
#Store CIBC to excel file
df_cibc_with_name = pd.DataFrame([['CIBC']], columns=[0]) 
df_cibc_with_topics = pd.concat([df_cibc_with_name, df_cibc_prefixed], ignore_index=True)
df_final = pd.concat([df_final, df_cibc_with_topics], ignore_index=True)

#Store Scotiabank to excel file
df_scotia_with_name = pd.DataFrame([['ScotiaBank']], columns =[0])
df_scotia_with_topics = pd.concat([df_scotia_with_name,df_scotia_prefixed], ignore_index= True)
df_final = pd.concat([df_final, df_scotia_with_topics], ignore_index=True)

#Store TD Canada News to excel file
df_tdcanada_with_name = pd.DataFrame([['TD_Canada_News']], columns = [0])
df_tdcanada_with_topics = pd.concat([df_tdcanada_with_name, df_tdcanada_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_tdcanada_with_topics], ignore_index=True)

#Store TD Canada Official Bank to excel file
df_tdcanada1_with_name = pd.DataFrame([['TD_Canada']], columns = [0])
df_tdcanada1_with_topics = pd.concat([df_tdcanada1_with_name, df_tdcanada1_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_tdcanada1_with_topics], ignore_index=True)

#US Area
df_us = pd.DataFrame([['US']], columns = [0])
#Store TD US News to excel file
df_tdus_with_name = pd.DataFrame([['TD_US_News']], columns = [0])
df_tdus_with_topics = pd.concat([df_us, df_tdus_with_name, df_tdus_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_tdus_with_topics], ignore_index=True)

#Store TD US Official Account to excel file
df_tdus1_with_name = pd.DataFrame([['TD_US']], columns = [0])
df_tdus1_with_topics = pd.concat([df_tdus1_with_name, df_tdus1_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_tdus1_with_topics], ignore_index=True)

#Store Morgan Stanley to excel file
df_ms_with_name = pd.DataFrame([['Morgan Stanley']], columns = [0])
df_ms_with_topics = pd.concat([df_ms_with_name, df_ms_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_ms_with_topics], ignore_index=True)

#Store UBS
df_ubs_with_name = pd.DataFrame([['UBS']], columns = [0])
df_ubs_with_topics = pd.concat([df_ubs_with_name, df_ubs_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_ubs_with_topics], ignore_index=True)
#Citi
df_citi_with_name = pd.DataFrame([['Citi']], columns = [0])
df_citi_with_topics = pd.concat([df_citi_with_name, df_citi_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_citi_with_topics], ignore_index=True)
#Wells Fargo
df_wf_with_name = pd.DataFrame([['Wells Fargo']], columns = [0])
df_wf_with_topics = pd.concat([df_wf_with_name, df_wf_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_wf_with_topics], ignore_index=True)
#BOA (Bank of America)
df_boa_with_name = pd.DataFrame([['Bank of America']], columns = [0])
df_boa_with_topics = pd.concat([df_boa_with_name, df_boa_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_boa_with_topics], ignore_index=True)
#JP Morgan
df_jp_with_name = pd.DataFrame([['JP Morgan']], columns = [0])
df_jp_with_topics = pd.concat([df_jp_with_name, df_jp_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_jp_with_topics], ignore_index=True)
#Raymond James
df_rj_with_name = pd.DataFrame([['Raymond James']], columns = [0])
df_rj_with_topics = pd.concat([df_rj_with_name, df_rj_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_rj_with_topics], ignore_index=True)

#Africa
df_africa = pd.DataFrame([['Africa']], columns = [0])
#Quant Africa
df_qa_with_name = pd.DataFrame([['Quant Africa']], columns = [0])
df_qa_with_topics = pd.concat([df_africa, df_qa_with_name, df_qa_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_qa_with_topics], ignore_index=True)
#Standard Bank
df_sb_with_name = pd.DataFrame([['Standard Bank']], columns = [0])
df_sb_with_topics = pd.concat([df_sb_with_name, df_sb_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_sb_with_topics], ignore_index=True)
#Northern Trust
df_nt_with_name = pd.DataFrame([['Standard Bank']], columns = [0])
df_nt_with_topics = pd.concat([df_nt_with_name, df_nt_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_nt_with_topics], ignore_index=True)
#UBA
df_uba_with_name = pd.DataFrame([['UBA']], columns = [0])
df_uba_with_topics = pd.concat([df_uba_with_name, df_uba_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_uba_with_topics], ignore_index=True)

#Asia
df_asia = pd.DataFrame([['Asia']], columns = [0])
#HSBC
df_hsbc_with_name = pd.DataFrame([['HSBC']], columns = [0])
df_hsbc_with_topics = pd.concat([df_asia, df_hsbc_with_name, df_hsbc_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_hsbc_with_topics], ignore_index=True)
#OCBC Premier banking
df_ocbc_with_name = pd.DataFrame([['OCBC Premier Banking']], columns = [0])
df_ocbc_with_topics = pd.concat([df_ocbc_with_name, df_ocbc_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_ocbc_with_topics], ignore_index=True)
#Bank of Singapore
df_bos_with_name = pd.DataFrame([['Bank of Singapore']], columns = [0])
df_bos_with_topics = pd.concat([df_bos_with_name, df_bos_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_bos_with_topics], ignore_index=True)
#Hana Bank
df_hana_with_name = pd.DataFrame([['Hana Bank']], columns = [0])
df_hana_with_topics = pd.concat([df_hana_with_name, df_hana_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_hana_with_topics], ignore_index=True)

#Oceania
df_oceania = pd.DataFrame([['Oceania']], columns = [0])
#ANZ
df_anz_with_name = pd.DataFrame([['ANZ']], columns = [0])
df_anz_with_topics = pd.concat([df_oceania, df_anz_with_name, df_anz_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_anz_with_topics], ignore_index=True)
#CBA
df_cba_with_name = pd.DataFrame([['CBA']], columns = [0])
df_cba_with_topics = pd.concat([df_cba_with_name, df_cba_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_cba_with_topics], ignore_index=True)
#NAB
df_nab_with_name = pd.DataFrame([['NAB']], columns = [0])
df_nab_with_topics = pd.concat([df_nab_with_name, df_nab_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_nab_with_topics], ignore_index=True)
#WBC
df_wbc_with_name = pd.DataFrame([['WBC']], columns = [0])
df_wbc_with_topics = pd.concat([df_wbc_with_name, df_wbc_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_wbc_with_topics], ignore_index=True)

#Europe
df_europe = pd.DataFrame([['Europe']], columns = [0])
#BNP Paribas
df_bnp_with_name = pd.DataFrame([['BNP Paribas']], columns = [0])
df_bnp_with_topics = pd.concat([df_europe, df_bnp_with_name, df_bnp_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_bnp_with_topics], ignore_index=True)
#Crédit Agricole Group
df_cag_with_name = pd.DataFrame([['Crédit Agricole Group']], columns = [0])
df_cag_with_topics = pd.concat([df_cag_with_name, df_cag_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_cag_with_topics], ignore_index=True)
#Barclays PLC
df_barclays_with_name = pd.DataFrame([['Barclays PLC']], columns = [0])
df_barclays_with_topics = pd.concat([df_barclays_with_name, df_barclays_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_barclays_with_topics], ignore_index=True)
#Banco Santander SA
df_bssa_with_name = pd.DataFrame([['Banco Santander SA']], columns = [0])
df_bssa_with_topics = pd.concat([df_bssa_with_name, df_bssa_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_bssa_with_topics], ignore_index=True)
#Groupe BPCE
df_bpce_with_name = pd.DataFrame([['BNP Paribas']], columns = [0])
df_bpce_with_topics = pd.concat([df_bpce_with_name, df_bpce_prefixed], ignore_index = True)
df_final = pd.concat([df_final, df_bpce_with_topics], ignore_index=True)


excel_path = '/Users/houhiroshisakai/Desktop/Schulich/Term 2/MBAN 6090/ResultIntegration.xlsx'
df_final.to_excel(excel_path, sheet_name='Bank_Topics', index=False)