# Topic Modelling/Voyant text preparation

To make topic topelling possible, you need to strip the text of all insignificant strips of words and make them as coherent as possible without losing quality. Key to this is a cleaning master function.

In [1]:
import pandas as pd
from os.path import join
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from numpy import nan

# alternative model building package
# import sklearn
# from sklearn.decomposition import NMF

# package to clean text
import re

# Load modules for cleaning tasks
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import GermanStemmer

In [2]:
src = "/media/s/Linux_storage/Analyse_Verkehrswende_Transformation/Data/Greenwashing"

df = pd.read_csv(join(src, 'WordTree.csv'), encoding='utf-8', dtype='unicode')

In [3]:
# Standard cleaning tasks

def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # remove bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('RT @[\w_]+:', '', tweet) # remove retweet ## Alternatives: RT @[\w_]+: OR (RT\s@[A-Za-z]+[A-Za-z0-9-_]+)
    tweet = re.sub('@[A-Za-z0-9_]', '', tweet) # remove tweeted at ## Alternatives: @[A-Za-z0-9_]+ OR (@[A-Za-z]+[A-Za-z0-9-_]+)
    return tweet

my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.GermanStemmer(ignore_stopwords=False).stem
#word_rooter2 = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
#my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@' ## Alternatives: r'[^\w\s] OR !"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@ 

# cleaning master function
def clean_tweet(tweet, bigrams=False): # change bigrams to True to enable further analysis
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    #tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub(r'[^\w\s]', ' ', tweet) # strip punctuation
    tweet = re.sub(r'\b\w{1,3}\b', '', tweet, re.UNICODE)
    tweet = re.sub('\s+', ' ', tweet, re.UNICODE) # remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet, re.UNICODE) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords
    tweet_token_list = [word_rooter(word) if '#' not in word else word 
                        for word in tweet_token_list] # apply word rooter
    #tweet_token_list = [word_rooter2(word) if '#' not in word else word 
    #                    for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [5]:
df

Unnamed: 0,Body
0,\nSkip to main content\nAdvertisementInternati...
1,\nPlease enable cookies.\nError\n1020\nRay ID:...
2,\nSkip to main content\nAdvertisementInternati...
3,400 Bad request\nYour browser sent an invalid ...
4,\nClose\n \n聽/聽\n鈫惵犫啋\n \nSearch\nSearch\nYale...
...,...
2167,\nWhat鈥檚 On\nCulture\nStyle\nHealth & Beauty\n...
2168,\n \nHome\nSustainable Investment Week\nESG\nS...
2169,Not Acceptable!An appropriate representation o...
2170,\n \nSign in\nHome\nNews\nGlobal rubber market...


In [4]:
df['clean_text'] = df.Body.apply(clean_tweet) # This gets cleaned tweets

#df['clean_tweet'] = df['clean_tweet'].apply(
#    lambda x: x.replace("test", "")) # Merge certain singular words

In [8]:
df['clean_text']

0        skip main content advertisementinternational ...
1        pleas enabl cooki error  eefd  access denied ...
2        skip main content advertisementinternational ...
3        requ your brows sent invalid requ _xd_ highly...
4        clos 鈫惵犫啋 search search yal environment publi...
                              ...                        
2167     what鈥檚 cultur styl health beauty fashion jewe...
2168     hom sustainabl investment week sustainabl inv...
2169     acceptabl appropriat representation requested...
2170     sign hom news global rubb market news economi...
2171     tuesday july  breaking news giorgio chiellini...
Name: clean_text, Length: 2172, dtype: object

## Topic Modelling

In [5]:
src = "/media/s/Linux_storage/Analyse_Verkehrswende_Transformation/Data/Greenwashing"

df = pd.read_csv(join(src, 'greenwashing_clean.csv'), encoding='utf-8', dtype='unicode', 
                   parse_dates=['created_at'])

In [7]:
df.head

<bound method NDFrame.head of        Unnamed: 0                   id                created_at  \
0          689365  1109566187342565384 2019-03-23 21:22:48+00:00   
1          659519  1131217759877181440 2019-05-22 15:18:25+00:00   
2          526567  1204674273023053824 2019-12-11 08:08:23+00:00   
3          184374  1370839105123323905 2021-03-13 20:48:06+00:00   
4          675614  1116158418350432256 2019-04-11 01:57:58+00:00   
...           ...                  ...                       ...   
186858     633541  1155997398130257920 2019-07-30 00:23:51+00:00   
186859     633532  1156014938986434562 2019-07-30 01:33:33+00:00   
186860     633526  1156030252881068032 2019-07-30 02:34:24+00:00   
186861     633506  1156065050013999104 2019-07-30 04:52:41+00:00   
186862     579462  1186349718823194630 2019-10-21 18:33:08+00:00   

                                                     text  author.username  \
0       ADOS co-founder Yvette Carnell admits to being...      ImaniKushan 

In [5]:
df = df.dropna()

In [30]:
# Add additional stopwords
df['clean_tweet'] = df['clean_tweet'].apply(
    lambda x: x.replace("greenwashing", "")) # Merge certain singular words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_tweet'] = df['clean_tweet'].apply(


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation to get Matrix
tf = vectorizer.fit_transform(df['clean_text']).toarray()
#tf = vectorizer.fit_transform(df.pop('text'))

# adding "features" columns as SparseSeries
#for i, col in enumerate(vectorizer.get_feature_names()):
#    df[col] = pd.SparseSeries(tf[:, i].toarray().ravel(), fill_value=0)

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()


In [7]:
#tf_feature_names

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 15

# Here we have two different models: LDA vs NFM. NFM might work better with Tweet data.
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
#model = NMF(n_components=None, random_state=0, alpha=.1, l1_ratio=.5)

In [9]:
model.fit(tf)

def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        df = pd.DataFrame(topic_dict)
        df.to_csv('/media/s/Linux_storage/Analyse_Verkehrswende_Transformation/Data/Greenwashing' + '/' + 'topicmodelling_wordcloud.csv')
    return df


In [10]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,...,Topic 10 words,Topic 10 weights,Topic 11 words,Topic 11 weights,Topic 12 words,Topic 12 weights,Topic 13 words,Topic 13 weights,Topic 14 words,Topic 14 weights
0,de,1765.4,2021,651.0,_x000d_,3576.0,聽,3553.6,climat,1149.2,...,esg,2577.2,green,1190.4,news,815.5,ago,770.3,climat,3424.4
1,product,1090.5,jul,375.2,_xd_,1010.5,2020,710.4,news,741.7,...,investment,2361.0,journal,369.6,environmental,786.0,bitcoin,362.0,鈥,2869.6
2,la,887.3,1,354.9,english,473.3,2019,676.7,鈥,725.2,...,fund,1973.5,data,271.8,business,768.8,hour,278.6,energy,2183.4
3,greenwashing,803.7,news,354.2,law,469.7,2018,631.5,eu,635.7,...,investor,1798.9,articl,254.1,event,725.7,min,265.3,fuel,1637.7
4,di,708.9,investing,341.9,program,435.3,email,599.9,green,601.8,...,financial,1419.1,greenwashing,219.7,contact,684.1,mining,243.5,oil,1534.4
5,en,706.8,icon,326.7,paid,421.3,march,587.9,energy,522.7,...,market,976.6,sustainability,211.7,new,662.4,energy,198.0,gas,1477.9
6,e,593.7,read,324.5,best,304.7,july,586.8,uk,510.4,...,investing,961.5,climat,205.2,us,658.2,pow,189.6,fossil,1424.2
7,claim,467.8,more,317.3,news,300.2,april,564.8,said,487.6,...,asset,948.9,bond,193.7,greenwashing,616.5,鈥,187.9,chang,1351.7
8,consum,365.7,string,279.5,consum,223.1,february,549.5,july,475.8,...,鈥,870.3,issu,193.0,search,585.0,project,167.6,emission,1230.4
9,le,351.3,july,236.3,free,190.6,2017,526.3,european,461.0,...,risk,850.9,environmental,187.0,email,562.0,vaccin,159.6,new,1187.5
