# Importing packages

In [None]:
# Standard packages
import pandas as pd
import os
import re

# Natural language (pre)processing
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


# Wordcloud for descriptive images
from wordcloud import WordCloud

# LDA packages
from gensim import matutils, models
import scipy.sparse
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Importing the Twitter dataset

In [None]:
# Load the Twitter hashtag export that the rest of the notebook works on.
Tourists = pd.read_csv(
    'TwitterHashtags_LDA_NL.csv',
    sep=',',
    low_memory=False,
    lineterminator='\n',
)

# Basic Data Cleaning

In [None]:
# Show the dataframe's dimensions as a quick check on the import.
Tourists.shape

In [None]:
# Lower-case every tweet in the 'text' column.
Tourists['text'] = Tourists['text'].map(
    lambda tweet: tweet.lower()
)

In [None]:
# Remove punctuation (only the characters , . ! ? and @ are stripped).
# The pattern is a raw string: '\.' inside a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions. Inside a
# character class '.' is already literal, so the pattern matches the same characters.
Tourists['text'] = Tourists['text'].map(lambda x: re.sub(r'[,.!?@]', '', x))

In [None]:
# Strip hyperlinks: anything from "http" up to the next whitespace character.
hyperlinkPattern = re.compile(r"http\S+")
Tourists['text'] = Tourists['text'].map(
    lambda tweet: hyperlinkPattern.sub("", tweet)
)

In [None]:
# Remove words of 1 to 3 characters (a crude stopword filter).
# \w also matches digits and underscores, so short numbers are removed as well.
# The surrounding whitespace is kept, which can leave consecutive spaces behind.
Tourists['text'] = Tourists['text'].map(lambda x: re.sub(r'\b\w{1,3}\b', '', x))

In [None]:
# Preview the first rows after the basic cleaning steps.
Tourists.head()

# Natural Language Toolkit cleaning

In [None]:
# Create Stemming objects
pStemmer = PorterStemmer() # English only
lStemmer = LancasterStemmer() # English only
# Snowball stemmers for the top 5 languages in the twitter dataset.
# Each variable is bound to the stemmer of the language it is named after; the
# Spanish and German stemmers were previously created with each other's language.
# NOTE(review): ignore_stopwords=True presumably needs the NLTK stopwords corpus
# to be available locally - confirm it is downloaded before running this cell.
dutchStemmer = SnowballStemmer("dutch", ignore_stopwords=True)
englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
spanishStemmer = SnowballStemmer("spanish", ignore_stopwords=True)
frenchStemmer = SnowballStemmer("french", ignore_stopwords=True)
germanStemmer = SnowballStemmer("german", ignore_stopwords=True)

#### Creating stemming functions that can be applied to the text in the dataframe

In [None]:
def stemDutch(inputSentence):
    """Stem every token of ``inputSentence`` with the module-level ``dutchStemmer``.

    The sentence is split with ``word_tokenize``. Each stemmed token is followed
    by one space, so a non-empty result ends with a trailing space and a sentence
    without tokens gives an empty string.
    """
    # Pair every stem with its trailing space, then glue the pieces together.
    stemsWithSpaces = [dutchStemmer.stem(token) + " " for token in word_tokenize(inputSentence)]
    return "".join(stemsWithSpaces)

In [None]:
def stemEnglish(inputSentence):
    """Stem every token of ``inputSentence`` with the module-level ``englishStemmer``.

    The sentence is split with ``word_tokenize``. Each stemmed token is followed
    by one space, so a non-empty result ends with a trailing space and a sentence
    without tokens gives an empty string.
    """
    # Pair every stem with its trailing space, then glue the pieces together.
    stemsWithSpaces = [englishStemmer.stem(token) + " " for token in word_tokenize(inputSentence)]
    return "".join(stemsWithSpaces)

In [None]:
def stemSpanish(inputSentence):
    """Stem every token of ``inputSentence`` with the module-level ``spanishStemmer``.

    The sentence is split with ``word_tokenize``. Each stemmed token is followed
    by one space, so a non-empty result ends with a trailing space and a sentence
    without tokens gives an empty string.
    """
    # Pair every stem with its trailing space, then glue the pieces together.
    stemsWithSpaces = [spanishStemmer.stem(token) + " " for token in word_tokenize(inputSentence)]
    return "".join(stemsWithSpaces)

In [None]:
def stemFrench(inputSentence):
    """Stem every token of ``inputSentence`` with the module-level ``frenchStemmer``.

    The sentence is split with ``word_tokenize``. Each stemmed token is followed
    by one space, so a non-empty result ends with a trailing space and a sentence
    without tokens gives an empty string.
    """
    # Pair every stem with its trailing space, then glue the pieces together.
    stemsWithSpaces = [frenchStemmer.stem(token) + " " for token in word_tokenize(inputSentence)]
    return "".join(stemsWithSpaces)

In [None]:
def stemGerman(inputSentence):
    """Stem every token of ``inputSentence`` with the module-level ``germanStemmer``.

    The sentence is split with ``word_tokenize``. Each stemmed token is followed
    by one space, so a non-empty result ends with a trailing space and a sentence
    without tokens gives an empty string.
    """
    # Pair every stem with its trailing space, then glue the pieces together.
    stemsWithSpaces = [germanStemmer.stem(token) + " " for token in word_tokenize(inputSentence)]
    return "".join(stemsWithSpaces)

#### Lemmatization function (English only)

In [None]:
# Lemmatization
# Shared WordNet lemmatizer instance; lemmatizeSentence uses it for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatizeSentence(inputSentence):
    """Lower-case, tokenize and lemmatize ``inputSentence``.

    Punctuation tokens are dropped and every remaining token is lemmatized as a
    verb (``pos="v"``) with the module-level ``lemmatizer``.

    Parameters
    ----------
    inputSentence : str
        The sentence to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens, each followed by one space (so a non-empty result
        ends with a trailing space); an empty string when no tokens remain.
    """
    # Tokens that are found inside this string are treated as punctuation.
    punctuations = "?:!.,;"

    # Transform to lower case and tokenize the words.
    tokens = nltk.word_tokenize(inputSentence.lower())

    # Build a new list rather than calling remove() on the list being iterated:
    # removing during iteration skips the token that follows each removed one,
    # so consecutive punctuation marks were not all filtered out.
    words = [word for word in tokens if word not in punctuations]

    # Lemmatize each remaining word; pos="v" treats every word as a verb.
    return "".join(lemmatizer.lemmatize(word, pos="v") + " " for word in words)

In [None]:
# Run the five stemming functions one after another over every tweet and store
# the outcome in a new 'text_processed' column (only the first time).
stemmedText = Tourists['text']
for stemFunction in (stemDutch, stemEnglish, stemSpanish, stemFrench, stemGerman):
    # Each pass stems the output of the previous pass.
    stemmedText = stemmedText.map(stemFunction)
Tourists['text_processed'] = stemmedText

In [None]:
# Word cloud that gives a visual impression of the processed text.

# Glue all processed tweets into one comma-separated string.
allWords = ','.join(Tourists['text_processed'].tolist())

# Configure the word cloud, then fill it with the combined text.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(allWords)

# Render the cloud as an image so the notebook displays it.
wordcloud.to_image()

# Latent Dirichlet Allocation

In [None]:
# Download the tagger resource needed by pos_tag, which getNouns and
# getNounsAndAdj call.
nltk.download('averaged_perceptron_tagger')

#### Functions to process the text more, maintaining only nouns or adjectives or both

In [None]:
def getNouns(inputText):
    """Return the nouns of ``inputText`` joined by single spaces.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``; a token
    is kept when the first two characters of its tag are 'NN'.
    """
    # Tag every token with its part of speech.
    taggedTokens = pos_tag(word_tokenize(inputText))

    # Keep only the tokens whose tag marks them as a noun.
    return ' '.join(token for token, tag in taggedTokens if tag[:2] == 'NN')

In [None]:
def getNounsAndAdj(inputText):
    """Return the nouns and adjectives of ``inputText`` joined by single spaces.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``; a token
    is kept when the first two characters of its tag are 'NN' or 'JJ'.
    """
    # Tag every token with its part of speech.
    taggedTokens = pos_tag(word_tokenize(inputText))

    # Keep only the tokens tagged as a noun or an adjective.
    return ' '.join(token for token, tag in taggedTokens if tag[:2] in ('NN', 'JJ'))

In [None]:
# Create two new dataframes, one for nouns and one for nouns and adjectives.
# Together with the original this gives three dataframes to perform LDA on.
Tourists_Nouns = Tourists.copy()
Tourists_NounsAndAdj = Tourists.copy()

In [None]:
# Reduce every processed tweet to its nouns and store them in a 'nouns' column.
Tourists_Nouns['nouns'] = Tourists_Nouns['text_processed'].map(getNouns)

In [None]:
# Reduce every processed tweet to its nouns and adjectives.
# The column is still called 'nouns', although it also holds adjectives.
Tourists_NounsAndAdj['nouns'] = Tourists_NounsAndAdj['text_processed'].map(getNounsAndAdj)

#### Transform the dataframes to the final form for LDA

In [None]:
# Preview the dataframe before it is reduced to the columns needed for LDA.
Tourists.head()

In [None]:
# Only keep two columns: the hashtag and the text variant of each dataframe.
Tourists = Tourists[['hashtag', 'text_processed']]
Tourists_Nouns = Tourists_Nouns[['hashtag', 'nouns']]
Tourists_NounsAndAdj = Tourists_NounsAndAdj[['hashtag','nouns']]

In [None]:
# Use the hashtag as the index of all three dataframes (modified in place).
for frame in (Tourists, Tourists_Nouns, Tourists_NounsAndAdj):
    frame.set_index('hashtag', inplace=True)

In [None]:
# Adding stopwords
# Extra stop words are merged with scikit-learn's English stop word list.
# NOTE(review): the only extra entry is the literal word 'stopwords', which looks
# like a placeholder, and the base list is English although the input file is the
# Dutch (_NL) export - confirm both are intended.
add_stop_words = ['stopwords']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

#### Creating a document term matrix

In [None]:
# Create count vectorizer object including some stopwords
# stop_words is a frozenset here; recent scikit-learn releases validate this
# parameter and only accept 'english', a list or None, so it is converted to a list.
cvn = CountVectorizer(stop_words=list(stop_words))

In [None]:
# Create a sparse count matrix from the fully processed text
# (the 'text_processed' column, not the noun-only variants).
SparseMatrix = cvn.fit_transform(Tourists.text_processed)

In [None]:
# Create a document term matrix
# get_feature_names() is not available in newer scikit-learn releases, where
# get_feature_names_out() replaces it; use the new accessor when it exists and
# fall back to the old one otherwise.
if hasattr(cvn, 'get_feature_names_out'):
    featureNames = cvn.get_feature_names_out()
else:
    featureNames = cvn.get_feature_names()
DocumentTermMatrix = pd.DataFrame(SparseMatrix.toarray(), columns=featureNames)

In [None]:
# Change the index of the document term matrix to match the original index,
# i.e. the hashtags that index Tourists.
DocumentTermMatrix.index = Tourists.index

In [None]:
# Show the document term matrix (one column per vocabulary term).
DocumentTermMatrix

In [None]:
# Build the gensim corpus from the transposed document term matrix.
termDocumentMatrix = scipy.sparse.csr_matrix(DocumentTermMatrix.transpose())
corpus = matutils.Sparse2Corpus(termDocumentMatrix)

In [None]:
# Invert the vectorizer's term -> column id mapping into column id -> term.
vocabulary = {columnId: term for term, columnId in cvn.vocabulary_.items()}

In [None]:
# Fit the LDA model: 20 topics, 10 passes over the corpus.
# NOTE(review): no random_state is passed, so the fitted topics presumably
# differ between runs - confirm whether reproducibility is needed.
lda = models.LdaModel(
    corpus=corpus,
    id2word=vocabulary,
    num_topics=20,
    passes=10,
)

In [None]:
# Retrieve the topics of the LDA (all 20) for inspection.
test = lda.show_topics(num_topics=20)

In [None]:
# Display the topics retrieved from the model.
test

In [None]:
# Apply the fitted model to the corpus.
corpus_transformed = lda[corpus]
# Get the list of topic probabilities per document (per tweet).
# Each entry is indexed below as a sequence of (topic, probability) pairs.
topicList = list(corpus_transformed)

In [None]:
# Create a highest topic list; the loop in the next cell appends one entry per document.
# Re-run this cell before re-running that loop, otherwise entries are appended twice.
highestTopicList = []

In [None]:
# Pick the most probable topic for every document and append it to highestTopicList.
for documentTopics in topicList:

    # Start below any real probability; -1 stays the result when no topics are listed.
    bestTopic, bestProbability = -1, -1

    # Walk through the (topic, probability) pairs of this document.
    for topicId, probability in documentTopics:

        # A strict comparison keeps the first topic when probabilities tie.
        if probability > bestProbability:
            bestTopic, bestProbability = topicId, probability

    # Record the winner for this document.
    highestTopicList.append(bestTopic)

In [None]:
# Zip the highest topic list and indexes together,
# producing (topic number, hashtag) pairs.
topicPerPost = list(zip(highestTopicList, Tourists.index))

In [None]:
# Instantiate an empty dictionary that will map each hashtag to its topic name.
topicDictionary = {}

In [None]:
# Walk through topicPerPost to 1) build the topic names and 2) fill the dictionary.
for topicNumber, hashtag in topicPerPost:
    # Topic numbers are shifted by one, so the names start at 'Dutch Topic 1'.
    topicDictionary[hashtag] = 'Dutch Topic ' + str(topicNumber + 1)

In [None]:
# Turn the dictionary into a dataframe: hashtags as index, one 'topic' column.
topicDataframe = (
    pd.DataFrame(topicDictionary, index=[0])
    .T
    .rename(columns={0: "topic"})
)

In [None]:
# Get an overview of the topic distribution over the hashtags
# (number of hashtags per topic).
topicDataframe['topic'].value_counts()

In [None]:
# Export, import, export to fill in the right column names.
# 1) Write the topic table to disk.
topicDataframe.to_csv('Topics_Twitter_NL.csv')
# 2) Read it back; the former index column then carries the name 'Unnamed: 0'.
topicDataframe_Imported = pd.read_csv('Topics_Twitter_NL.csv', sep= ',', low_memory = False, lineterminator='\n')
# 3) Give that column its proper name and overwrite the file without an index column.
topicDataframe_Imported.rename(columns = {'Unnamed: 0':'hashtag'}, inplace=True)
topicDataframe_Imported.to_csv('Topics_Twitter_NL.csv', index=False)

# Assigning the topics to the tweets

In [None]:
# Preview the hashtag -> topic table.
topicDataframe_Imported.head()

In [None]:
# Import the hashtag dataset again (the same file that was loaded at the top).
# NOTE(review): the variable is named Tourists_EN although the file is the _NL
# export - confirm the name is only a leftover.
Tourists_EN = pd.read_csv('TwitterHashtags_LDA_NL.csv', sep= ',', low_memory = False, lineterminator='\n')

In [None]:
# Import the hashtag -> topic table that was written to disk earlier.
topicDataframe_Imported = pd.read_csv('Topics_Twitter_NL.csv', sep= ',', low_memory = False, lineterminator='\n')

In [None]:
# Display the re-imported hashtag dataset.
Tourists_EN

In [None]:
# Attach the topic of every hashtag to the hashtag dataset, joining on 'hashtag'.
Hashtag_Merged = Tourists_EN.merge(topicDataframe_Imported, on='hashtag')

In [None]:
# Export to csv for reuse.
# NOTE(review): the files read in this notebook carry the _NL suffix while this
# output uses _EN - confirm the file name is intended and does not overwrite
# the output of another notebook.
Hashtag_Merged.to_csv('TwitterHashtags_LDA_EN.csv', index=False)

In [None]:
# Preview the merged hashtag/topic dataframe.
Hashtag_Merged.head()

In [None]:
# Import the tourists. This rebinds the name Tourists, replacing the
# hashtag-indexed dataframe used for the LDA above.
Tourists = pd.read_csv('tourists_total.csv', sep= ',', low_memory = False, lineterminator='\n')

In [None]:
# Get only the Dutch tourists (rows whose 'lang' value is 'nl').
Tourists = Tourists[Tourists['lang'] == 'nl']

In [None]:
# Create an empty list to store the topic and tweet matches.
# The loop in the next cell fills it with [item_number, topics] pairs.
TopicsPerTweet = []

In [None]:
# Fill TopicsPerTweet in a loop: one [item_number, topics] entry per tweet, where
# topics is a comma-separated string of the topics of all hashtags matched to it.

# Map every item number to its position in TopicsPerTweet so a tweet that was seen
# before is found with a single dictionary lookup. Previously a list of all item
# numbers was rebuilt and scanned for every matched item, which made the loop
# quadratic in the number of tweets.
positionOfItem = {}
for position, entry in enumerate(TopicsPerTweet):
    # setdefault keeps the first position, matching what list.index() returned.
    positionOfItem.setdefault(entry[0], position)

# The number of rows does not change during the loop, so compute it once.
totalRows = len(Hashtag_Merged)

for index, row in Hashtag_Merged.iterrows():

    # The topic of this hashtag, shared by all tweets matched to it.
    topicOfRow = str(row['topic'])

    # Iterate over the matched tweets of this hashtag.
    for i in row['matched_items'].split(','):

        # Transform the string to an item number int to match it with the tourist dataset.
        item_number = int(i.strip())

        # If the item number is already in the list, concatenate the topic.
        if item_number in positionOfItem:
            TopicsPerTweet[positionOfItem[item_number]][1] += (',' + topicOfRow)

        # Else add a new entry and remember where it is stored.
        else:
            positionOfItem[item_number] = len(TopicsPerTweet)
            TopicsPerTweet.append([item_number, topicOfRow])

    # Print the current progress.
    print((index/totalRows))

        
        
    

In [None]:
# Create a dataframe with all the information: one row per tweet with its
# item number and the comma-separated topics collected for it.
TopicsPerTweet_DF = pd.DataFrame(TopicsPerTweet, columns = ['item_number', 'topics'])  

In [None]:
# Collapse repeated topics so every tweet lists each of its topics only once.
TopicsPerTweet_Cleaned = []
for index, row in TopicsPerTweet_DF.iterrows():

    # Get all the topics in a list.
    topics = row['topics'].split(',')

    # Drop duplicates while keeping the order of first appearance. A set was used
    # before, but the order of a set of strings can change between interpreter
    # runs, which made the resulting topic strings irreproducible.
    topicsUnique = list(dict.fromkeys(topics))

    # Join the unique topics with ', ' as separator.
    topicString = ', '.join(topicsUnique)

    TopicsPerTweet_Cleaned.append([row['item_number'], topicString])
    

In [None]:
# Create a cleaned dataframe with all the information: one row per tweet with
# its item number and its de-duplicated topics.
TopicsPerTweet_Cleaned_DF = pd.DataFrame(TopicsPerTweet_Cleaned, columns = ['item_number', 'topics'])  

In [None]:
# Helper that counts the elements of an iterable (the characters of a string),
# written as a plain function so it can be used inside a lambda.
def findLen(str):
    """Return how many items iterating over ``str`` yields."""
    return sum(1 for _ in str)

In [None]:
# Store the character length of every topic string in a 'length' column.
TopicsPerTweet_Cleaned_DF['length'] = TopicsPerTweet_Cleaned_DF['topics'].map(findLen)

In [None]:
# Keep the rows whose topic string is shorter than 14 characters.
# NOTE(review): 'Dutch Topic 1' to 'Dutch Topic 9' are 13 characters long, but
# 'Dutch Topic 10' to 'Dutch Topic 20' are 14, so tweets with a single topic of
# 10 or higher are not selected here - confirm that is acceptable.
TopicsPerTweet_Cleaned_OnlySingleTopic = TopicsPerTweet_Cleaned_DF[TopicsPerTweet_Cleaned_DF['length'] < 14]

In [None]:
# Count how many of the selected tweets fall under each topic.
TopicsPerTweet_Cleaned_OnlySingleTopic['topics'].value_counts()

In [None]:
# Keep the rows whose topic string is at least 14 characters long.
# NOTE(review): besides tweets with several topics this also selects tweets with
# one two-digit topic ('Dutch Topic 10' to 'Dutch Topic 20' are 14 characters).
TopicsPerTweet_Cleaned_MultipleTopics = TopicsPerTweet_Cleaned_DF[TopicsPerTweet_Cleaned_DF['length'] >= 14]

In [None]:
# Weight of every topic; position 0 belongs to topic 1, position 19 to topic 20.
# NOTE(review): these values are hard-coded - presumably taken from the topic
# distribution of one specific LDA run; confirm they match the current model.
probabilityDistributionTopics = [0.36348639, 0.079136151, 0.079961007, 0.055565276, 0.0537156, 0.047416702, 0.038393281, 0.031669458, 0.031594471, 0.028869948, 0.026720324, 0.024995626, 0.022096133, 0.02177119, 0.021221286, 0.020546404, 0.017396956, 0.016697078, 0.011672957, 0.007073762]
MultipleTopicsPerTweet_Cleaned = []

# Assign just one topic to those tweets based on the probability distribution of the hashtags.
for index, row in TopicsPerTweet_Cleaned_MultipleTopics.iterrows():

    # Get the topics.
    topics = row['topics'].split(', ')

    # Best candidate found so far for this tweet.
    highestProbability = -1
    bestTopic = ''

    # Iterate over the topics.
    for candidate in topics:

        # The topic number is the third word of the name, e.g. 'Dutch Topic 7' -> 7.
        topicNumber = int(candidate.split(' ')[2])

        # Find the probability in the distribution (topic numbers start at 1).
        probability = probabilityDistributionTopics[topicNumber - 1]

        # If the probability is higher than the one before, remember this topic.
        if probability > highestProbability:
            highestProbability = probability
            bestTopic = candidate

    # Store the most probable topic. Previously the loop variable was appended,
    # which always stored the last topic of the list instead of the best one.
    MultipleTopicsPerTweet_Cleaned.append([row['item_number'], bestTopic])
        
    
    


In [None]:
# Create a cleaned dataframe with all the information: one row per tweet with
# its item number and the single topic assigned to it.
MultipleTopicsPerTweet_Cleaned_DF = pd.DataFrame(MultipleTopicsPerTweet_Cleaned, columns = ['item_number', 'topics'])  

In [None]:
# Combine everything into one final dataframe: the tweets that were reduced to
# one topic plus the tweets selected by the single-topic filter.
frames = [MultipleTopicsPerTweet_Cleaned_DF, TopicsPerTweet_Cleaned_OnlySingleTopic]
TopicsPerTweet_Final_EN = pd.concat(frames)

In [None]:
# Drop the helper 'length' column; only one of the two combined frames had it.
TopicsPerTweet_Final_EN.drop(columns='length', inplace = True)

In [None]:
# Every tweet now has one topic, so rename the column to the singular 'topic'.
TopicsPerTweet_Final_EN.rename(columns={'topics':'topic'}, inplace=True)

In [None]:
# Export to csv for reuse (the dataframe index is not written).
TopicsPerTweet_Final_EN.to_csv('tweets_topic_NL.csv', index=False)