In [1]:
%run setup.ipynb

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer 
from bs4 import BeautifulSoup


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\39324\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\39324\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words 
              (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 
            'keep_list', which have to be retained even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series(dtype = 'object')
    for row in corpus:
        row=html.unescape(row)
        qs = []
        for word in row.split():
            if word=='WHO' or word=='W.H.O' or word=='W.H.O.':
                qs.append('WHO')
            if word not in keep_list:
                #removal of  punctuation, articles, question marks, tabs but also URLS, hashtags
                #by re.sub: replacing using regular expressions
                #pattern is the pattern the function has to find in the string, and substitute it with repl
                #lowering all terms
                #urls
                p1 = re.sub(pattern='http://\S+|https://\S+',repl= '', string=word)
                #&amp removal
                p1 = re.sub(pattern='&(?!amp;)',repl='',string=p1)
                #hashtags
                p1 = re.sub(pattern='#[A-Za-z0-9_]+',repl= '', string=p1)
                #replace all non letters with spaces
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl='', string=p1)
                # retwetted usernames
                p1 = re.sub(pattern='RT @([a-zA-Z0-9_]{1,50} +)',repl='', string=p1)
                #usernames
                p1 = re.sub(pattern='@([a-zA-Z0-9_]{1,50} +)',repl='', string=p1)
                #punctuation and special characters
                p1 = re.sub(pattern=r'[^\w\s]',repl='', string=p1)
                p1 = p1.lower()
                qs.append(p1)
            else: qs.append(word)
        cleaned_corpus = pd.concat([cleaned_corpus, pd.Series(' '.join(qs))])
    return cleaned_corpus

  p1 = re.sub(pattern='http://\S+|https://\S+',repl= '', string=word)


In [4]:
#removal of words without a concrete meaning: wh words and stopwords in english dictionary represent useless data
#stopword are those of the NLTK Stopword List: english dicstionary
def stopwords_removal(corpus):
    stop = set(stopwords.words('english'))
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [5]:
#lemmatization with wordnetlemmatizer to normalize the text: removing suffixes to create standardized words
#lemma is an actual word, stem is not
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [6]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        print('snowball')
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else:
        print('stemmer')
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [7]:
def preprocess(corpus, keep_list, cleaning = True, 
               stemming = False, stem_type = None, 
               lemmatization = False, 
               remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks seen until now
              (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus'    - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean 
                  variables indicating whether a particular task should 
                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. 
                  Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. 
           There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)    
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus

# Functions for top n unigrams, bi-grams, tri-grams

UNIGRAMS

In [8]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

BIGRAMS

In [9]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

TRIGRAMS

In [10]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

TRANSFORM CORPUS IN A LIST OF UNIGRAMS FOR EVERY DOCUMENT

In [11]:
def list_unigrams(corpus_preprocessed):
    words = corpus_preprocessed.split()
    unigram_tuples = list(words)
    return [unigram_tuples[i] for i in range(len(unigram_tuples))]

TRANSFORM CORPUS IN A LIST OF BIIGRAMS FOR EVERY DOCUMENT

In [12]:
def list_bigrams(corpus_preprocessed):
    words = corpus_preprocessed.split()
    bigram_tuples = list(ngrams(words, 2))
    return ['{} {}'.format(bigram_tuples[i][0], bigram_tuples[i][1]) for i in range(len(bigram_tuples))]

SHOWING TOPICS

In [1]:
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords


GET DOMAIN NAME (GET THE NAME OF THE SOURCE FROM AN ANALYSED URL)

In [None]:
def get_domain_name(url):
    parsed_uri = urlparse(url)
    domain = parsed_uri.netloc
    return domain

SUM OF VALUES OF DICTIONARIES IN A LIST

In [None]:
def sum_of_values(word_count):
    sum_list = []
    for dictionary in word_count:
        sum_list.append(sum(dictionary.values()))
    return sum_list