<a href="https://colab.research.google.com/github/KampHost/caldiss/blob/master/TEXT_AKH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Portfolio Text Analysis

### Assignment 1: 
1. Take the following text and transform it into a list of lists, with each element being a tokenized sentence.
2. Remove stopwords, lower all tokens and keep only alpha-numeric tokens.


### Preamble

In [1]:
import pandas as pd
# Show full cell text. None means "no limit"; the old -1 sentinel is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
import numpy as np
import seaborn as sns
import itertools
from collections import Counter
# very short RegEx
import re
import nltk #this part is needed on colab.
# Fetch the tokenizer models and stopword lists used below.
# Recent NLTK releases load 'punkt_tab' for sent_tokenize/word_tokenize;
# 'punkt' is still fetched so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
# Tokenizing sentences
from nltk.tokenize import sent_tokenize
# Tokenizing words
from nltk.tokenize import word_tokenize
# Tokenizing Tweets made easy!
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Data and preprocessing

In [0]:
# Sample paragraph used throughout Assignment 1 (adjacent literals are joined into one string)
text = (
    "I’ve been called many things in my life, but never an optimist. "
    "That was fine by me. "
    "I believed pessimists lived in a constant state of pleasant surprise: "
    "if you always expected the worst, things generally turned out better than you imagined. "
    "The only real problem with pessimism, I figured, was that too much of it "
    "could accidentally turn you into an optimist."
)

In [0]:
# Naive sentence split: lowercase the text, then cut it at every full stop
lowercase_text = text.lower()
lowercase_text.split(".")

In [0]:
# Naive word split: cut the text at every single space character
text.split(sep=" ")

In [0]:
# English stopwords, written as whitespace-separated text and split into a list
# (no entry contains whitespace, so the split reproduces the entries in order)
stopwords_en = """
i me my myself we our ours
ourselves you you're you've you'll
you'd your yours yourself yourselves
he him his himself she she's her
hers herself it it's its itself
they them their theirs themselves what
which who whom this that that'll these
those am is are was were be been being
have has had having do does did doing a
an the and but if or because as until
while of at by for with about against
between into through during before after above
below to from up down in out on off over
under again further then once here there when
where why how all any both each few
more most other some such no nor not
only own same so than too very s t can will just
don don't should should've now d ll m o re ve
y ain aren aren't couldn couldn't didn didn't doesn
doesn't hadn hadn't hasn hasn't haven haven't isn
isn't ma mightn mightn't mustn mustn't needn needn't
shan shan't shouldn shouldn't wasn wasn't weren weren't
won won't wouldn wouldn't
""".split()

In [0]:
# Lowercase the text, split it on whitespace and discard every token found in stopwords_en
list(filter(lambda token: token not in stopwords_en, text.lower().split()))

In [0]:
# Drop punctuation, stopwords and numbers.
# str.strip() takes a set of characters (not a regex); these are the same
# characters the original pattern-like string contained.
PUNCTUATION = '[" ,.!?:;]'
# Strip first, so that tokens such as "me." or "2019." are recognised by the
# stopword / digit checks below instead of slipping through with punctuation attached.
stripped_words = (word.strip(PUNCTUATION) for word in text.lower().split())
[word for word in stripped_words
 if word and word not in stopwords_en and not word.isdigit()]

In [0]:
# Split the text into sentences with the NLTK sentence tokenizer
sentences = list(sent_tokenize(text))
print(sentences)

In [0]:
# Tokenize the second sentence (index 1) into words: tokenized_sent
second_sentence = sentences[1]
tokenized_sent = word_tokenize(second_sentence)
print(tokenized_sent)
# Collect the distinct tokens of the whole text: unique_tokens
unique_tokens = set(word_tokenize(text))
print(unique_tokens)

In [0]:
# NLTK's English stopwords as a set, for fast membership tests
stop_words = set()
stop_words.update(stopwords.words('english'))

In [0]:
# Assignment 1 result: one list of cleaned tokens per sentence.
# Each token is lowercased *before* the stopword lookup; checking the raw token
# lets capitalised stopwords such as "That" or "The" slip through.
[
    [token.lower() for token in word_tokenize(sentence)
     if token.isalnum() and token.lower() not in stop_words]
    for sentence in sent_tokenize(text)
]

### Assignment 2: 
1. Make a list of the most common hashtags.
Extract the hashtags from the tweet text itself rather than using the column that already contains them.

In [0]:
import pandas as pd
# Show full cell text. None means "no limit"; the old -1 sentinel is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

import numpy as np
import seaborn as sns

import itertools
from collections import Counter

In [0]:
# Tokenizing Tweets made easy!
from nltk.tokenize import TweetTokenizer
# Tweet-aware tokenizer with its default settings spelled out
tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)

In [0]:
# Read the zipped tweet JSON straight from the course repository and preview it
TWEETS_URL = 'https://github.com/CALDISS-AAU/sdsphd19_coursematerials/raw/master/data/tweets_boomer.zip'
tweets_df = pd.read_json(TWEETS_URL)
tweets_df.head()

In [0]:
# Index the tweets by their creation timestamp, parsed to datetime
# (not really needed but why not)
created_at_index = pd.to_datetime(tweets_df.created_at)
tweets_df = tweets_df.set_index(created_at_index)

In [0]:
# Identifying hashtags
def extract_hashtags(textline):
    """Return the hashtags found in one tweet text, lowercased.

    Tags are lowercased so that case variants of the same hashtag (e.g.
    '#OKBoomer' and '#okboomer') are counted together, and a lone '#' token
    is not treated as a hashtag.
    """
    return [token.lower() for token in tknzr.tokenize(textline)
            if token.startswith('#') and len(token) > 1]


tweets_df['tags'] = tweets_df['tweet'].map(extract_hashtags)

In [0]:
# Only keep tweets in which at least one hashtag is present
has_hashtag = tweets_df['tags'].map(len) > 0
tweets_df = tweets_df[has_hashtag]

In [0]:
# Collect the per-tweet tag lists into one flat list.
# A list (unlike a bare itertools.chain iterator) can be consumed more than once,
# so re-running the counting cell does not silently produce an empty Counter.
tags = list(itertools.chain.from_iterable(tweets_df['tags']))

In [0]:
# Tally how often each hashtag occurs
counted_tags = Counter()
counted_tags.update(tags)

In [0]:
# The 11 most frequent hashtags, most frequent first
counted_tags.most_common(11)

# Assignment 3

### Perform an LDA analysis of the #OKBoomer dataset

- Filter the corpus using tweet-preprocessor - try to figure out how to use it using its documentation
- Clean up further with SpaCy (keep only ADV, ADJ, NOUN)
- Use Gensim to build a Dictionary (Filter extremes) and Corpus
- Use Gensim to run LDA
- Identify 10 topics
- Plot topic-counts by day

### Preamble

In [0]:
# Load the tweet data for the LDA analysis
BOOMER_URL = 'https://github.com/CALDISS-AAU/sdsphd19_coursematerials/raw/master/data/tweets_boomer.zip'
boomer = pd.read_json(BOOMER_URL)

In [0]:
#Filter the corpus using tweet-preprocessor
! pip install tweet-preprocessor
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.HASHTAG)

In [0]:
# Run every tweet text through the configured tweet-preprocessor cleaner
clean_tweet = boomer['tweet'].apply(p.clean)

In [0]:
# Wrap the cleaned tweets in a DataFrame
clean_tweet = pd.DataFrame(data=clean_tweet)

In [0]:
# SPACY tokenizer and filter
import spacy
# The "en" shortcut was removed in spaCy v3; load the small English model by
# its package name instead.
nlp = spacy.load("en_core_web_sm")

In [0]:
# For each cleaned tweet keep the lowercased lemmas of nouns, adjectives and
# adverbs that spaCy does not flag as stopwords
KEPT_POS = ['NOUN', 'ADJ', 'ADV']
tokens = [
    [word.lemma_.lower() for word in doc if word.pos_ in KEPT_POS and not word.is_stop]
    for doc in nlp.pipe(clean_tweet['tweet'])
]

In [9]:
# Upgrade gensim (-U) with quiet output (-qq), then import the Dictionary class
# that is used to build the vocabulary
!pip install -qq -U gensim
from gensim.corpora.dictionary import Dictionary

[K     |████████████████████████████████| 24.2MB 100kB/s 
[?25h

In [0]:
# Build the gensim Dictionary from the tokenized tweets
dictionary = Dictionary(documents=tokens)

In [0]:
# Filter out low-/high-frequency tokens and cap the vocabulary size.
# NOTE(review): the earlier comment said "max 1000 words" while keep_n is 100 —
# confirm which vocabulary size is intended.
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=100,
)

In [0]:
# Convert every tokenized tweet to its bag-of-words form using the dictionary
corpus = list(map(dictionary.doc2bow, tokens))

In [0]:
# we'll use the faster multicore version of LDA
from gensim.models import LdaMulticore

In [0]:
# Training the model (makes some mess atm due to version clashes)
# random_state fixes the random initialisation so the topics do not change
# arbitrarily between runs of the notebook.
lda_model = LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=10,
    workers=4,
    passes=10,
    random_state=42,
)

In [0]:
# For every tweet, take the id of the topic with the highest probability
# (each model result is a list of (topic_id, probability) pairs)
predict_topic = [
    max(doc_topics, key=lambda pair: pair[1])[0]
    for doc_topics in lda_model[corpus]
]

In [0]:
# Attach the predicted topic id of each tweet as a new column
boomer = boomer.assign(predicted=predict_topic)

In [0]:
# Unique predicted topic ids, sorted so the column and legend order is the same on every run
topic_list = sorted(set(predict_topic))

In [0]:
# Initialise one column per topic id, filled with zeros
for col in topic_list:
    boomer.loc[:, col] = 0

In [0]:
# Put a 1 in the column of each tweet's predicted topic and a 0 in all others
for topic in topic_list:
    boomer[topic] = (boomer['predicted'] == topic).astype(int)

In [0]:
# Set the index to the tweet creation time, parsed to datetime
boomer = boomer.set_index(pd.to_datetime(boomer.created_at))

In [0]:
# Daily tweet counts per topic.
# Select the indicator columns through topic_list (the unique topic ids);
# predict_topic holds one entry per tweet and would select each topic column
# once for every tweet assigned to it.
boomer.resample('D')[topic_list].sum().plot(title='Tweets per topic by day')