In [1]:
#April 12, 2020
#Obtains Developer Key for Reddit, obtains Client ID and Client Secret in Developer Mode
CLIENT_ID = "4sHFBhBufH9AZA" 
CLIENT_SECRET = "7zgcETH6V9MwN_PjQs37iOTADSg"

In [2]:
#Creates a User Agent for Reddit API. Uses User Agent String Test Version 0.1 and Reddit Username
USER_AGENT = "python:<test0.1> (by /u/OwnCicada9)"

In [3]:
#Reddit Credentials, Personal Account To Access Data
USERNAME = "OwnCicada9" 
PASSWORD = "123456"

In [4]:
#Function to log in with Reddit Credentials. Reddit Login API returns a token for further connections.
#Logs in to Reddit, Set User Agent, obtains Access Token for future requests.
import requests
def login(username, password):
    if password is None:
        password = getpass.getpass("Enter reddit password for user {}: ".format(username))    
    headers = {"User-Agent": USER_AGENT}
    # Setup an auth object with our credentials
    client_auth = requests.auth.HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET)
    # Make a post request to the access_token endpoint
    post_data = {"grant_type": "password", "username": username, "password": password}
    response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth,     
                             data=post_data, headers=headers) 
    return response.json()

In [5]:
#Call function to get the Access Token
token = login(USERNAME, PASSWORD)

In [6]:
#Token object (dictionary) contains the access_token string for future requests, contains scope of the token (everything) and time expired 
token

{'access_token': '475024319316-7An3zIPnHs72_fRB3FJIboR6NXE',
 'token_type': 'bearer',
 'expires_in': 3600,
 'scope': '*'}

In [7]:
#Uses token to obtain sets of links from subreddit. Uses /r/<subredditname> API endpoint to return Hot (function of age, up and down votes and liberal content) stories
subreddit = "worldnews"

In [8]:
#URL for previous endpoints used to create full URL using string formatting
url = "https://oauth.reddit.com/r/{}".format(subreddit)

In [9]:
#Set headers to allow authorization token and set user agent to stop requests from being heavily restricted
headers = {"Authorization": "bearer {}".format(token['access_token']), 
"User-Agent": USER_AGENT}

In [10]:
#Use requests library for function call, ensuring setting the headers
response = requests.get(url, headers=headers)

In [11]:
#Calls json() which results in Python Dictionary with the information returned by Reddit
result = response.json()

In [12]:
#Contains top 25 results from given subreddit
#Get title by iterating over the stories in the response, these stories are stored in dictionary's data key
for story in result['data']['children']: 
    print(story['data']['title'])

Livethread 11: Global COVID-19 Pandemic
The pope just proposed a universal basic income.
Yemeni Women Launch Their Own 'Me Too' Online Campaign Against Sexual Harassers
No return to ‘normality’ until coronavirus vaccine is available, Trudeau says
UK spy agencies urge China rethink once Covid-19 crisis is over: Britain’s intelligence community believes the UK needs to reassess its relationship with China after the coronavirus crisis subsides and consider if tighter controls are needed over high-tech and other strategic industries.
Malaysian bears in peril as China pushes cure for Covid-19: 'Two environmental groups have warned of a threat to the survival of the already endangered Malaysian sun bear, now that China is promoting bear bile as treatment for Covid-19.'
UK Prime Minister Boris Johnson thanks hospital staff, saying 'I owe them my life'
Nurses told to refuse to treat coronavirus patients if they have inadequate PPE
Boris Johnson leaves hospital as he continues recovery from cor

In [13]:
#Uses sleep function to rate limit our calls , as our code makes repeated calls to an API
from time import sleep

In [14]:
#Function accepts a subreddit name and authorization token
#Accept a number of pages to read, default of 5 here, gets cursor to get next page of results, set after first page, not needed for first
def get_links(subreddit, token, n_pages=5):
    stories = []
    after = None
    for page_number in range(n_pages):
        # Sleep before making calls to avoid going over the API limit
        sleep(2)
        # Setup headers and make call, just like in the login function
        headers = {"Authorization": "bearer {}".format(token['access_token']), "User-Agent": USER_AGENT} 
        url = "https://oauth.reddit.com/r/{}?limit=100". format(subreddit)
        if after:
            # Append cursor for next page, if we have one
            url += "&after={}".format(after)
        response = requests.get(url, headers=headers)
        result = response.json()
        # Get the new cursor for the next loop
        after = result['data']['after']
        # Add all of the news items to our stories list
        for story in result['data']['children']:
            stories.append((story['data']['title'], story['data']['url'], story['data']['score']))
    return stories

In [15]:
#Calls stories function by passing authorization token and subreddit name
stories = get_links("worldnews", token)
print(len(stories))

501


In [16]:
#Sets up data folder path to download full webpage from each link and stores them in raw subfolder
import os 
data_folder = os.path.join(os.path.expanduser("~"), "OneDrive", "Desktop", "Data Mining", "Chapter10HW", "Data", "websites", "raw")

In [17]:
#Imports hash function for MD5 hashing to create unique filenames for articles y hashing the URL
#Hash function converts our input (string containing the title) to string that seems to be random, same input = same output each iteration, slightly different input = very different output, one-way function
import hashlib

In [18]:
#Skip any website downloads that fail, maintain a simple counter of number of errors and surpresses them, if counter is too high we attempt to resolve them
number_errors = 0

In [None]:
#Iterate through each of the stories, download the website and save results to file
for title, url, score in stories:
    output_filename = hashlib.md5(url.encode()).hexdigest() 
    fullpath = os.path.join(data_folder, output_filename + ".txt")
    try: 
        response = requests.get(url) 
        data = response.text 
        with open(fullpath, 'w') as outf: 
            outf.write(data)
    except Exception as e:
        number_errors += 1
        # You can use this to view the errors, if you are getting too many:
        #raise

In [None]:
#Returns number of errors
number_errors

In [None]:
#Find story in each of the raw data elements using simple algorithm
filenames = [os.path.join(data_folder, filename) for filename in os.listdir(data_folder)]

In [24]:
#Create output folder for text only versions to extract  
text_output_folder = os.path.join(os.path.expanduser("~"), "OneDrive", "Desktop", "Data Mining", "Chapter10HW", "Data", "websites", "textonly")

In [25]:
#Use lxml library to parse HTML files and extract text from the files
import lxml
from lxml import etree

In [26]:
#Create list to ignore nodes of Javascript, styles and comments
skip_node_types = ["script", "head", "style", etree.Comment]

In [27]:
#Create function to parse HTML file into lxml etree, and create separate function to parse the tree while looking for text
#Call getroot() function to get root node instead of full etree to write text extraction function for any node and then write a recursive function, call child nodes to extract text and concatenate child nodes
parser = etree.HTMLParser()

def get_text_from_file(filename):
    with open(filename) as inf:
        html_tree = etree.parse(inf, parser) 
    return get_text_from_node(html_tree.getroot())

In [28]:
#Check that the text is at least 100 characters long
def get_text_from_node(node):
    if len(node) == 0: 
        # No children, just return text from this item
        if node.text: 
            return node.text 
        else:
            return ""
    else:
        # This node has children, return the text from it:
        results = (get_text_from_node(child)
                   for child in node
                   if child.tag not in skip_node_types)
    result = str.join("\n", (r for r in results if len(r) > 1))
    if len(result) >= 100:
        return result
    else:
        return ""

In [29]:
#Execute code to stop blank return lines from being returned (node has no children or text) and use a generator , to make more efficient by iterating through raw HTML pages, calling extraction text function and saving results to textonly subfolder
for filename in os.listdir(data_folder):
    text = get_text_from_file(os.path.join(data_folder, filename)) 
    with open( os.path.join(text_output_folder, filename), 'w') as outf: 
        outf.write(text)

TypeError: object of type 'NoneType' has no len()

In [30]:
#Uses K-Means Algorithm from Sci-kit Learn, which imported from the cluster module
from sklearn.cluster import KMeans

In [31]:
#Import TfidfVectorizer class which applies a weighting to the counts of each term's count depending on the documents it appears in using : tf/ log(df)
#tf = term frequency (how many times it appears in the document), df = terms document frequency (how many documents the corpus it appears in)
#Improves performance in this text mining application, terms that appear in many documents are weighted lower
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
#Set up pipeline for analysis: apply vectorizer and apply k-means algorithm
from sklearn.pipeline import Pipeline
n_clusters = 10 
pipeline = Pipeline([('feature_extraction', TfidfVectorizer(max_df=0.4)),
                                     ('clusterer', KMeans(n_clusters=n_clusters)) ])

In [33]:
#Ignore any word that occurs in more than 40% of docs above and , open documents
documents = [open(os.path.join(text_output_folder, filename)).read() for filename in os.listdir(text_output_folder)]

In [34]:
#Fit and predict the pipeline, followed the process number of times, do not give the target classes for our dataset to fit function, makes it unsupervised
pipeline.fit(documents)
labels = pipeline.predict(documents)

ValueError: max_df corresponds to < documents than min_df

In [35]:
#View how many samples were placed in each cluster using the Counter class
from collections import Counter
c = Counter(labels) 
for cluster_number in range(n_clusters): 
    print("Cluster {} contains {} samples".format(cluster_number, c[cluster_number]))

NameError: name 'labels' is not defined

In [36]:
#Find inertia of the algorithm, can be retrieved from any KMeans , it serves as the criterion it uses to develop the centroid, used to minimize the distance from each sample to the nearest centroid
pipeline.named_steps['clusterer'].inertia_

AttributeError: 'KMeans' object has no attribute 'inertia_'

In [37]:
#Create X Matrix,from text documents once per value of n_clusters to greatly improve the speed of the code
inertia_scores = [] 
n_cluster_values = list(range(2, 20)) 
for n_clusters in n_cluster_values: 
    cur_inertia_scores = [] 
    X = TfidfVectorizer(max_df=0.4).fit_transform(documents) 
    for i in range(10): 
        km = KMeans(n_clusters=n_clusters).fit(X) 
        cur_inertia_scores.append(km.inertia_) 
        inertia_scores.append(cur_inertia_scores)

ValueError: max_df corresponds to < documents than min_df

In [38]:
#Displays inertia scores of clusters
n_clusters

2

In [39]:
#Inertia decreases with reducing improvement as the number of clusters improves, examines increase of values between 6 to 7
n_clusters = 6 
pipeline = Pipeline([('feature_extraction', TfidfVectorizer(max_df=0.4)),
                     ('clusterer', KMeans(n_clusters=n_clusters)) ])
pipeline.fit(documents) 
labels = pipeline.predict(documents)

ValueError: max_df corresponds to < documents than min_df

In [40]:
#Displays n_clusters
n_clusters

6

In [41]:
#Extract term list from feature extraction step
terms = pipeline.named_steps['feature_extraction'].get_feature_names()

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [42]:
#Set up counter for counting size of each classes
c = Counter(labels)

NameError: name 'labels' is not defined

In [43]:
#Iterate over the most important terms for the cluster , take five largest values from the centroid, by finding features that have the highest values in the centroid
for cluster_number in range(n_clusters): 
    print("Cluster {} contains {} samples".format(cluster_number, c[cluster_number]))
    print(" Most important terms")
    centroid = pipeline.named_steps['clusterer'].cluster_centers_[cluster_number]
    most_important = centroid.argsort()
    for i in range(5):
        term_index = most_important[-(i+1)]
        print(" {0}) {1} (score: {2:.4f})".format(i+1, terms[term_index], centroid[term_index]))


NameError: name 'c' is not defined

In [44]:
#Call transform function on KMeans Instance
X = pipeline.transform(documents)

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [45]:
#Create a co-association matrix from array of labels by iterating over labels and records where two samples have the same label
#Use SciPy's csr_matrix, cluster the data
from scipy.sparse import csr_matrix

In [46]:
#Function definition takes a set of labels and records the rows and columns of each match, this is done in a list
#Csr_Matrix is a Sparce Matrix are sets of lists recording position of nonzero values, pair of samples with same label
import numpy as np
def create_coassociation_matrix(labels):
    rows = [] 
    cols = []
    unique_labels = set(labels) 
    for label in unique_labels:
        indices = np.where(labels == label)[0]
        for index1 in indices:
            for index2 in indices:
                rows.append(index1)
                cols.append(index2)
    data = np.ones((len(rows),)) 
    return csr_matrix((data, (rows, cols)), dtype='float')

In [47]:
#Get Co-association Matrix from labels
C = create_coassociation_matrix(labels)

NameError: name 'labels' is not defined

In [48]:
#Display Co-association Matrix
C

NameError: name 'C' is not defined

In [49]:
#Compute MST with the minimum_spanning_tree function in the sparse package
#MST is the spanning tree with lowest total weight, edges on the graph that connects all nodes together
from scipy.sparse.csgraph import minimum_spanning_tree

In [50]:
#The mst function is called directly on sparse matrix returned by co-association function
mst = minimum_spanning_tree(C)

NameError: name 'C' is not defined

In [51]:
#Higher values show higher clusters of values, though a minimum spanning tree, sees input as a distance , and higher scores penalized  so we compute the minimum spanning tree on the negation
mst = minimum_spanning_tree(-C)

NameError: name 'C' is not defined

In [52]:
#Remove any node with weight less than threshold. Iterate over edges in MST, and remove any that are less than specific value
#Create extra labels, then the co-association matrix, and then the two matrices, uses multiple iterations
pipeline.fit(documents) 
labels2 = pipeline.predict(documents) 
C2 = create_coassociation_matrix(labels2) 
C_sum = (C + C2) / 2

ValueError: max_df corresponds to < documents than min_df

In [53]:
#Compute the MST and remove and remove an edge that didnt occur in both labels
mst = minimum_spanning_tree(-C_sum) 
mst.data[mst.data > -1] = 0

NameError: name 'C_sum' is not defined

In [54]:
#The threshold value was negated in addition to the co-association matric, as the value we wanted was one with a value of 1
#Find the connected components, a way to get all of the samples that are connected by edges after removing the edges with low weights
from scipy.sparse.csgraph import connected_components 
number_of_clusters, labels = connected_components(mst)

NameError: name 'mst' is not defined

In [55]:
#Create clustering algorithm that performs all steps in EAC algorithm, creates basic structure of class using scikit learn
#The fit function performs the k means clusters a few times, combine with thee co association matrices for each iteration, done in a generator, to save memory and create them in the matrix
#Create new single k means run with our data set and co association matrix , use sum to add them together
#Single Clustering function designed to perform single iteration of k means on data and retrun predicted labels
#Randomly choose clusters and use Numpy randint function and nclustersrange parameter, sets range of possible values, cluster and predict data set using k means
#Fit Predict calls fit and returns labels for document
from sklearn.base import BaseEstimator, ClusterMixin 
class EAC(BaseEstimator, ClusterMixin):
    def __init__(self, n_clusterings=10, cut_threshold=0.5, n_clusters_range=(3, 10)): 
        self.n_clusterings = n_clusterings
        self.cut_threshold = cut_threshold
        self.n_clusters_range = n_clusters_range

    def fit(self, X, y=None):
        C = sum((create_coassociation_matrix(self._single_clustering(X))
                 for i in range(self.n_clusterings)))
        mst = minimum_spanning_tree(-C)
        mst.data[mst.data > -self.cut_threshold] = 0
        mst.eliminate_zeros()
        self.n_components, self.labels_ = connected_components(mst)
        return self
 
    def _single_clustering(self, X):
        n_clusters = np.random.randint(*self.n_clusters_range)
        km = KMeans(n_clusters=n_clusters)
        return km.fit_predict(X)
 
    def fit_predict(self, X):
        self.fit(X)
        return self.labels_

In [56]:
#Run on previous code by setting up a pipeline as before and using EAC by using a KMeans instance as final part of pipeline
pipeline = Pipeline([('feature_extraction', TfidfVectorizer(max_df=0.4)),
                     ('clusterer', EAC()) ])

In [57]:
#Create a matrix X by extracting features from data set using TfidfVectorizer and sample to incrementally update the model
vec = TfidfVectorizer(max_df=0.4) 
X = vec.fit_transform(documents)

ValueError: max_df corresponds to < documents than min_df

In [58]:
#Import MiniBatchKMeans and create instance of it
#Comes from scikit'learn package, this algorithm allows online learning, implements partial fit function, takes a set of samples and updates the model
#Follows clustering formats and removes previous training and refits the data
from sklearn.cluster import MiniBatchKMeans 
mbkm = MiniBatchKMeans(random_state=14, n_clusters=3)

In [59]:
#Randomly sample the X matrix to simulate data coming from an external source. Once data enters we update the model
batch_size = 10 
for iteration in range(int(X.shape[0] / batch_size)): 
    start = batch_size * iteration 
    end = batch_size * (iteration + 1) 
    mbkm.partial_fit(X[start:end])

NameError: name 'X' is not defined

In [60]:
#Get labels for original dataset by asking instance to predict
labels = mbkm.predict(X)

NameError: name 'X' is not defined

In [61]:
#Uses HashingVectorizer class to use hashing algorithms to drastially reduce the memory of computing bag of words model, only hashes are recorded
#Large number of 2 pow 18, using sparse matrices can place large value
labels

NameError: name 'labels' is not defined

In [62]:
#Pipeline class doesnt allow for online learning use
#Creates a subclass of Pipeline using this function
#Does all the transformation steps, calls partial fit on the final step which should be the classifier or clustering algorithm
class PartialFitPipeline(Pipeline):
    def partial_fit(self, X, y=None):
        Xt = X
        for name, transform in self.steps[:-1]:
            Xt = transform.transform(Xt)
        return self.steps[-1][1].partial_fit(Xt, y=y)

In [63]:
#Creates pipeline uses MiniBatchKMeans in online learning, with Hashing Vectorizer, aside from the new classes, using the same process from the chapter and fitting a few documents at once
from sklearn.feature_extraction.text import HashingVectorizer

pipeline = PartialFitPipeline([('feature_extraction', HashingVectorizer()),
                               ('clusterer', MiniBatchKMeans(random_state=14, n_clusters=3)) ])
batch_size = 10 
for iteration in range(int(len(documents) / batch_size)): 
    start = batch_size * iteration 
    end = batch_size * (iteration + 1)
    pipeline.partial_fit(documents[start:end]) 
labels = pipeline.predict(documents)

NotFittedError: This MiniBatchKMeans instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [64]:
#Displays the label
labels

NameError: name 'labels' is not defined