# **Blog Generation Project**

In [1]:
#lets start with importing libs
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx
import csv

In [2]:
!cd /content/drive/MyDrive/

In [4]:
# loading the dataset from a CSV file
df = pd.read_csv('/content/drive/MyDrive/medium_articles.csv')
df.head()


Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [6]:
df = df[['title','text']]
df.head()

Unnamed: 0,title,text
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage..."


In [7]:
df = df.drop_duplicates()
df.head()

Unnamed: 0,title,text
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage..."


In [8]:
df.shape

(190827, 2)

# Summary Generation

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
count = 0

def generateSummary(blog):
    global count
    count += 1
    print("Summarizing blog ", count)

    try:
        #Tokenize
        sentences = sent_tokenize(blog)

        #Cleaning
        sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in sentences]

        #remove stopwords
        stop_words = set(stopwords.words('english'))
        sentence_tokens = [[word for word in sentence.split() if word not in stop_words] for sentence in sentences_clean]

        #train Word2Vec model on sentence tokens
        w2v = Word2Vec(sentences=sentence_tokens, vector_size=100, min_count=1, epochs=100)

        #(average of word vectors in the sentence)
        sentence_embeddings = []
        for tokens in sentence_tokens:
            if len(tokens) > 0:
                sentence_vector = np.mean([w2v.wv[word] for word in tokens], axis=0)
            else:
                sentence_vector = np.zeros(100)
            sentence_embeddings.append(sentence_vector)

        #(cosine similarity between sentence embeddings)
        similarity_matrix = np.zeros((len(sentences), len(sentences)))
        for i in range(len(sentence_embeddings)):
            for j in range(len(sentence_embeddings)):
                if i != j:
                    similarity_matrix[i][j] = 1 - spatial.distance.cosine(sentence_embeddings[i], sentence_embeddings[j])

        #PageRank algorithm
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)

        #rank sentences
        ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

        #select top 25% sentences for summary
        sent_count = max(1, round(0.25 * len(sentences)))
        summary = " ".join([ranked_sentences[i][1] for i in range(sent_count)])

        return summary

    except Exception as e:
        print("An error occurred:", str(e))
        return float('NaN')

#sample
blog_text = """
Artificial intelligence (AI) is transforming the world as we know it. From self-driving cars to automated customer service, AI is becoming an essential part of various industries.
Its applications in healthcare are revolutionizing diagnosis and treatment, while AI-driven tools in education are enhancing personalized learning experiences.
AI is also playing a pivotal role in cybersecurity, helping to detect and mitigate threats in real time. As AI continues to evolve, it will likely become more integrated into our daily lives,
providing innovative solutions to complex problems. However, it also raises ethical concerns, such as the impact on jobs and privacy issues.
Balancing AI's benefits with responsible use will be crucial as we move forward in this new era of technology.
"""

summary = generateSummary(blog_text)
print("Summary:")
print(summary)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Summarizing blog  1
Summary:
As AI continues to evolve, it will likely become more integrated into our daily lives, 
providing innovative solutions to complex problems. AI is also playing a pivotal role in cybersecurity, helping to detect and mitigate threats in real time.


# writing summaries into csv file

In [12]:
import math
import csv

filename = "articlesSet.csv"
fields = ['title', 'summary', 'content']

with open(filename, 'a') as csvfile:

    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    def callback(row):
        summary = generateSummary(row['text'])
        if(type(summary) != str):
            return

        rows = [row['title'], summary, row['text']]
        csvwriter.writerow(rows)
    df.apply(callback, axis=1)


Summarizing blog  2
Summarizing blog  3
Summarizing blog  4
Summarizing blog  5
Summarizing blog  6
Summarizing blog  7
Summarizing blog  8


  dist = 1.0 - uv / math.sqrt(uu * vv)


An error occurred: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')
Summarizing blog  9
Summarizing blog  10
Summarizing blog  11
Summarizing blog  12
Summarizing blog  13
Summarizing blog  14
Summarizing blog  15
Summarizing blog  16
An error occurred: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')
Summarizing blog  17
Summarizing blog  18
Summarizing blog  19
Summarizing blog  20
Summarizing blog  21
Summarizing blog  22
Summarizing blog  23
Summarizing blog  24
An error occurred: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')
Summarizing blog  25
Summarizing blog  26
An error occurred: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')
Summarizing blog  27
Summarizing blog  28
An error occurred: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iter

KeyboardInterrupt: 

# Thats it for section 1 , see you guys in next section:


