#### Importing necessary library for RSS Feed Extraction

In [5]:
import pandas as pd
import feedparser
import re

##### Apply the fucntions to extract RSS Feed Data

In [13]:

url_reader = open("FeedUrl.txt","r")
url_links = url_reader.read().strip().split()

entire_data = open("entire_data.txt","a")

for i,url in enumerate(url_links):
    
    url_text_reader = open("{}".format(i+1),"r+")
    
    url_title = url_text_reader.read().strip().split()
    
    feed = feedparser.parse(url)
    
    for entry in feed.entries:
        
        Title = re.sub(r",","",entry.title)
        
        if Title in url_title:
            continue                                # check for duplicates
        
        Year = str(entry.published_parsed.tm_year)
        Month = str(entry.published_parsed.tm_mon)
        Day = str(entry.published_parsed.tm_mday)
        summary = re.sub(r",","",entry.summary)     # Removing commas from the text
        summary = re.sub(r"<[^>]*>","",summary)     # Removing tags 
        Summary = re.sub(r"\n"," ",summary)         # Removing New line characters
        
        url_text_reader.write(Title+"\n")
        entire_data.write(Year+","+Month+","+Day+","+Title+","+Summary+"\n")
        
        title_count += 1
        
    url_text_reader.close()

url_reader.close()
entire_data.close()


In [22]:
main_df = pd.read_csv("entire_data.txt",sep=",",header=None)
main_df.columns=["Year","Month","Day","Title","Summary"]

###### Removing the columns with empty data

In [21]:
main_df.isna().sum()

Year        0
Month       0
Day         0
Title       2
Summary    39
dtype: int64

In [105]:
Main_DF = main_df.dropna()
Main_DF.sample(5)

Unnamed: 0,Year,Month,Day,Title,Summary
1276,2016,12,12,NIPS 2016 Reflections,It was a great conference. The organizers had...
534,2017,8,25,Biggish time series data,Informal presentation for a UNSW research grou...
353,2018,12,16,Amazon RDS Oracle Instance Running Out of Disc...,pre{border: 2px solid #666; padding: 10px; bac...
513,2018,3,13,IJF Tao Hong Award 2018,Every two years the International Journal of F...
705,2015,4,10,Feeling the FPP love,It is now exactly 12 months since the print ve...


###### Joining Title and summary for KeyPharse extraction

In [106]:
Main_DF["Text"] = Main_DF["Title"]+" "+ Main_DF["Summary"]
Main_DF.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Main_DF["Text"] = Main_DF["Title"]+" "+ Main_DF["Summary"]


Unnamed: 0,Year,Month,Day,Title,Summary,Text
411,2020,10,26,Call for papers: Innovations in hierarchical f...,There is a new call for papers for a special i...,Call for papers: Innovations in hierarchical f...
1200,2020,10,6,Machine Learning Week 2021 Call for Speakers,Copyright © 2021 https://jtonedm.com James Tay...,Machine Learning Week 2021 Call for Speakers C...
507,2018,4,23,Upcoming talks: May-July 2018,First semester teaching is nearly finished and...,Upcoming talks: May-July 2018 First semester t...
412,2020,10,21,Model selection in reconciling hierarchical ti...,Model selection has been proven an effective s...,Model selection in reconciling hierarchical ti...
554,2017,4,30,Converting to blogdown,This website has gone through several major up...,Converting to blogdown This website has gone t...


##### Import packages for Key-Phrase extraction

In [107]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import wordnet
from yake import yake

###### Making Lemmatization as custom fuction as it is often used

In [108]:
def is_not_numeric(x):
    try:
        float(x)
        return False
    except:
        return True
    pass

stop_words = set(stopwords.words("english")).union("nbsp","ldquo","nuitblog","linkedin","post","event","amp","lot","working","quot","paper","article","job","demand")
lemmar = wordnet.WordNetLemmatizer()

def lemmatizer(string_line):

    word = word_tokenize(string_line.lower())
    words = [x for x in word if (x not in stop_words and len(x) > 1 and is_not_numeric(x))]

    lemma_word = ""

    for word in words:
        lemma = lemmar.lemmatize(word)
        lemma_word += lemma + " "
    
    return lemma_word

###### Applying Lemmatixation for Title and Summary together i.e. Text column

In [109]:
Main_DF["Lemmatized"] = Main_DF["Text"].apply(lambda x : lemmatizer(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Main_DF["Lemmatized"] = Main_DF["Text"].apply(lambda x : lemmatizer(x))


In [110]:
Main_DF.sample(10)

Unnamed: 0,Year,Month,Day,Title,Summary,Text,Lemmatized
991,2010,3,21,My standard LaTeX preamble,When I was a PhD student I found I needed a lo...,My standard LaTeX preamble When I was a PhD st...,standard latex preamble phd student found need...
1298,2021,4,6,The $1000 GPT-3,**&nbsp;Nuit Blanche is now on Twitter: @NuitB...,The $1000 GPT-3 **&nbsp;Nuit Blanche is now on...,gpt-3 nbsp nuit blanche twitter nuitblog nbsp ...
917,2011,5,29,Comparing HoltWinters() and ets(),I received this email today: I have a questio...,Comparing HoltWinters() and ets() I received t...,comparing holtwinters ets received email today...
259,2021,3,9,How to write a research grant proposal?,Today I will discuss how to write a good resea...,How to write a research grant proposal? Today ...,write research grant proposal today discus wri...
696,2015,5,31,A new R package for detecting unusual time series,The anomalous package provides some tools to d...,A new R package for detecting unusual time ser...,new package detecting unusual time series anom...
930,2011,1,11,Six places left for the forecasting workshop,There are six places left for the forecasting ...,Six places left for the forecasting workshop T...,six place left forecasting workshop six place ...
444,2020,1,3,FFORMA: Feature-based Forecast Model Averaging,We propose an automated method for obtaining w...,FFORMA: Feature-based Forecast Model Averaging...,fforma feature-based forecast model averaging ...
1270,2017,2,16,Software Engineering vs Machine Learning Concepts,Not all core concepts from software engineerin...,Software Engineering vs Machine Learning Conce...,software engineering v machine learning concep...
909,2011,8,12,Beware of junk journals and publishers,Today I received the following email: Dear Pr...,Beware of junk journals and publishers Today I...,beware junk journal publisher today received f...
1313,2020,3,14,Au Revoir Backprop ! Bonjour Optical Transfer ...,**&nbsp;Nuit Blanche is now on Twitter: @NuitB...,Au Revoir Backprop ! Bonjour Optical Transfer ...,au revoir backprop bonjour optical transfer le...


In [111]:
Main_DF.reset_index(inplace=True)

In [112]:
Main_DF.drop(columns="index")

Unnamed: 0,Year,Month,Day,Title,Summary,Text,Lemmatized
0,2021,6,25,Text Preprocessing in NLP with Python codes,ArticleVideo Book This article was published a...,Text Preprocessing in NLP with Python codes Ar...,text preprocessing nlp python code articlevide...
1,2021,6,25,Part 14: Step by Step Guide to Master NLP – Ba...,ArticleVideo Book This article was published a...,Part 14: Step by Step Guide to Master NLP – Ba...,part step step guide master nlp basic topic mo...
2,2021,6,25,Part 13: Step by Step Guide to Master NLP – Re...,ArticleVideo Book This article was published a...,Part 13: Step by Step Guide to Master NLP – Re...,part step step guide master nlp regular expres...
3,2021,6,25,Decide Best Learning Rate with LearningRateSch...,ArticleVideo Book This article was published a...,Decide Best Learning Rate with LearningRateSch...,decide best learning rate learningrateschedule...
4,2021,6,25,Generate Reports Using Pandas Profiling Deploy...,ArticleVideo Book This article was published a...,Generate Reports Using Pandas Profiling Deploy...,generate report using panda profiling deploy u...
...,...,...,...,...,...,...,...
1460,2021,4,20,FREE Report: 2021 Business at the Speed of AI ...,DOWNLOAD,FREE Report: 2021 Business at the Speed of AI ...,free report business speed ai report download
1461,2021,4,16,Gradient Flow Snapshot #50: Data Engineering j...,Subscribe to our Newsletter YouTube channel a...,Gradient Flow Snapshot #50: Data Engineering j...,gradient flow snapshot data engineering job u....
1462,2021,4,15,How Technology Companies Are Using Ray,The Data Exchange Podcast: Zhe Zhang describes...,How Technology Companies Are Using Ray The Dat...,technology company using ray data exchange pod...
1463,2021,4,13,FREE Report: 2020 NLP Industry Survey Report,DOWNLOAD,FREE Report: 2020 NLP Industry Survey Report D...,free report nlp industry survey report download


In [113]:
Main_DF.groupby("Year").count()

Unnamed: 0_level_0,index,Month,Day,Title,Summary,Text,Lemmatized
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,6,6,6,6,6,6,6
1991,1,1,1,1,1,1,1
1992,2,2,2,2,2,2,2
1993,1,1,1,1,1,1,1
1994,1,1,1,1,1,1,1
1995,2,2,2,2,2,2,2
1996,4,4,4,4,4,4,4
1997,3,3,3,3,3,3,3
1998,1,1,1,1,1,1,1
1999,1,1,1,1,1,1,1


##### Key-Phrase Extraction for Text column

In [114]:
lemma_word = ""
key_phrases = []

for sentence in Main_DF["Lemmatized"]:
    lemma_word += sentence

lemma_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, dedupFunc="seqm", windowsSize=1, top=200,
                                        features=None)
keywords = lemma_extractor.extract_keywords(lemma_word)

for kw in keywords:
    print(kw)

('nbsp nbsp', 1.765922954459206e-05)
('time series', 3.557459012742076e-05)
('data', 5.007585605936319e-05)
('data science', 6.20800018302835e-05)
('nbsp', 7.030222167434287e-05)
('machine learning', 7.42260400834347e-05)
('time', 0.00015257647245884033)
('data analysis', 0.0001772436598306144)
('learning', 0.0001955913271537126)
('forecasting', 0.0002083998877496126)
('model', 0.00021070380558534875)
('series', 0.00024313994114023513)
('series data', 0.00024478303786162935)
('data scientist', 0.0002798799764738667)
('series forecasting', 0.0003178309672140404)
('quot quot', 0.00037525489736537964)
('learning nbsp', 0.00038509694673104856)
('forecast', 0.00042972288920793407)
('paper', 0.00044164457048107415)
('research', 0.00045526290993114926)
('science', 0.0004574871768061947)
('machine', 0.00048667082876349335)
('deep learning', 0.000520589089552624)
('quot', 0.0005398122023998585)
('post', 0.0005611069101984513)
('method', 0.0005737453883019756)
('data time', 0.0005772749748366833

#### There is a lot of unneccessary key-Phrases, this is due use of verbal text present in sumaary
#### To avoid this apply Key-Phrase on Title only

In [115]:
Main_DF["Title_Lemmatized"] = Main_DF["Title"].apply(lambda x : lemmatizer(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Main_DF["Title_Lemmatized"] = Main_DF["Title"].apply(lambda x : lemmatizer(x))


###### Key-Phrasing on Title

In [116]:
lemma_word = ""

for sentence in Main_DF["Title_Lemmatized"]:
    lemma_word += sentence

lemma_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, dedupFunc="seqm", windowsSize=1, top=500,
                                        features=None)
keywords = lemma_extractor.extract_keywords(lemma_word)

for kw in keywords:
    print(kw)

('time series', 1.8339022913673134e-05)
('data science', 0.00010874918122059422)
('series forecasting', 0.00013092487105765638)
('machine learning', 0.00013668020837973073)
('series data', 0.0001732422081483741)
('data', 0.00024101550479053093)
('data scientist', 0.0003460424561280207)
('forecasting', 0.00035356665221525845)
('series analysis', 0.000362186316833339)
('data analysis', 0.00037563527957470424)
('series', 0.0003791584283542332)
('time', 0.00039186049140560583)
('forecasting competition', 0.0004113963743720885)
('functional data', 0.0004974717503199238)
('forecasting time', 0.0006087185324095115)
('deep learning', 0.0006258922043444288)
('functional time', 0.0006327222321257903)
('data analytics', 0.0006687841260483232)
('learning', 0.0007998005160761186)
('series model', 0.0008280506491949598)
('big time', 0.0008433954072446943)
('model', 0.0008959487953709055)
('data model', 0.0009366113550205776)
('big data', 0.000943081020644543)
('data forecasting', 0.00097161268510653

Storing the values of key-phrase , which can be later used for client suggestions

In [117]:
key_phrases = []

for kw in keywords:
    key_phrases.append(kw[0])

key_phrases

['time series',
 'data science',
 'series forecasting',
 'machine learning',
 'series data',
 'data',
 'data scientist',
 'forecasting',
 'series analysis',
 'data analysis',
 'series',
 'time',
 'forecasting competition',
 'functional data',
 'forecasting time',
 'deep learning',
 'functional time',
 'data analytics',
 'learning',
 'series model',
 'big time',
 'model',
 'data model',
 'big data',
 'data forecasting',
 'hierarchical time',
 'data visualization',
 'learning model',
 'forecast reconciliation',
 'forecast',
 'series forecast',
 'feature-based time',
 'energy forecasting',
 'forecasting model',
 'forecasting workshop',
 'job data',
 'science',
 'hierarchical forecasting',
 'business analytics',
 'analytics',
 'automatic time',
 'arima model',
 'data management',
 'probabilistic forecasting',
 'analytics data',
 'machine',
 'high-dimensional time',
 'electricity demand',
 'data mining',
 'forecasting big',
 'forecast hierarchical',
 'forecast package',
 'tourism forecastin

### Packages of Unsupervised Sentimental Analysis

In [118]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

Consideering user would search for python doing Sentimental Analysis

In [119]:
choice = "python"

New_df = Main_DF[Main_DF["Title_Lemmatized"].str.contains(choice)]

for text in New_df["Lemmatized"]:
    print(text, TextBlob(text).sentiment.polarity)
    sid = SentimentIntensityAnalyzer()
    print()
    print(text, sid.polarity_scores(text))
    print("\n\n\n")

text preprocessing nlp python code articlevideo book article published part data science blogathon introduction natural language processing nlp branch data science ... post text preprocessing nlp python code appeared first analytics vidhya  0.175

text preprocessing nlp python code articlevideo book article published part data science blogathon introduction natural language processing nlp branch data science ... post text preprocessing nlp python code appeared first analytics vidhya  {'neg': 0.0, 'neu': 0.925, 'pos': 0.075, 'compound': 0.3612}




plotly cufflink advanced python data visualization library articlevideo book article published part data science blogathon introduction data visualization help bridge gap number ... post plotly cufflink advanced python data visualization library appeared first analytics vidhya  0.35000000000000003

plotly cufflink advanced python data visualization library articlevideo book article published part data science blogathon introduction data visua


beginner python tutorial analyze personal netflix data much time spent watching office netflix find entry-level tutorial analyzing netflix usage data post beginner python tutorial analyze personal netflix data appeared first dataquest  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}




v python data analysis objective comparison python vs. better data science compare two language side side see python perform analysis step post v python data analysis objective comparison appeared first dataquest  0.1875

v python data analysis objective comparison python vs. better data science compare two language side side see python perform analysis step post v python data analysis objective comparison appeared first dataquest  {'neg': 0.0, 'neu': 0.906, 'pos': 0.094, 'compound': 0.4404}




long take learn python ready learn python data science right program habit structure master quickly might think post long take learn python appeared first dataquest  0.1615079365079365

long take learn py

#### Trying to build supervised Setimental Analyzer using Gaussian Naive Base Classifier
Use Resturant Review dataset to train the model

In [120]:
import numpy as np

dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [lemmar.lemmatize(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)
print(corpus)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.075, random_state = 0)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]]
[[19 19]
 [ 2 35]]


0.72

#### Trying to apply the supervised model on RSS Feed

In [121]:
cv = CountVectorizer(max_features = 2500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

classifier = GaussianNB()
classifier.fit(X, y)


for text in New_df["Lemmatized"]:
    cv = CountVectorizer(max_features = 5000)
    X_text = cv.fit_transform([text])
    print(X_text)
    y_pred = classifier.predict(X_text.toarray())
    print(text, y_pred)

  (0, 21)	2
  (0, 16)	2
  (0, 13)	3
  (0, 19)	2
  (0, 7)	2
  (0, 3)	1
  (0, 5)	1
  (0, 2)	1
  (0, 18)	1
  (0, 14)	1
  (0, 8)	2
  (0, 20)	2
  (0, 4)	1
  (0, 10)	1
  (0, 12)	1
  (0, 11)	1
  (0, 17)	1
  (0, 6)	1
  (0, 15)	1
  (0, 1)	1
  (0, 9)	1
  (0, 0)	1
  (0, 22)	1


ValueError: operands could not be broadcast together with shapes (1,23) (1767,) 

### Got wrong values , this is because Resturant review Dataset is completely from RSS Feed data
### It will be impossible to broadcast on over the other. Its not just resturant review but any other dataset will give this un-acceptable results. the only dataset that could applied would be RSS FEED Dataset(which is not available).
## Planning to unsupervised Approach only.