<a href="https://colab.research.google.com/github/GuptaNavdeep1983/DeepLearningRepo/blob/main/CS_688_Assignment_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

https://datascience.stackexchange.com/questions/54904/how-to-avoid-tokenizing-w-sklearn-feature-extraction


https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

https://www.freecodecamp.org/news/an-introduction-to-bag-of-words-and-how-to-code-it-in-python-for-nlp-282e87a9da04/

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.transform

In [None]:
pip install gensim

In [87]:
import statsmodels.api as sm 
import pylab as py 
import matplotlib.pyplot as plt
import pandas as pd
import urllib.request
import datetime
from zipfile import ZipFile
from gzip import decompress
from json import loads
from requests import get
import requests, zipfile, io
from bs4 import BeautifulSoup
import math
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re

from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

In [82]:
def get_crowd_funding_data_links(filter_by_years):
    # get the content of the website
    content = urllib.request.urlopen('https://webrobots.io/indiegogo-dataset/').read()
    soup = BeautifulSoup(content,features='html.parser')
    
    all_stories = pd.DataFrame([], columns=['Year', 'Month', 'json', 'csv'])
    parent_div = soup.find(name='div', attrs={'class':'fusion-text'})

    # Iterate through the divs to find the list of Month-Year wise anchor tags
    index = 0
    for year_data in parent_div.find_all(name='ul'):
        all_months = year_data.find_all('li')
        for month in all_months:
            complete_text = month.text
            date_time_obj = datetime.datetime.strptime((complete_text.split('[')[0]).strip(), '%Y-%m-%d')
            if date_time_obj.year in filter_by_years:
                all_stories.loc[index, "Year"] = date_time_obj.year
                all_stories.loc[index, "Month"] = str(date_time_obj.month) if date_time_obj.month >=10 else f'0{date_time_obj.month}'
                all_stories.loc[index, "json"] = month.find_all('a')[0]["href"]
                all_stories.loc[index, "csv"] = month.find_all('a')[1]["href"]
                index = index + 1
    return all_stories

def get_current_index_data(df, indx, columns):
    r = requests.get(df["csv"][indx])
    z = zipfile.ZipFile(io.BytesIO(r.content))
    csv_data = z.open("Indiegogo.csv")
    data_df = pd.read_csv(csv_data, usecols=columns, dtype={'title':str, 'tagline':str})
    return data_df

In [84]:
df = get_crowd_funding_data_links([2020])

documents = []
for indx in df.head(5).index:
    current_month_data = get_current_index_data(df, indx, ["title", "tagline"])
    current_month_data.dropna(axis=0, inplace=True)
    only_taglines = current_month_data["tagline"]
    all_content = ''.join(str(e) for e in only_taglines)
    documents.append(all_content)

In [127]:
vectorizer = CountVectorizer(input='content',stop_words='english', analyzer='word', token_pattern=r'\b[a-zA-Z]{2,}\b',max_features=100)
df = pd.DataFrame(documents, columns=['content'])
vectorizer.fit_transform(df['content'])
doc_term_matrix = vectorizer.transform(df['content'])
feature_names = vectorizer.get_feature_names()
df_dictionary = pd.DataFrame(feature_names)
dictionary = df_dictionary.to_dict()[0]
df_doc_term_matrix = pd.DataFrame(doc_term_matrix.toarray())

In [111]:
def create_gensim_lsa_model(number_of_topics,words, dictionary, doc_term_matrix):
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

In [130]:
from sklearn.utils.extmath import randomized_svd

U, Sigma, VT = randomized_svd(doc_term_matrix, 
                              n_components=15,
                              n_iter=5,
                              random_state=42)

In [132]:
print(U)
print(Sigma)
print(VT)

[[ 0.44119157  0.33396529 -0.25739524  0.70795352 -0.35548093]
 [ 0.45334147 -0.21263494 -0.0274168   0.22701965  0.83485226]
 [ 0.44708    -0.32083231  0.77908288  0.00519094 -0.30031488]
 [ 0.44109815  0.709614    0.07723072 -0.53654338  0.08964934]
 [ 0.45319246 -0.48658829 -0.56573959 -0.39919688 -0.28005176]]
[14432.06974845   592.00897557   419.54608135   277.75737672
   184.56997074]
[[ 5.33105524e-02  4.16495387e-02  7.31670609e-02  8.16533354e-02
   1.08808563e-01  4.72136286e-02  7.66663040e-02  7.73980539e-02
   5.55769560e-02  5.72689155e-02  4.19836497e-02  1.40814176e-01
   8.21949133e-02  6.04172305e-02  4.15027052e-02  4.54405123e-02
   8.02186558e-02  5.47050780e-02  4.87292898e-02  7.68992059e-02
   9.30249633e-02  8.42743254e-02  4.31051945e-02  6.67835082e-02
   6.84453641e-02  4.20573434e-02  4.84213229e-02  4.94049825e-02
   4.79311410e-02  5.37512983e-02  4.51573105e-02  4.43856201e-02
   8.33111048e-02  6.85668848e-02  9.89530521e-02  5.40271790e-02
   7.4678910