# Visualize and Analyze Startup Data

## 1. Setup

To prepare your environment, you need to install some packages and enter credentials for the Watson services.

## 1.1 Install the necessary packages

### Install Watson Developer Cloud Package

In [None]:
!pip install watson-developer-cloud==1.5

### Install IBM Database Server Python Package

In [None]:
!pip install ibm_db

### Install Other Packages

In [None]:
!pip install cssselect

In [None]:
!pip install fake-useragent

## 1.2 Import packages and libraries

Import the packages and libraries that you'll use:

In [None]:
from bs4 import BeautifulSoup
import requests
import time
from random import randint
from IPython.display import display, HTML
#import selenium
#from selenium import webdriver
import re
import pandas as pd

from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
  import Features, EntitiesOptions, KeywordsOptions, SemanticRolesOptions, SentimentOptions, EmotionOptions, ConceptsOptions, CategoriesOptions

import ibm_boto3
from botocore.client import Config


import json
import nltk
import csv
import ibm_db
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
from io import BytesIO

from urllib.parse import urlencode, urlparse, parse_qs

from lxml.html import fromstring
from requests import get
from fake_useragent import UserAgent

## 2. Configuration

Add configurable items of the notebook below

## 2.1 Add your service credentials from IBM Natural Language Understanding service
You must create a Watson Natural Language Understanding service on IBM Cloud. Create a service for Natural Language Understanding (NLU). Insert the apikey and url values in the variables, for your NLU in the following cell. Do not change the values of the version fields.

Run the cell.

In [None]:
# Watson NLU credentials: paste the apikey and url of your IBM Cloud service here.
apikey = ''
url = ''

# Shared NLU client used by analyze_using_NLU(); leave the version date unchanged.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    url=url,
    iam_api_key=apikey,
    version='2018-03-16',
)

## 2.2 Add your service credentials for DB2

Insert the DB2 service credentials as credentials_1 in the following cell.

In [None]:

# @hidden_cell
# The following code contains the credentials for a connection in your Project.
# You might want to remove those credentials before you share your notebook.
# The Db2 connection cell further down reads the 'db', 'host', 'username' and
# 'password' keys of this dict, so the pasted credentials must contain them.
credentials_1 = {
}
    



## 3. Scrape Startup Information

Scrapes Google search results for each startup to answer the following:

* How many times it has appeared on News?
* Whether it has a Wikipedia page or not?
* Whether they have Tech blogs or not?
* Whether they are active on Social Media (Twitter, Medium, etc..)?


### Insert Pandas Dataframe of the `companies_list.json` file

Ensure the dataframe is saved as `df_data_1`

In [None]:
# One entry per row of the 'companies' column. Each entry is iterated with
# .items() further down, so it is expected to be a dict.
company_list_final = [entry for entry in df_data_1['companies']]
company_list_final

In [None]:

def scrape_news_summaries_google(s, number_result=10, timeout=30):
    """Scrape one page of Google search results for the query ``s``.

    Parameters
    ----------
    s : str
        Search query. It is URL-encoded by ``requests`` before being sent, so
        quotes, spaces, '&' or '#' in the query no longer corrupt the URL.
    number_result : int, optional
        Value sent as Google's ``num`` parameter. Defaults to 10.
    timeout : float, optional
        Seconds to wait for the response before ``requests`` raises a timeout
        error instead of blocking the notebook forever. Defaults to 30.

    Returns
    -------
    list of dict
        One ``{'news_link': <href>, 'summary': <description text>}`` dict per
        result block that has a link, a non-empty title and a non-empty
        description. Blocks missing any of these are skipped.
    """
    # The original passed a {"User-Agent": ...} dict as the second positional
    # argument of requests.get, which is `params`, so it was sent as a query
    # parameter and never as a header. It is dropped here.
    # NOTE(review): the CSS classes below were presumably written against the
    # markup Google serves to the default requests User-Agent; confirm the
    # selectors still match before adding a browser User-Agent header.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": s, "num": number_result},
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, "html.parser")

    news_items = []
    for result in soup.find_all('div', attrs={'class': 'ZINbbc'}):
        link = result.find('a', href=True)
        title_div = result.find('div', attrs={'class': 'vvjwJb'})
        description_div = result.find('div', attrs={'class': 's3v9rd'})

        # find() returns None for a missing element; skip incomplete blocks
        # explicitly instead of relying on a bare `except:` to swallow the error.
        if link is None or title_div is None or description_div is None:
            continue

        title = title_div.get_text()
        description = description_div.get_text()
        if title != '' and description != '':
            news_items.append({'news_link': link['href'], 'summary': description})
    return news_items

In [None]:
# Build one single-key dict per company:
#   {company name: {'Description', 'Company_Link', 'News_Items'}}
final_rows = []
for c in company_list_final:
    for key, value in c.items():
        s = '"' + key + '"' + 'company economic times'
        scraped = scrape_news_summaries_google(s)
        final_rows.append({
            key: {
                'Description': value[0],
                'Company_Link': value[1],
                'News_Items': [] + scraped,
            }
        })

In [None]:
final_rows[0]

In [None]:
def split_sentences(text):
    """Cut ``text`` into sentence fragments.

    The text is split at every '[', ']', newline, '.', '!' or '?'. The
    delimiters themselves are dropped and empty fragments are kept.
    """
    return re.split(u'[\\[\\]\n.!?]', text)

def split_into_tokens(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the result."""
    return nltk.word_tokenize(text)

def load_string(fileobject):
    """Read all of ``fileobject`` and return its bytes decoded as UTF-8."""
    return fileobject.read().decode('utf-8')

def POS_tagging(text):
    """Return the part-of-speech tags ``nltk.tag.pos_tag`` produces for ``text``.

    resolve_coreference calls this with the token list returned by
    split_into_tokens.
    """
    return nltk.tag.pos_tag(text)

def resolve_coreference(text, config):
    """ Resolve coreferences in the text for Nouns that are Subjects in a sentence

    The text is split into sentences, and each sentence is tokenized,
    POS-tagged and chunked. The chunking rules from ``config`` then decide
    whether the sentence is copied to the output and whether a pronoun chunk
    is replaced by the most recently remembered subject.

    Parameters:
        text: the text to process (passed to split_sentences).
        config: a JSON string. Only
            ``configuration -> coreference -> rules`` is read; a rule with
            ``type == 'chunking'`` must carry a ``chunk`` list whose items
            have a ``tag`` and a ``pattern`` (the pattern is used as the
            chunk grammar by chunk_tagging).

    Returns:
        A string made of the selected sentences, each followed by '. '.
        Sentences that match neither the 'PRP' nor the 'NAME' branch below
        are not included, and a sentence is appended once per matching chunk.
    """
    sentenceList = split_sentences(text)
    # Last 'NAME' chunk that was equal to a sentence subject; substituted for
    # pronoun chunks in later sentences.
    referenceSubject = ''
    sentenceText = ''
    configjson = json.loads(config)
    
    for sentences in sentenceList:    
        tokens = split_into_tokens(sentences)   
        postags = POS_tagging(tokens)
        sentencetags = chunk_sentence(postags)
        # find_subject returns None when the sentence has no 'NP' chunk, and a
        # string (possibly '') built by find_attrs otherwise.
        subjects = find_subject(sentencetags)
        for rules in configjson['configuration']['coreference']['rules']:
            if (rules['type'] == 'chunking'):
                for tags in rules['chunk']:
                    chunktags = chunk_tagging(tags['tag'],tags['pattern'],postags)
                    if (len(chunktags)>0):
                        for words in chunktags:
                            if tags['tag'] == 'PRP':
                                # Only sentences whose subject string is empty
                                # get the pronoun chunk replaced.
                                if subjects == '':
                                    sentenceText = sentenceText+sentences.replace(words,referenceSubject)+'. '
                            elif tags['tag'] == 'NAME':
                                # A name chunk equal to the subject becomes the
                                # new reference; the sentence is kept as is.
                                if words == subjects:
                                    referenceSubject = words
                                    sentenceText = sentenceText+sentences+'. '
                    
    return sentenceText

def chunk_sentence(text):
    """Chunk a POS-tagged sentence with a fixed NLTK regular-expression grammar.

    ``text`` is handed to ``RegexpParser.parse``; resolve_coreference calls
    this with the output of POS_tagging. The parser is built with ``loop=2``
    and the resulting parse is returned.
    """
    grammar = """
      NP: {<DT|JJ|PRP|NN.*>+} # Chunk sequences of DT,JJ,NN
          #}<VB*|DT|JJ|RB|PRP><NN.*>+{  # Chink sequences of VB,DT,JJ,NN       
      PP: {<IN><NP>}               # Chunk prepositions followed by NP
      V: {<V.*>}                   # Verb      
      VP: {<VB*><NP|PP|CLAUSE>+}  # Chunk verbs and their arguments
      CLAUSE: {<NP><VP>}           # Chunk NP, VP
      """
    return nltk.RegexpParser(grammar, loop=2).parse(text)

def find_subject(t):
    """Return find_attrs() of the first subtree of ``t`` labelled 'NP'.

    Returns None when ``t`` has no such subtree.
    """
    first_np = next(iter(t.subtrees(lambda node: node.label() == 'NP')), None)
    if first_np is None:
        return None
    return find_attrs(first_np, 'NP')
    
def find_attrs(subtree, phrase):
    """Concatenate selected words of a noun-phrase subtree.

    For ``phrase == 'NP'`` every node of ``subtree`` whose second item (the
    tag) is in the accepted list contributes ``' ' + word`` to the result, so
    a non-empty result starts with a space. Any other ``phrase`` gives ''.
    """
    if phrase != 'NP':
        return ''
    accepted_tags = ['DT', 'PRP$', 'POS', 'JJ', 'CD', 'ADJP', 'QP', 'NP', 'NNP']
    return ''.join(' ' + node[0] for node in subtree if node[1] in accepted_tags)

def chunk_tagging(tag, chunk, text):
    """Chunk ``text`` with the grammar ``chunk`` and collect chunks labelled ``tag``.

    Only top-level children of the parse are inspected. Returns a list with
    one string per matching chunk; each string is the chunk's words, every
    word preceded by a single space.
    """
    parsed = nltk.RegexpParser(chunk).parse(text)
    return [
        ''.join(' ' + leaf[0] for leaf in node)
        for node in parsed
        if isinstance(node, nltk.tree.Tree) and node.label() == tag
    ]

def analyze_using_NLU(analysistext):
    """Analyze ``analysistext`` with the shared Watson NLU client.

    Sentiment, entities, keywords, emotion, concepts and categories are
    requested with their default options. Returns ``{'results': response}``,
    where ``response`` is whatever the client's ``analyze`` call returned.
    """
    requested_features = Features(
        sentiment=SentimentOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
        emotion=EmotionOptions(),
        concepts=ConceptsOptions(),
        categories=CategoriesOptions(),
    )
    nlu_response = natural_language_understanding.analyze(
        text=analysistext,
        features=requested_features,
    )
    return {'results': nlu_response}



In [None]:
def hasET(company_name):
    """Search Google for '"<company_name>" economic times' and return the scraped results."""
    query = '"' + company_name + '"' + ' economic times'
    return scrape_news_summaries_google(query)

def hasTwitter(company_name):
    """Search Google for '"<company_name>" twitter' and return the scraped results."""
    query = '"' + company_name + '"' + ' twitter'
    return scrape_news_summaries_google(query)


def getTechAreaNews(article_text):
    """Return the top NLU concept, its relevance and the document sentiment.

    Parameters
    ----------
    article_text : str
        Text to analyse. Texts of 15 characters or fewer are not sent to NLU.

    Returns
    -------
    tuple
        ``(concept, relevance, sentiment)``. Each element is '' when it is not
        available: the text was too short, NLU returned no concepts, or the
        sentiment entry of the response was empty.
    """
    concept = ''
    relevance = ''
    # Initialised up front: without this default the final return raised
    # UnboundLocalError whenever the text was too short to be analysed or the
    # response carried an empty sentiment entry.
    sentiment = ''
    if len(article_text) > 15:
        results = analyze_using_NLU(article_text)['results']

        if len(results['concepts']) != 0:
            top_concept = results['concepts'][0]
            concept = top_concept['text']
            relevance = top_concept['relevance']
        if len(results['sentiment']) != 0:
            sentiment = results['sentiment']['document']['label']
    return concept, relevance, sentiment

def getTechArea(article_text):
    """Return ``(concept, relevance)`` of the top NLU concept for ``article_text``.

    Texts of 15 characters or fewer are not analysed. ('', '') is returned for
    them and for responses that contain no concepts.
    """
    if len(article_text) <= 15:
        return '', ''
    concepts = analyze_using_NLU(article_text)['results']['concepts']
    if len(concepts) == 0:
        return '', ''
    return concepts[0]['text'], concepts[0]['relevance']

def hasWiki(s, timeout=30):
    """Fetch the English Wikipedia search page for ``s`` and return its HTML.

    Parameters
    ----------
    s : str
        Search term, typically a company name.
    timeout : float, optional
        Seconds to wait for Wikipedia before ``requests`` raises a timeout
        error. Defaults to 30.

    Returns
    -------
    str
        The body of the response as text. The HTTP status code is printed as
        a side effect.
    """
    # Let requests build the query string: it escapes every reserved character
    # (the original only replaced spaces, so '&' or '#' in a name cut the
    # search term short). Spaces are still sent as '+'.
    r = requests.get(
        'https://en.wikipedia.org/w/index.php',
        params={'search': s, 'title': 'Special:Search', 'go': 'Go'},
        timeout=timeout,
    )
    print(r.status_code)
    return r.text
    

## 3.1 Collect Wiki

Collects info on how many Companies have an existing Wikipedia page

In [None]:
# For every company, run NLU on its Wikipedia search page and keep the top concept.
wikiList = []
for f in final_rows:
    for name in f.keys():
        wiki_concept, wiki_confidence = getTechArea(hasWiki(name))
        wikiList.append({
            'Company_Name': name,
            'Wiki_Concept': wiki_concept,
            'Wiki_Confidence': wiki_confidence,
        })
wikiList

In [None]:
# Fixed header (same columns, same order as the dicts in wikiList) so the file
# is written with its columns even when wikiList is empty; reading the header
# from wikiList[0] raised IndexError in that case.
keys = ['Company_Name', 'Wiki_Concept', 'Wiki_Confidence']
# newline='' is what the csv module asks for; without it extra blank rows can
# appear on platforms that translate line endings.
with open('Wiki.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(wikiList)

## 3.2 Collect ET

Collects, from Google search results, how many hits for a particular company are Economic Times (ET) articles

In [None]:
# Keep only Google hits that have a summary and whose link contains
# 'economictimes', and annotate each with the NLU concept and sentiment.
ET = []
for f in final_rows:
    for name in f.keys():
        for n in hasET(name):
            link = n['news_link']
            if 'summary' not in n or 'economictimes' not in link:
                continue
            news_concept, news_relevance, news_sentiment = getTechAreaNews(n['summary'])
            ET.append({
                'Company_Name': name,
                'News_Link': link,
                'News_Concept': news_concept,
                'News_Relevance': news_relevance,
                'News_Sentiment': news_sentiment,
            })
ET

In [None]:
# Fixed header (same columns, same order as the dicts in ET). No Economic
# Times hit at all leaves ET empty; reading the header from ET[0] then raised
# IndexError and left no file with columns for the cells that read it back.
keys = ['Company_Name', 'News_Link', 'News_Concept', 'News_Relevance', 'News_Sentiment']
# newline='' is what the csv module asks for; without it extra blank rows can
# appear on platforms that translate line endings.
with open('ET_final.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(ET)

## 3.3 Collect Tech Area

Suggests the major tech area of a company

In [None]:
# Run NLU on each company's description and keep the top concept as its technology.
tech_area = []
for f in final_rows:
    for name, info in f.items():
        print(info)
        technology, technology_relevance = getTechArea(info['Description'])
        tech_area.append({
            "Company_Name": name,
            "Technology": technology,
            "Technology_Relevance": technology_relevance,
        })

In [None]:
tech_area

In [None]:
# Fixed header (same columns, same order as the dicts in tech_area) so the
# file is written with its columns even when tech_area is empty; reading the
# header from tech_area[0] raised IndexError in that case.
keys = ['Company_Name', 'Technology', 'Technology_Relevance']
# newline='' is what the csv module asks for; without it extra blank rows can
# appear on platforms that translate line endings.
with open('Tech_Area_Final.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(tech_area)

## 3.4 Collect Twitter

Collects info on how many Tweets appear on Google Search of a Company

In [None]:
# Keep only Google hits that have a summary and whose link points at the
# company's own Twitter profile, and annotate each with NLU results.
Twitter = []
for f in final_rows:
    for name in f.keys():
        # Twitter handles are case-insensitive, so both sides of the comparison
        # are lowercased. The original lowercased only the company name, which
        # made links such as twitter.com/Acme never match.
        profile_fragment = 'twitter.com/' + name.lower()
        for n in hasTwitter(name):
            link = n['news_link']
            if 'summary' not in n or profile_fragment not in link.lower():
                continue
            twitter_topic, twitter_relevance, twitter_sentiment = getTechAreaNews(n['summary'])
            # The link is stored as scraped, not lowercased.
            Twitter.append({
                'Company_Name': name,
                'Twitter_news_link': link,
                'Twitter_Topic': twitter_topic,
                'Twitter_Relevance': twitter_relevance,
                'Twitter_Sentiment': twitter_sentiment,
            })
Twitter

In [None]:
# Fixed header (same columns, same order as the dicts in Twitter). No matching
# profile link leaves Twitter empty; reading the header from Twitter[0] then
# raised IndexError and left no file with columns for the cells that read it back.
keys = ['Company_Name', 'Twitter_news_link', 'Twitter_Topic', 'Twitter_Relevance', 'Twitter_Sentiment']
# newline='' is what the csv module asks for; without it extra blank rows can
# appear on platforms that translate line endings.
with open('Twitter.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(Twitter)

## 4. Combine the Results and Save to SPSS

In [None]:
# Read the four intermediate CSV files back as DataFrames. Each file is opened
# in a `with` block so its handle is closed again; the original left all four
# handles open. Note that ET and Twitter are rebound from lists to DataFrames.
with open("Tech_Area_Final.csv", "r") as csv_file:
    Tech_area = pd.read_csv(csv_file, delimiter=',')
with open("ET_final.csv", "r") as csv_file:
    ET = pd.read_csv(csv_file, delimiter=',')
with open("Wiki.csv", "r") as csv_file:
    Wiki = pd.read_csv(csv_file, delimiter=',')
with open("Twitter.csv", "r") as csv_file:
    Twitter = pd.read_csv(csv_file, delimiter=',')

In [None]:
# Stack the Wikipedia rows under the ET rows. DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0; pd.concat gives the same result.
a = pd.concat([ET, Wiki])

In [None]:
# Stack the Twitter rows under the combined rows. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat gives the same result.
b = pd.concat([a, Twitter])

In [None]:
b

In [None]:
# Attach each company's technology columns to every collected row (join on Company_Name).
compiled_rows = b.merge(Tech_area, on="Company_Name")

In [None]:
compiled_rows

In [None]:
compiled_rows

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that is chained assignment, which pandas warns about and
# which does not update compiled_rows when copy-on-write is enabled.
compiled_rows['Wiki_Confidence'] = compiled_rows['Wiki_Confidence'].fillna(0.0)
compiled_rows

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that is chained assignment, which pandas warns about and
# which does not update compiled_rows when copy-on-write is enabled.
compiled_rows['News_Relevance'] = compiled_rows['News_Relevance'].fillna(0.0)
compiled_rows

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that is chained assignment, which pandas warns about and
# which does not update compiled_rows when copy-on-write is enabled.
compiled_rows['Technology_Relevance'] = compiled_rows['Technology_Relevance'].fillna(0.0)
compiled_rows

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that is chained assignment, which pandas warns about and
# which does not update compiled_rows when copy-on-write is enabled.
compiled_rows['Twitter_Relevance'] = compiled_rows['Twitter_Relevance'].fillna(0.0)
compiled_rows

In [None]:
import numpy as np
# Turn every remaining NaN in the frame into an empty string.
compiled_rows = compiled_rows.replace(to_replace=np.nan, value='', regex=True)

In [None]:
## Keep the number of distinct companies at 10 or fewer: SPSS Modeler evaluates
## only 10 rows at a time, so each company gets an equal share of those rows.
company_count = len(list(compiled_rows.Company_Name.unique()))
sample_len = int(10 / company_count)
sample_len

In [None]:
# Show up to `sample_len` random rows per company. A company with fewer rows
# is shown whole; the original x.sample(sample_len) raised ValueError for it.
# NOTE(review): the sampled frame is only displayed, not assigned, so the Db2
# cells below still insert every row of compiled_rows — confirm this is intended.
compiled_rows.groupby('Company_Name').apply(
    lambda x: x.sample(min(sample_len, len(x)))
).reset_index(drop=True)

### Store and Add table in DB2 Warehouse 

In [None]:
# Connection settings taken from the credentials cell; the port is fixed at 50000.
dsn_driver = "IBM DB2 ODBC DRIVER"
dsn_database = credentials_1['db']
dsn_hostname = credentials_1['host']
dsn_port = 50000
dsn_uid = credentials_1['username']
dsn_pwd = credentials_1['password']

# Build the connection string from a template with named fields. The original
# concatenated the credential values into the string first and then called
# .format() on the result, so a '{' or '}' inside a credential (for example in
# the password) raised an error or silently changed the value. Here the values
# are substituted by .format() and never parsed as part of the template.
# '{{' and '}}' produce the literal braces around the driver name.
dsn = (
    "DRIVER={{{driver}}};"
    "DATABASE={database};"
    "HOSTNAME={hostname};"
    "PORT={port};"
    "PROTOCOL=TCPIP;"
    "UID={uid};"
    "PWD={pwd};"
).format(
    driver=dsn_driver,
    database=dsn_database,
    hostname=dsn_hostname,
    port=dsn_port,
    uid=dsn_uid,
    pwd=dsn_pwd,
)

conn = ibm_db.connect(dsn, "", "")

In [None]:
# Derive the CREATE TABLE statement from the frame (reset_index adds the index
# as a column), swap every TEXT for VARCHAR(500), and run it.
spss_frame = compiled_rows.reset_index()
create_statement = pd.io.sql.get_schema(spss_frame, 'DATA_FOR_SPSS').replace('TEXT', 'VARCHAR(500)')
ibm_db.exec_immediate(conn, create_statement)

In [None]:
# Kept for backward compatibility; this cell itself does not use it.
tuple_of_tuples = tuple([tuple(x) for x in compiled_rows.values])

# Insert through one prepared statement with parameter markers. The original
# pasted str(tuple) of each row into the SQL text, so scraped text containing
# quotes produced invalid SQL (or altered the statement).
# One '?' per frame column plus one for the leading running index.
placeholders = ", ".join("?" * (len(compiled_rows.columns) + 1))
ins_sql = ibm_db.prepare(
    conn,
    "INSERT INTO " + dsn_uid + ".DATA_FOR_SPSS VALUES (" + placeholders + ")"
)
for i, x in enumerate(compiled_rows.values, start=1):
    # .item() turns NumPy scalars into plain Python values for the driver;
    # plain str/float/int values are passed through unchanged.
    vals = (i,) + tuple(v.item() if hasattr(v, 'item') else v for v in x)
    print(vals)
    ibm_db.execute(ins_sql, vals)


In [None]:
# Create the DATA_FOR_COGNOS table. The DDL is written with bare VARCHAR
# columns; the replace() below gives every one of them a length of 500 before
# the statement is executed. This cell only creates the table; it inserts no rows.
create_statement= 'CREATE TABLE "DATA_FOR_COGNOS" (\n"index" SMALLINT,\n "Company_Name" VARCHAR,\n  "News_Concept" VARCHAR,\n  "News_Link" VARCHAR, \n  "News_Relevance" DECFLOAT,\n "Overall_Sentiment" VARCHAR ,\n  "Twitter_Topic" VARCHAR,\n  "Twitter_news_link" VARCHAR,\n  "Wiki_Concept" VARCHAR,\n "Wiki_Confidence" VARCHAR,\n  "Technology" VARCHAR,\n  "Technology_Relevance" DECFLOAT,\n "Company_News_Sentiments" SMALLINT\n)'
create_statement=create_statement.replace('VARCHAR', 'VARCHAR(500)')
ibm_db.exec_immediate(conn, create_statement)