In [None]:
#Subset to focus on company entities.
#Rows whose `roles` value is missing (None) are dropped as well: the comprehension maps them to False.
is_company = [('company' in x) if x is not None else False for x in comps_cats['roles']]

#Take an explicit copy: later cells add columns to comps_cats, and assigning into a
#.loc slice of the original frame triggers pandas' SettingWithCopyWarning.
comps_cats = comps_cats.loc[is_company].copy()

comps_cats.shape

### Identify 'AI' companies

In [None]:
#Find AI companies based on text description or the AI category

#Search terms used for now. TODO: expand these using semantic similarities
ai_terms = ['data science','machine learning', 'deep learning','artificial intelligence','neural network', ' ai ','natural language processing','text mining']

#Lowercase every description; missing (None) descriptions become NaN
comps_cats['long_description'] = [
    np.nan if type(descr)==NoneType else descr.lower()
    for descr in comps_cats['long_description']]

#Number of distinct AI terms found in each description (NaN when there is no description)
comps_cats['ai_text_n'] = [
    np.nan if pd.isnull(descr) else sum(term in descr for term in ai_terms)
    for descr in comps_cats['long_description']]

In [None]:
#Distribution of the number of AI terms matched per description
comps_cats['ai_text_n'].value_counts()

Most companies that mention AI do this once. Others mention it more often

In [None]:
#Print five random descriptions of companies matching more than 3 AI terms to see what they do
many_ai_terms = comps_cats.loc[comps_cats['ai_text_n']>3,'long_description']

for description in sample(list(many_ai_terms),5):
    print(description)
    print('\n')

In [None]:
#Flag companies that have the AI category. Other categories might also be relevant, we ignore them for now
comps_cats['ai_cats'] = comps_cats['category_name'].apply(lambda cats: 'artificial intelligence' in cats)

#Number of companies with the AI category
comps_cats['ai_cats'].sum()

In [None]:
#Overlap between AI categories and AI related text: number of AI terms in the description (rows)
#against having the AI category (columns)
comp_cats_frequences = pd.crosstab(comps_cats['ai_text_n'],comps_cats['ai_cats'])

#For each number of AI terms: percentage of companies that also have the AI category
row_totals = comp_cats_frequences.sum(axis=1)
comp_cats_frequences['text_share'] = 100*comp_cats_frequences[True]/row_totals

comp_cats_frequences

Around a third of companies with AI in their categories don't mention AI related terms in their descriptions.
There are quite a few companies that mention AI repeatedly but don't have an AI category.

In [None]:
# Quick check of company descriptions for companies that have AI cats but no AI terms

#Print five random descriptions of companies in the AI category whose description matches no AI term
ai_cat_only = (comps_cats['ai_text_n']==0)&(comps_cats['ai_cats']==True)

for description in sample(list(comps_cats.loc[ai_cat_only,'long_description']),5):
    print(description)
    print('\n')


The companies that have ai categories but no ai related terms in their description look quite noisy. Let's exclude them from the analysis for now

In [None]:
#Flag as AI companies with at least one AI term in their description. Later we could change this threshold
#Companies without a description have ai_text_n = NaN, and NaN>0 evaluates to False, so they are flagged as non-AI
comps_cats['ai_flag'] = comps_cats['ai_text_n']>0

### A couple of descriptive analyses

#### Evolution

In [None]:
#We need to create a year variable (founded_on is currently a date); missing dates (None) give NaN
comps_cats['founded_year'] = [np.nan if type(founded)==NoneType else founded.year
                              for founded in comps_cats['founded_on']]

In [None]:
fig,ax = plt.subplots()

#Cross-tabulate founding year against the AI flag, normalised within each column (normalize=1)
#and expressed as a percentage
year_shares = 100*pd.crosstab(comps_cats['founded_year'],comps_cats['ai_flag'],normalize=1)

year_shares.plot(ax=ax,title='Year share of activity')

ax.set_xlim(2000,2018)


Very interesting: explosion of AI startup activity while startup activity in general slows down. What else could be explaining this? China's entry?

#### Geography

In [None]:
#Calculate country distribution
#Rows are countries and columns the ai_flag values; sorted by the number of AI companies (True column)
country_distr = pd.crosstab(comps_cats['country'],comps_cats['ai_flag']).sort_values(True,ascending=False)

#Show the 20 countries with most AI companies
country_distr[:20]

Some of the coverage issues are apparent - relatively limited activity in Japan. And where is China?

In [None]:
#Calculate index of revealed comparative advantage (RCA):
#a country's share of all AI companies divided by its share of all companies.
#The 'ai_rca' column is left out of the country totals so that re-running this cell
#gives the same result (otherwise the previously computed index would be added to the counts).
country_totals = country_distr.drop(columns='ai_rca',errors='ignore').sum(axis=1)

country_distr['ai_rca']= (country_distr[True]/country_distr[True].sum())/(country_totals/country_totals.sum())

In [None]:
#Plot RCAs for top 20 countries by level of activity
#1 is subtracted so that bars above zero mean an above-average share of AI companies and bars below zero the opposite
(country_distr[:20]['ai_rca'].sort_values(ascending=False)-1).plot.bar(title='Relative specialisation in AI for top 20 countries')

Some results are expected (Israel, Singapore). Others (Canada), not so much.

In [None]:
#Number of AI companies per country (rows) and founding year (columns)
ai_by_year_country = comps_cats.groupby(['founded_year','country'])['ai_flag'].sum().reset_index(drop=False)
ai_country_counts = pd.pivot_table(ai_by_year_country,index='country',columns='founded_year',values='ai_flag').fillna(0)

#We focus our visualisation on the top 15 countries by overall AI activity
ai_total_by_country = ai_country_counts.sum(axis=1)
bigger_countries = ai_total_by_country.sort_values(ascending=False).index[:15]

#Normalise every row: share of a country's AI companies that were founded in each year
ai_country_shares = ai_country_counts.div(ai_country_counts.sum(axis=1),axis=0).fillna(0)

In [None]:
fig,ax = plt.subplots()

#Plot a 3-year rolling mean of the yearly shares of the selected countries (one line per country)
ai_country_shares.loc[bigger_countries].T.rolling(window=3).mean().plot(ax=ax,title='Share of year in Country',figsize=(10,5),cmap='tab20',linewidth=2)

ax.set_xlim(2000,2018)
ax.legend()

Everyone seems to be following a similar pattern, perhaps with the exception of Singapore and Switzerland, which seem to be growing faster

#### Consider country sizes (TODO)

### Cluster sectors

In [None]:
from itertools import combinations, product, chain
import networkx as nx
import community

def flatten_list(a_list):
    '''
    Flattens one level of nesting: the elements of every item in a_list are
    collected, in order, into a single new list
    '''
    flat = []
    for el in a_list:
        flat.extend(el)

    return(flat)

In [None]:
#len(set(flatten_list(comps_cats['category_name'])))

In [None]:
#Here the idea is to create a proximity matrix based on co-occurrences

#Turn co-occurrences into pairs of categories we can use to construct a similarity matrix.
#Each category list is sorted before pairing so that a pair always comes out in the same order:
#otherwise ('a','b') and ('b','a') are aggregated below as two different edges, and only one of
#their weights survives when the edge list is loaded into an undirected network.
sector_combs = flatten_list([list(combinations(sorted(x),2)) for x in comps_cats['category_name']])

#Turn the sector combs into an edgelist
edge_list = pd.DataFrame(sector_combs,columns=['source','target'])

edge_list['weight']=1

#Group over edge pairs to aggregate weights
edge_list_weighted = edge_list.groupby(['source','target'])['weight'].sum().reset_index(drop=False)

edge_list_weighted.sort_values('weight',ascending=False).head(n=10)

In [None]:
#Create network and extract communities
net = nx.from_pandas_edgelist(edge_list_weighted,edge_attr=True)

#We choose a low resolution value (lower == more finely grained) in order to obtain
#categories that we can label as creative.
#Louvain community detection is stochastic. The seed is fixed so that the community ids, which the
#sector lookup further down refers to by number, are the same on every run.
#NOTE(review): the hand-assigned ids in that lookup need to be re-checked against the seeded partition.
comms = community.best_partition(net,resolution=0.1,random_state=1803)

In [None]:
#What does this look like?
#Group the category names by the community they were assigned to
comm_strings = pd.DataFrame(comms,index=['comm']).T.groupby('comm')

#Print the members of every community, tab separated, under a running counter
for n,members in enumerate(comm_strings.groups.values()):
    print(n)
    print('====')
    print('\t'.join(list(members)))

In [None]:
#Creative sector lookup
#Maps community ids (the values of `comms`) to hand-assigned creative sector labels.
#Ids that are missing from this lookup are treated as 'not_creative' when categories are looked up.
#NOTE(review): key 92 appears twice below ('journalism_news' and 'e_books'). In a dict literal the
#last value wins, so 92 maps to 'e_books' and 'journalism_news' is never assigned - confirm which
#community id 'e_books' was meant to have.
sector_labels = {0:'3d_printing',
2:'advertising',
5:'social_networks',
7:'apps',
9:'animation_film',
10:'apps',
12:'arts_culture',
15:'audio_music',
16:'immersive',
24:'content_blogging',
25:'advertising',
45:'design_ux',
53:'digital_media',
52:'advertising',
63:'video_games',
64:'events_shows',
67:'fashion',
73:'photography',
78:'smart_cities',
88:'apps',
89:'apps',
92:'journalism_news',
99:'design',
111:'social_media',
113:'video_editing',
114:'web_design',
116:'advertising',
92:'e_books',
11:'architecture'}


In [None]:
# Note that these categories may be too aggregate - eg sports games contain both video games (creative) and sports (non creative).

# One way to deal with this is by increasing the granularity of the community detection.

In [None]:
#Look up the sector label of every category through its community; communities without a label are 'not_creative'
comps_cats['sector_list']= [[sector_labels.get(comms[lab],'not_creative') for lab in cats] for cats in comps_cats['category_name']]

In [None]:
#Companies whose categories all map to one sector get that sector; every other company is 'mixed'
sector_unique = []

for labels in comps_cats['sector_list']:
    distinct = set(labels)
    sector_unique.append(distinct.pop() if len(distinct)==1 else 'mixed')

comps_cats['sector_unique'] = sector_unique

In [None]:
#Looks like enough for a model
#pd.Series(sect).value_counts()

## Train the model

In [None]:
# %load lda_pipeline.py
from gensim import corpora, models
from string import punctuation
from string import digits
import re
import pandas as pd
import numpy as np

#Characters to drop: all punctuation except the hyphen, plus digits
drop_characters = re.sub('-','',punctuation)+digits

#Stopwords
from nltk.corpus import stopwords

#NLTK stores its stopword lists under lowercase names; 'English' only resolves on
#case-insensitive filesystems
stop = stopwords.words('english')

#Stem functions
from nltk.stem import *
stemmer = PorterStemmer()


def clean_tokenise(string,drop_characters=drop_characters,stopwords=stop):
    '''
    Lowercases a string, strips unwanted characters and splits it into tokens.

    Args:
        string: the text to clean.
        drop_characters: string with every character that should be deleted from the text
            (each character is treated literally). Defaults to the module-level drop_characters.
        stopwords: collection of tokens to discard. Defaults to the module-level stop list.

    Returns:
        A list with the remaining tokens in their original order. Tokens are obtained by
        splitting on single spaces; empty tokens are discarded.

    '''

    #Lowercase
    str_low = string.lower()

    #Remove symbols and numbers. The characters are escaped so that regex metacharacters
    #(such as the backslash, ']' or '^') are removed literally instead of altering the character class.
    #An empty drop_characters means there is nothing to remove ('[]' is not a valid pattern).
    if drop_characters:
        str_letters = re.sub('[{drop}]'.format(drop=re.escape(drop_characters)),'',str_low)
    else:
        str_letters = str_low

    #Remove stopwords, using the collection received as an argument
    #(a set makes the membership test constant time)
    stopword_set = set(stopwords)

    clean = [x for x in str_letters.split(' ') if (x not in stopword_set) and (x!='')]

    return(clean)


class CleanTokenize():
    '''
    Turns a list of strings into a clean list of token lists ready to be processed with the LdaPipeline.

    Every method stores its result in self.tokenised and returns the object itself, so calls can be
    chained, e.g. CleanTokenize(corpus).clean().bigram().tokenised

    It has a clean method to remove symbols and stopwords. It has to be called first because it
    creates self.tokenised

    It has a bigram method to detect collocated words (optional)

    It has a stem method to stem words (optional)

    '''

    def __init__(self,corpus):
        '''
        Takes a corpus (list where each element is a string)
        '''

        #Store
        self.corpus = corpus

    def clean(self,drop=drop_characters,stopwords=stop):
        '''
        Lowercases every document, removes unwanted characters and stopwords, and tokenises it.

        Args:
            drop: string with the characters to remove from the documents.
            stopwords: collection of tokens to discard.

        Returns:
            self, with one list of tokens per document stored in self.tokenised

        '''

        #Pass on the stopwords received as an argument (not the module-level default)
        cleaned = [clean_tokenise(doc,drop_characters=drop,stopwords=stopwords) for doc in self.corpus]

        self.tokenised = cleaned
        return(self)

    def stem(self):
        '''
        Optional: stems words with the module-level stemmer

        Returns:
            self, with the stemmed token lists stored in self.tokenised

        '''
        #Stems each word in each tokenised sentence
        stemmed = [[stemmer.stem(word) for word in sentence] for sentence in self.tokenised]

        self.tokenised = stemmed
        return(self)


    def bigram(self,threshold=10):
        '''
        Optional: Create bigrams.

        Args:
            threshold: score threshold passed on to gensim's Phrases model

        Returns:
            self, with the transformed token lists stored in self.tokenised

        '''

        #Colocation detector trained on the data
        phrases = models.Phrases(self.tokenised,threshold=threshold)

        phraser = models.phrases.Phraser(phrases)

        #Applying the phraser to a whole corpus gives a lazy object: materialise it so that
        #self.tokenised is always a list of token lists, like after clean() and stem()
        self.tokenised = list(phraser[self.tokenised])

        return(self)
        
        
        
        

class LdaPipeline():
    '''
    This class processes lists of keywords.
    How does it work?
    -It is initialised with a list where every element is a collection of keywords
    -It has a method to filter keywords removing those that appear less than a set number of times
    -It has a method to process the filtered keywords into objects that gensim can work with
    -It has an optional method to weight the bag of words with tf-idf
    -It has a method to train the LDA model with the right parameters
    -It has a method to predict the topics in a corpus

    Every method returns the object itself so that the steps can be chained, e.g.
    LdaPipeline(tokens).filter().process().fit_lda().predict_topics()

    '''

    def __init__(self,corpus):
        '''
        Takes the list of terms (one collection of tokens per document)
        '''

        #Store the corpus
        self.tokenised = corpus

    def filter(self,minimum=5):
        '''
        Removes keywords that appear less than `minimum` times in the whole corpus.

        Args:
            minimum: number of occurrences a keyword needs in order to be kept (5 by default).

        Returns:
            self. self.tokenised holds the filtered documents and self.empty_groups the number
            of documents that were left without any keyword.

        '''

        #Load
        tokenised = self.tokenised

        #Count tokens
        token_counts = pd.Series([x for el in tokenised for x in el]).value_counts()

        #Tokens to keep: those that reach the minimum number of occurrences
        keep = set(token_counts.index[token_counts>=minimum])

        #Filter
        tokenised_filtered = [[x for x in el if x in keep] for el in tokenised]

        #Store
        self.tokenised = tokenised_filtered
        self.empty_groups = np.sum([len(x)==0 for x in tokenised_filtered])

        return(self)

    def clean(self):
        '''
        Placeholder for removing symbols and numbers: not implemented, the corpus is left untouched.

        Returns:
            self, so that the method can be used in a chain like the other steps.

        '''

        return(self)

    def process(self):
        '''
        This creates the bag of words we use in the gensim analysis

        Returns:
            self. self.dictionary holds the gensim dictionary and self.corpus the bag of words
            representation of every document.

        '''
        #Load the list of keywords
        tokenised = self.tokenised

        #Create the dictionary
        dictionary = corpora.Dictionary(tokenised)

        #Create the Bag of words. This converts keywords into ids
        corpus = [dictionary.doc2bow(x) for x in tokenised]

        self.corpus = corpus
        self.dictionary = dictionary
        return(self)

    def tfidf(self):
        '''
        This is optional: We extract the term-frequency inverse document frequency of the words in
        the corpus. The idea is to identify those keywords that are more salient in a document by normalising over
        their frequency in the whole corpus

        Returns:
            self, with self.corpus replaced by its tf-idf transformation (process() has to run first)

        '''
        #Load the corpus
        corpus = self.corpus

        #Fit a TFIDF model on the data
        tfidf = models.TfidfModel(corpus)

        #Transform the corpus and save it
        self.corpus = tfidf[corpus]

        return(self)

    def fit_lda(self,num_topics=20,passes=5,iterations=75,random_state=1803):
        '''
        This fits the LDA model on self.corpus (process() has to run first).

        Args:
            num_topics: number of topics to extract.
            passes: passed on to gensim's LdaModel.
            iterations: passed on to gensim's LdaModel.
            random_state: seed passed on to gensim's LdaModel for reproducibility.

        Returns:
            self. self.lda_model holds the trained model and self.lda_topics the output of
            its show_topics method.

        '''

        #Load the corpus
        corpus = self.corpus

        #Train the LDA model with the parameters we supplied
        lda = models.LdaModel(corpus,id2word=self.dictionary,
                              num_topics=num_topics,passes=passes,iterations=iterations,random_state=random_state)

        #Save the outputs
        self.lda_model = lda
        self.lda_topics = lda.show_topics(num_topics=num_topics)

        return(self)

    def predict_topics(self):
        '''
        This predicts the topic mix for every observation in the corpus (fit_lda() has to run first).

        Returns:
            self. self.predicted_df has one row per document (numbered from 0) and one column per
            topic id, with 0 for the topics that the model does not report for a document.

        '''
        #Load the attributes we will be working with
        lda = self.lda_model
        corpus = self.corpus

        #Topic mix of every document: a list of (topic id, weight) pairs per document
        predicted = lda[corpus]

        #Convert this into a dataframe. All the rows are built in a single call instead of
        #concatenating one dataframe per document, which is very slow for large corpora
        predicted_df = pd.DataFrame([{x[0]:x[1] for x in topics} for topics in predicted]).fillna(0)

        self.predicted_df = predicted_df

        return(self)
    

In [None]:
# %load text_classifier.py
# CLasses

#ML imports
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

import warnings

warnings.simplefilter('ignore',UserWarning)

#One class for text classification based on text inputs

class TextClassification():
    '''
    This class takes a corpus (could be a list of strings or a tokenised corpus) and a target (could be multiclass or single class).

    When it is initialised it vectorises the list of tokens using sklearn's count vectoriser.

    It has a grid search method that takes a list of models and parameters and trains the model.

    It stores the output of grid search in self.results for diagnosis

    '''

    def __init__(self,corpus,target):
        '''

        Initialise. The class will recognise if we are feeding it a list of strings or a list of
        tokenised documents and vectorise accordingly.

        It will also recognise if this is a multiclass or one class problem based on the dimensions of the target array

        Later on, it will use control flow to modify model parameters depending on the type of data we have

        Args:
            corpus: list of strings or list of token lists.
            target: array-like with the labels. A target with more than one column is handled
                as 'multiclass' (one vs rest); a single column or 1-d target as 'single_class'.

        Stores the mode in self.mode, the target in self.Y, the document-term matrix in self.X and
        the fitted count vectoriser in self.count_vect.

        '''

        #Is this a multiclass classification problem or a single class classification problem?
        #(a one-dimensional target has no second dimension to look at and is single class)
        if np.ndim(target)>1 and target.shape[1]>1:
            self.mode = 'multiclass'

        else:
            self.mode = 'single_class'


        #Store the target
        self.Y = target

        #Did we feed the model a bunch of strings or a list of tokenised docs? If the former, we clean and tokenise.

        if isinstance(corpus[0],str):
            corpus = CleanTokenize(corpus).clean().bigram().tokenised

        #Turn every list of tokens into a string for count vectorising
        corpus_string =  [' '.join(words) for words in corpus]


        #And then we count vectorise in a hacky way.
        #ngram_range is given as a tuple, which is the type recent scikit-learn versions require
        count_vect = CountVectorizer(stop_words='english',min_df=5,ngram_range=(1,2)).fit(corpus_string)

        #Store the features
        self.X = count_vect.transform(corpus_string)

        #Store the count vectoriser (we will use it later on for prediction on new data)
        self.count_vect = count_vect

    def grid_search(self,models):
        '''
        The grid search method takes a list with models and their parameters and it does grid search crossvalidation.

        Args:
            models: list of [estimator, parameter grid] pairs. The list is not modified.

        Returns:
            self, with one fitted GridSearchCV object per model stored in self.results.

        '''

        #Load inputs and targets into the model
        Y = self.Y
        X = self.X

        #Build the (estimator, grid) pairs to search without touching the list we received:
        #wrapping the estimators in place would wrap them again if the method is called a second time.
        candidates = []

        for estimator,params in models:

            if self.mode=='multiclass':
                #If the model is multiclass then we make it one vs rest...
                estimator = OneVsRestClassifier(estimator)

                #...and add the estimator prefix to the names of the model params
                params = {'estimator__'+k:v for k,v in params.items()}

            candidates.append((estimator,params))


        #Container with results
        results = []

        #For each model, run the analysis.
        for num,(estimator,params) in enumerate(candidates):
            print(num)

            #Run the classifier
            clf = GridSearchCV(estimator,params)

            #Fit
            clf.fit(X,Y)

            #Append results
            results.append(clf)

        self.results = results
        return(self)

    
#Class to visualise the outputs of multilabel models.

#I call it OrangeBrick after YellowBrick, the package for ML output visualisation 
#(which currently doesn't support multilabel classification)


class OrangeBrick():
    '''
    This class takes the true and predicted classes for a multilabel classification exercise and produces some charts visualising findings.

    The methods include:

        .make_metrics: calculates the confusion counts and the precision and recall of every class. It has to run before the charts
        .confusion_chart: creates a stacked barchart with the confusion matrices stacked by category, sorting classes by performance
        .prec_rec_chart: creates a barchart showing each class precision and recall;
        #Tobe done: Consider mixes between classes?

    '''

    def __init__(self,true_labels,predicted_labels,var_names):
        '''
        Initialise with a true labels, predicted labels and the variable names

        true_labels and predicted_labels are indexed as [:,class number], with the columns in the order of var_names
        '''

        self.true_labels = true_labels
        self.predicted_labels = predicted_labels
        self.var_names = var_names

    def make_metrics(self):
        '''
        Estimates performance metrics (for now just confusion counts by class and precision/recall scores for
        the predicted labels that were supplied).

        Returns:
            self. self.score_store has one [confusion counts, precision and recall] pair of Series per class.

        '''
        #NB the predictions are passed to confusion_matrix as its first argument and the ground truth as the second one,
        #so the rows of cf index the predicted class and its columns the ground truth.
        #This means that:
            #cf[0,0]-> TN
            #cf[1,1]-> TP
            #cf[0,1]-> FN (prediction is false, groundtruth is true)
            #cf[1,0]-> FP (prediction is true, ground truth is false)



        #Predictions and true labels
        true_labels = self.true_labels
        pred_labels = self.predicted_labels

        #Variable names
        var_names = self.var_names

        #Store confusion matrices
        score_store = []


        for num in np.arange(len(var_names)):

            #This is the confusion matrix
            cf = confusion_matrix(pred_labels[:,num],true_labels[:,num])

            #This is a melted confusion matrix. Melting goes column by column,
            #which gives the order cf[0,0], cf[1,0], cf[0,1], cf[1,1]
            melt_cf = pd.melt(pd.DataFrame(cf).reset_index(drop=False),id_vars='index')['value']
            melt_cf.index = ['true_negative','false_positive','false_negative','true_positive']
            melt_cf.name = var_names[num]

            #Order variables to separate failed vs correct predictions
            melt_cf = melt_cf.loc[['true_positive','true_negative','false_positive','false_negative']]

            #We are also interested in precision and recall
            prec = cf[1,1]/(cf[1,1]+cf[1,0])
            rec = cf[1,1]/(cf[1,1]+cf[0,1])

            prec_rec = pd.Series([prec,rec],index=['precision','recall'])
            prec_rec.name = var_names[num]
            score_store.append([melt_cf,prec_rec])

        self.score_store = score_store

        return(self)

    def confusion_chart(self,ax):
        '''
        Plot the confusion charts: a stacked bar per class on the axis `ax`, with the classes
        sorted from the highest to the lowest share of failed predictions

        '''

        #Visualise confusion matrix outputs (one column per class).
        #axis is passed by keyword: pandas 2 does not accept it as a positional argument any more
        cf_df = pd.concat([x[0] for x in self.score_store],axis=1)

        #This ranks categories by the error rates
        failure_rate = cf_df.apply(lambda x: x/x.sum(),axis=0).loc[['false' in x for x in cf_df.index]].sum().sort_values(
            ascending=False).index


        #Plot and add labels
        cf_df.T.loc[failure_rate,:].plot.bar(stacked=True,ax=ax,width=0.8,cmap='Accent')

        ax.legend(bbox_to_anchor=(1.01,1))
        #ax.set_title('Stacked confusion matrix for disease areas',size=16)


    def prec_rec_chart(self,ax):
        '''

        Plot a precision-recall chart: a pair of bars per class on the axis `ax`, with the
        classes sorted by precision

        '''


        #Again, we sort them here to assess model performance in different classes.
        #axis is passed by keyword: pandas 2 does not accept it as a positional argument any more
        prec_rec = pd.concat([x[1] for x in self.score_store],axis=1).T.sort_values('precision')
        prec_rec.plot.bar(ax=ax)

        #Add legend and title
        ax.legend(bbox_to_anchor=(1.01,1))
        #ax.set_title('Precision and Recall by disease area',size=16)

In [None]:
#Here is the corpus: companies with a single creative sector and with a description

#We focus on 'pure cases': drop the companies labelled as 'mixed' or 'not_creative'
is_pure = ~comps_cats['sector_unique'].isin(['mixed','not_creative'])
cb_pure_cases = comps_cats.loc[is_pure].reset_index(drop=True)

#Drop cases with no descriptions
cb_pure_cases = cb_pure_cases.dropna(axis=0,subset=['long_description']).reset_index(drop=True)

In [None]:
#cb_pure_cases['sector_unique'].value_counts()

In [None]:
#Descriptions of the pure cases: the text the classifier is trained on
corpus = cb_pure_cases['long_description'].tolist()

#One indicator column per sector, used as the target of a one vs rest classification
target = pd.get_dummies(cb_pure_cases['sector_unique'])

In [None]:
#Run grid search with these model parameters
my_models = [
    [RandomForestClassifier(),
     {'class_weight':['balanced'],'min_samples_leaf':[1,5]}],

    #liblinear is requested explicitly because it supports both penalties in the grid:
    #scikit-learn's current default solver (lbfgs) rejects penalty='l1'
    [LogisticRegression(solver='liblinear'),
     {'class_weight':['balanced'],'penalty':['l1','l2'],
      'C':[0.1,1,100]}]]

In [None]:
# Predict groups

#Initialise the TextClassification class
#(this tokenises the descriptions and count-vectorises them, see TextClassification.__init__)
cb_t = TextClassification(corpus,target)

In [None]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [None]:
#Grid-search every model in my_models; the fitted searches are stored in cb_t.results
cb_t.grid_search(my_models)

In [None]:
#Check scores and best estimators
for search in cb_t.results:
    print(search.best_score_)
    print(search.best_estimator_)

#We carry on with the best estimator found for the second model in my_models.
#NOTE(review): the position is hard-coded rather than chosen from the scores printed above
best_est = cb_t.results[1].best_estimator_

In [None]:
#Compare the predictions (predicted probability above 0.5) with the true labels, sector by sector.
#Note that the predictions are made on cb_t.X, the same data the grid search was fitted on.
cb_diag = OrangeBrick(true_labels=np.array(target),
                      predicted_labels=best_est.predict_proba(cb_t.X)>0.5,
                      var_names=target.columns).make_metrics()

In [None]:
#Two stacked panels: confusion counts per sector (top) and precision / recall per sector (bottom)
fig,ax = plt.subplots(nrows=2,figsize=(10,10))

cb_diag.confusion_chart(ax=ax[0])
cb_diag.prec_rec_chart(ax=ax[1])

#fig.suptitle('Model evaluation for GTR disciplines',y=1.01,size=16)

plt.tight_layout()

### Apply the model to arXiv

In [None]:
# arx = pd.read_csv('../data/processed/6_8_2019_arxiv_processed.csv',compression='zip')

# arx_papers = arx.drop_duplicates('article_id').reset_index(drop=True)

In [None]:
# Transform the arXiv data using the same model we used to train the model before

# arx_trans = cb_t.count_vect.transform(arx_papers['summary'])

In [None]:
# arx_preds = pd.DataFrame(best_est.predict_proba(arx_trans),columns=target.columns)

In [None]:
# for abs in arx_papers.loc[arx_preds['video_games']>0.99999]['summary'][:10]:
    
#     print(abs)

Alas, it doesn't work

### Analyse AI trends in creative sectors in CrunchBase

In [None]:
#Remove all observations without description
#The index is reset so that rows are numbered from 0, like the predictions generated below
cb_with_descr = comps_cats.dropna(axis=0,subset=['long_description']).reset_index(drop=True)

In [None]:
#Transform the corpus
#We reuse the count vectoriser fitted by TextClassification so that the features match the trained model
all_cb_transformed = cb_t.count_vect.transform(cb_with_descr['long_description'])

In [None]:
#Generate predictions
#A company is assigned to a sector only when its predicted probability is above 0.9999
all_preds = pd.DataFrame(
    best_est.predict_proba(all_cb_transformed)>0.9999,
    columns=target.columns)

In [None]:
#Number of companies assigned to each sector
all_preds.sum()

In [None]:
#Can we use the predicted labels with the rest of the CrunchBase data?

In [None]:
#Variables available in the CrunchBase data
cb_with_descr.columns

In [None]:
#Put ids and descriptions side by side with the predicted labels.
#Rows are matched on the index: both frames are numbered from 0 in the same order
all_cb_descr= pd.concat([cb_with_descr[['id','long_description']],all_preds],axis=1)

In [None]:
# for s in out.columns:
    
#     print(s)
#     print('=====')
    
#     print('\n')
#     for x in all_cb_descr.loc[all_cb_descr[s]==True,'long_description'][:5]:
        
#         print(x[:500])
        
#         print('\n')
    
#     print('\n')


The above looks fine. We will reclassify these companies into creative sectors and calculate some descriptives

In [None]:
#Lookup from the predicted labels (keys) to the creative sectors used in the analysis (values).
#The labels that are commented out are left without a sector.
creative_sector_lookup = {
    '3d_printing':'crafts', 
    'advertising':'advertising',
    'animation_film':'film_video_tv',
    'apps':'software',
    'arts_culture':'music_performing_arts',
    'audio_music':'music_performing_arts',
    'content_blogging':'publishing',
    'design':'design',
    'design_ux':'design',
    'e_books':'publishing',
    #'events_shows':'music_performing_arts',
    #'fashion':'design',
    'immersive':'games_immersive',
    'journalism_news':'publishing',
    'photography':'film_video_tv',
    'social_media':'software',
    'social_networks':'software',
    'video_editing':'film_video_tv',
    'video_games':'games_immersive',
    'web_design':'design',
    'architecture':'architecture',
    'digital_media':'film_video_tv'
}


#Turn into a df for merging: the labels end up in a column called 'index' and the sectors in 'sector'
creative_df = pd.DataFrame.from_dict(creative_sector_lookup,orient='index',
                                    columns=['sector']).reset_index(drop=False)
creative_df

In [None]:
# We want to convert the predicted data across categories. Do Melt, apply, pivot

In [None]:
#Melt the predictions into long format (one row per company and predicted label) and
#attach the creative sector of every label. With the default inner merge, rows whose label
#has no entry in creative_df are dropped
cb_sector_merged = pd.merge(pd.melt(all_cb_descr,id_vars=['id','long_description']).reset_index(drop=False),
                 creative_df,left_on='variable',right_on='index')

In [None]:
#NOTE(review): this assigns the column to itself, so it has no effect. Presumably a conversion was intended - confirm or remove
cb_sector_merged['value'] = cb_sector_merged['value']

In [None]:
#Keep the positive predictions only and count them per company (rows) and creative sector (columns).
#Sectors a company was not assigned to are filled with 0.
#The aggregation is named as a string: passing the builtin sum is deprecated in recent pandas versions
cb_sector_reshaped = cb_sector_merged.loc[cb_sector_merged['value']==True].pivot_table(
    index=['id'],columns='sector',values='value',aggfunc='sum').fillna(0)

In [None]:
#Number of companies assigned to each creative sector
(cb_sector_reshaped>0).sum()

In [None]:
#Now we merge cb sector reshaped with the cb df with all sectors

In [None]:
#Attach the sector columns to the company data. Only the companies that were assigned
#to at least one creative sector are kept
sector_columns = cb_sector_reshaped.reset_index(drop=False)
cb_sectors = pd.merge(cb_with_descr,sector_columns,on='id')

### Descriptive analysis of AI CrunchBase activity globally

In [None]:
def save_fig(figname,path='../reports/figures/figures_report/cb_'):
    '''
    Tightens the layout of the current matplotlib figure and saves it.
    The output file is `path` followed by `figname`
    '''
    plt.tight_layout()

    target_file = path+figname
    plt.savefig(target_file)

In [None]:
#Percentage of the companies with a description that were assigned to a creative sector
100*len(cb_sectors)/len(cb_with_descr)

In [None]:
#Percentage of AI companies among the companies in creative sectors
100*cb_sectors['ai_flag'].sum()/len(cb_sectors)

In [None]:
#Percentage of AI companies among all the companies with a description
100*cb_with_descr['ai_flag'].sum()/len(cb_with_descr)

In [None]:
#Number of companies with a description
len(cb_with_descr['ai_flag'])

#### Sectoral distribution of activity

In [None]:
#Distinct creative sectors, as a sorted list: the sectors are iterated to build tables and used as
#their column labels, so their order has to be the same on every run (the order of a set of strings is not)
creative_sectors = sorted(set(creative_sector_lookup.values()))

In [None]:
#Number of non-AI (False) and AI (True) companies in every creative sector, one column per sector
sector_order = list(creative_sectors)

sector_distr = pd.concat(
    [cb_sectors.loc[cb_sectors[sector]>0,'ai_flag'].value_counts() for sector in sector_order],
    axis=1,keys=sector_order)

In [None]:
#Show sectors in rows and the AI flag in columns
sector_distr.T

In [None]:
#Percentage of AI companies in every sector (each row is divided by its total), sorted from highest to lowest
(100*sector_distr.T.apply(lambda x: x/x.sum(),axis=1).sort_values(
    True,ascending=False)[True]).plot.bar(title='Share of AI projects in vertical')

save_fig('ai_share_sector.pdf')

Three examples from each sector:

In [None]:
import random

In [None]:
#Print (the first 500 characters of) up to three random AI company descriptions per sector
for s in creative_sectors:

    print(s)
    print('===')

    ai_ex = cb_sectors.loc[(cb_sectors[s]>0) & (cb_sectors['ai_flag']==True)]

    #random.sample raises a ValueError when asked for more elements than there are,
    #so sectors with fewer than three AI companies show all the ones they have
    descriptions = list(ai_ex['long_description'])
    choose_three = random.sample(descriptions,min(3,len(descriptions)))

    for des in choose_three:

        print(des[:500])
        print('\n')
    

#### Sectoral distribution of funding

In [None]:
#Total funding of non-AI (False) and AI (True) companies in every creative sector, one column per sector
sector_order = list(creative_sectors)

sector_fund_distr = pd.concat(
    [cb_sectors.loc[cb_sectors[sector]>0].groupby('ai_flag')['funding_total_usd'].sum()
     for sector in sector_order],
    axis=1,keys=sector_order)

In [None]:
#Funding per sector in millions (values divided by 1e6), sorted by the funding of AI companies
sector_fund_distr.T.sort_values(True,ascending=False)/1e6

In [None]:
#Sectors ordered from the highest to the lowest funding of AI companies; used to order the charts below
top_sectors = sector_fund_distr.T.sort_values(True,ascending=False)[True].index

#### Overall trends

In [None]:
#Quick check: founded_on holds dates with a year attribute
cb_sectors['founded_on'][0].year

In [None]:
#Founding year of every company. Missing founding dates give NaN instead of failing on `.year`
#(the same rule that is applied when comps_cats['founded_year'] is created)
cb_sectors['year_founded'] = [x.year if pd.notnull(x) else np.nan for x in cb_sectors['founded_on']]

In [None]:
#Number of AI companies founded per year in every creative sector (years in rows, sectors in columns)
sector_order = list(creative_sectors)

sector_year = pd.concat(
    [cb_sectors.loc[(cb_sectors[sect]>0)&(cb_sectors['ai_flag']==True),'year_founded'].value_counts()
     for sect in sector_order],
    axis=1,keys=sector_order).fillna(0)

#Keep the companies founded between 2001 and 2018
recent_years = (sector_year.index>2000)&(sector_year.index<2019)
sector_year = sector_year.loc[recent_years]

sector_year[top_sectors].plot.bar(stacked=True,title='Number of companies')

save_fig('ai_sector_trends.pdf')

Interesting! There has been a drop in the number of companies in the last couple of years. What about levels of funding?

Let's look at the comparative picture first

In [None]:
#Number of companies founded per year (all the companies with a description)
cb_year_all = cb_with_descr['founded_year'].value_counts()

#Keep 2000-2018 in chronological order. reindex is used instead of .loc so that a year
#without any company shows up as 0 instead of raising a KeyError
cb_year_rec = cb_year_all.reindex(np.arange(2000,2019),fill_value=0)

cb_year_rec.plot.bar()

In [None]:
#Number of AI companies founded per year
cb_year_all_ai = cb_with_descr.loc[cb_with_descr['ai_flag']==True,'founded_year'].value_counts()

#Keep 2000-2018 in chronological order; years without any AI company are shown as 0
#(selecting a missing year with .loc would raise a KeyError)
cb_year_rec_ai = cb_year_all_ai.reindex(np.arange(2000,2019),fill_value=0)

cb_year_rec_ai.plot.bar()

In [None]:
#Sectors ordered from the highest to the lowest number of AI companies over the whole period
sectors_sorted = sector_year.sum().sort_values().index[::-1]

In [None]:
#Share of every sector in the AI companies founded each year (rows are divided by their total),
#smoothed with a 3-year rolling mean; the first years, which have no complete window, are dropped
ax = sector_year.apply(lambda x: x/x.sum(),axis=1)[sectors_sorted].rolling(window=3).mean().dropna().plot.bar(
    stacked=True,title='share of AI activity',figsize=(8,5))

ax.legend(bbox_to_anchor=(1,1))

save_fig('ai_sector_share_trends.pdf')

In [None]:
#Total funding of the AI companies in every creative sector by founding year (years in rows, sectors in columns)
sector_order = list(creative_sectors)

sector_year_funding = pd.concat(
    [cb_sectors.loc[(cb_sectors[sect]>0)&(cb_sectors['ai_flag']==True)].groupby('year_founded')['funding_total_usd'].sum()
     for sect in sector_order],
    axis=1,keys=sector_order).fillna(0)

#Keep the companies founded between 2001 and 2018
recent_years = (sector_year_funding.index>2000)&(sector_year_funding.index<2019)
sector_year_funding = sector_year_funding.loc[recent_years]

#Plot a 2-year rolling mean of the funding
funding_smoothed = sector_year_funding.rolling(window=2).mean()
ax = funding_smoothed.plot.bar(stacked=True,title='Level of funding for creative AI',figsize=(8,5))

ax.legend(bbox_to_anchor=(1,1))

save_fig('ai_sector_trends_funding.pdf')

#### What companies received the biggest amount of funding per sector?

In [None]:
#For every sector, print name, founding year, funding and description of its three best funded AI companies
for s in creative_sectors:

    print(s)
    print('===')

    in_sector_and_ai = (cb_sectors[s]>0) & (cb_sectors['ai_flag']==True)
    ai_ex = cb_sectors.loc[in_sector_and_ai]

    top_3 = ai_ex.sort_values('funding_total_usd',ascending=False).head(3)

    for _,company in top_3.iterrows():

        funding_millions = company['funding_total_usd']/1e6

        print(company['company_name'])
        print(company['year_founded'])
        print('$'+str(funding_millions)+' million')
        print('----')

        print(company['long_description'][:500])

        print('\n')
    

#### Geography

In [None]:
def create_lq_df(df):
    '''
    Turns a table of activity levels into a table of location quotients (LQs).

    For the cell in row r and column c:

        LQ[r, c] = (df[r, c] / total of row r) / (total of column c / grand total)

    i.e. the share of column c within row r relative to the share of column c
    in the whole table. A value above 1 means the row is over-represented in
    that column.

    args:
        -df is a numeric dataframe with activity levels in its cells

    returns a dataframe with the same index and columns as df and LQs in its
    cells. Rows that sum to zero come out as NaN.

    '''

    #Share of every column in the whole table
    area_activity = df.sum(axis=0)
    area_shares = area_activity/area_activity.sum()

    #Share of every column within each row (vectorised rather than a row-wise apply)
    row_shares = df.div(df.sum(axis=1),axis=0)

    lqs = row_shares.div(area_shares,axis=1)
    return(lqs)

import seaborn as sn

In [None]:
# One column per creative sector: number of AI-flagged companies with a positive
# value in that sector column, by country (0 where a country has none).
sector_country = pd.concat(
    [
        cb_sectors
        .loc[(cb_sectors['ai_flag']==True)&(cb_sectors[sect]>0),'country']
        .value_counts()
        for sect in creative_sectors
    ],
    axis=1,
).fillna(0)

sector_country.columns = creative_sectors

# The 15 countries with the largest totals across all sector columns
big_countries = (
    sector_country
    .sum(axis=1)
    .sort_values(ascending=False)
    .index[:15]
)

sector_country.loc[big_countries,top_sectors].plot.bar(stacked=True)

save_fig('ai_country_totals.pdf')


In [None]:
# Percentage of the AI-flagged rows of cb_sectors located in the United States.
# `==True` is applied to ai_flag before the masks are combined: `&` binds tighter
# than `==`, so the comparison has to sit inside the bracket.
100*len(cb_sectors.loc[(cb_sectors['country']=='United States')&(cb_sectors['ai_flag']==True)])/len(cb_sectors.loc[cb_sectors['ai_flag']==True])

In [None]:
# Percentage of the AI-flagged rows of cb_sectors located in the United Kingdom.
# `==True` is applied to ai_flag before the masks are combined: `&` binds tighter
# than `==`, so the comparison has to sit inside the bracket.
100*len(cb_sectors.loc[(cb_sectors['country']=='United Kingdom')&(cb_sectors['ai_flag']==True)])/len(cb_sectors.loc[cb_sectors['ai_flag']==True])

#### Market 'shares'

In [None]:
# Share of each country within every sector column (columns sum to 1), shown for
# the big countries and top sectors with one stacked bar per sector.
ax = (
    sector_country
    .apply(lambda col: col/col.sum(),axis=0)
    .loc[big_countries,top_sectors]
    .T
    .plot.bar(stacked=True,cmap='tab20',edgecolor='lightgrey',title='Market shares',figsize=(10,5))
)

ax.legend(bbox_to_anchor=(1,1))

save_fig('ai_country_shares.pdf')

#### Specialisation (discretised)

In [None]:
#Benchmark population: every company in cb_with_descr that is not an AI-flagged
#company in cb_sectors. It still contains AI and non AI companies; any further
#filtering on ai_flag has to be done on top of this table.
ai_creative_ids = set(cb_sectors.loc[cb_sectors['ai_flag']==True,'id'])

#Vectorised membership test instead of a Python-level loop over every id
non_ci_ai = cb_with_descr.loc[~cb_with_descr['id'].isin(ai_creative_ids)]

In [None]:
# Country counts of the AI-flagged rows of non_ci_ai, as a one-column frame.
# Despite its name, non_ai_country holds AI companies (those outside cb_sectors' AI set).
non_ai_country = (
    non_ci_ai.loc[non_ci_ai['ai_flag']==True,'country']
    .value_counts()
    .to_frame()
)

non_ai_country.columns= ['AI non CI']

# Country counts of the rows of cb_with_descr that are not flagged as AI
non_ai_all = (
    cb_with_descr.loc[cb_with_descr['ai_flag']==False,'country']
    .value_counts()
    .rename('Non AI')
)

In [None]:
# Countries as rows; columns are the non AI counts, the AI non CI counts and the
# AI counts for each top sector. Missing country/column combinations become 0.
all_country_activity = pd.concat(
    [
        non_ai_all,
        non_ai_country,
        sector_country[top_sectors],
    ],
    axis=1,
).fillna(0)

In [None]:
# Location quotients: each row's mix of activity across the columns, relative to the mix in the whole table
all_country_lq = create_lq_df(all_country_activity)

In [None]:
fig,ax = plt.subplots(figsize=(8,5))

# Discretise every row of the LQ table into quantile bins coded from 0 upwards.
# The edges 0, 0.2, ..., 1 give five bins (quintiles); duplicates='drop' can
# leave fewer bins when several edges coincide.
lq_bins = all_country_lq.apply(lambda x: pd.qcut(x,np.arange(0,1.1,0.2),labels=False,duplicates='drop'),axis=1)

# Cell borders are set through seaborn's own linecolor/linewidths arguments:
# heatmap overrides a raw `edgecolor` keyword with linecolor, and a raw
# `linewidth` duplicates the linewidths value seaborn already forwards.
ax = sn.heatmap(lq_bins.loc[big_countries].T,
           cmap='Oranges',linecolor='lightgrey',linewidths=0.01,ax=ax)

ax.collections[0].colorbar.set_label("Specialisation quintile")
ax.set_title('Country specialisation')

save_fig('ai_sector_specs.pdf')

### Evolution of market shares

In [None]:
def make_sector_year_trend(df,year_var,geo_var):
    '''
    Cross-tabulates the rows of df by year and geography

    args:

        -df is a table where every row is an entity
        -year_var is the name of the column with the year
        -geo_var is the name of the column with the geography (e.g. the country)

    returns a table with years as rows, geographies as columns and the number
    of entities in the cells

    '''

    country_year = pd.crosstab(df[year_var],df[geo_var])

    return(country_year)


def geo_trends_country(df,ai_var,year_var,geo_var,sector_names,threshold,country,top_c=False,year_lims= False):

    '''
    Yearly share of one country in AI activity, overall and by sector

    For every year, the share is the number of entities in the country divided
    by the number of entities in all countries.

    args:

        -df is the table with the information, every row is an entity with geographical information, ai information, sector information etc
        -ai_var is the name of the (boolean) ai variable
        -year_var is the name of the variable with the year
        -geo_var is the name of the variable with the countries
        -sector_names is the sectors (columns of df) we want to extract information for
        -threshold is the value a sector column has to exceed for an entity to count in that sector
        -country is the country (a value of geo_var) whose shares we return
        -top_c is not used at the moment. It is kept so that existing calls keep working
        -year_lims is an optional (first, last) pair. Only years strictly between both values are kept in the sector series

    returns a dictionary of series indexed by year:

        -'All' is the share of the country in all the entities in df (AI or not), as a benchmark
        -There is one item per sector with the share of the country in the AI entities of that sector.
         Sectors where the country has no AI entities are left out.

    raises a KeyError if the country does not appear in geo_var at all

    '''
    #We focus on ai entities
    ai_papers = df.loc[df[ai_var]==True]

    #We store the sectoral results here
    sector_store = {}

    #Also for all entities (as a benchmark)
    #NOTE(review): year_lims is not applied to the benchmark, only to the sectors - confirm this is intended
    all_papers = make_sector_year_trend(df,year_var,geo_var)

    all_papers_shares = all_papers.div(all_papers.sum(axis=1),axis=0)

    sector_store['All'] = all_papers_shares[country]

    #For each sector
    for s in sector_names:

        #Calculate number of ai entities in sector by year and country
        out = make_sector_year_trend(ai_papers.loc[ai_papers[s]>threshold],year_var,geo_var)

        #Skip sectors without activity in the country. This is an explicit check
        #so that any other error is raised instead of being silently swallowed
        if country not in out.columns:
            continue

        out_shares = out.div(out.sum(axis=1),axis=0)

        #This is to limit the number of years we focus on
        if year_lims:

            out_shares = out_shares.loc[(out_shares.index>year_lims[0]) & (out_shares.index<year_lims[1])]

        sector_store[s]=out_shares[country]

    return(sector_store)
        

In [None]:
# One table per sector with AI company counts by founding year (rows) and country (columns)
out_all = []

# The United Kingdom column of those tables, for the sectors that have one
out_uk = []

for s in sector_year:

    # Restrict to AI-flagged companies: these tables feed the 'creative AI'
    # market share figure, and the other sector breakdowns in this notebook
    # apply the same ai_flag filter.
    s_out = cb_sectors.loc[(cb_sectors[s]>0)&(cb_sectors['ai_flag']==True)]

    country_by_year = pd.crosstab(s_out['year_founded'],s_out['country'])

    if 'United Kingdom' in country_by_year.columns:
        out_uk.append(country_by_year['United Kingdom'])
    out_all.append(country_by_year)
    

In [None]:
# Yearly totals across all sector tables for the UK and for all countries, 2000-2018.
# reindex is used instead of .loc so that a year with no activity (plausible for
# the UK series) becomes 0 rather than raising a KeyError.
ai_uk_creative = pd.concat(out_uk,axis=1).fillna(0).sum(axis=1).reindex(np.arange(2000,2019),fill_value=0)
ai_all_creative = pd.concat(out_all,axis=1).fillna(0).sum(axis=1).reindex(np.arange(2000,2019),fill_value=0)

In [None]:
# UK total as a percentage of the all-country total, smoothed with a 3-year
# rolling mean (incomplete leading windows are dropped).
ax = (
    (ai_uk_creative/ai_all_creative)
    .rolling(window=3)
    .mean()
    .dropna()
    .mul(100)
    .plot(title='UK Market share in creative AI activity')
)

# Fixed y axis from 0 to 10 (percent)
ax.set_ylim(0,10)

save_fig('uk_market_share.pdf')

### Examples

In [None]:
# For every big country, print the ten AI-flagged companies with the highest
# funding_total_usd together with the start of their description.
for c in big_countries:

    print(c)
    print('===')

    country_ai_mask = (cb_sectors['country']==c) & (cb_sectors['ai_flag']==True)

    best_funded = (
        cb_sectors.loc[country_ai_mask]
        .sort_values('funding_total_usd',ascending=False)
        .head(10)
    )

    for _,company in best_funded.iterrows():

        # Funding is reported in millions
        funding_millions = company['funding_total_usd']/1e6

        print(company['company_name'])
        print(company['year_founded'])
        print('$'+str(funding_millions)+' million')
        print('----')

        # Only the first 500 characters of the description are shown
        print(company['long_description'][:500])

        print('\n')
    

#### Output keyword lookup

In [None]:
# Two chained lookups: comms value -> sector_labels -> creative_sector_lookup.
# Keys of comms whose value is missing from either mapping are dropped.
# (presumably the keys of comms are the keywords - confirm upstream)
creative_appendix_lookup = {
    key: sector_labels[community]
    for key,community in comms.items()
    if community in sector_labels
}
creative_appendix_lookup_2 = {
    key: creative_sector_lookup[label]
    for key,label in creative_appendix_lookup.items()
    if label in creative_sector_lookup
}

# One row per key. After the transpose and reset_index, the column called
# 'keyword' holds the looked-up values and the column called 'index' holds the keys.
append = pd.DataFrame(creative_appendix_lookup_2,index=['keyword']).T.reset_index()

# Comma-separated list of keys for every looked-up value
append.groupby('keyword')['index'].apply(', '.join).to_csv('../data/external/cb_lookup.csv')

In [None]:
# Save the United Kingdom rows of cb_sectors to a dated file.
# NOTE(review): compression='zip' writes a zip archive although the file name ends in .csv,
# so readers have to pass compression='zip' explicitly - confirm this is intended.
cb_sectors.loc[cb_sectors['country']=='United Kingdom'].to_csv(f'../data/processed/{today_str}_uk_creative_companies.csv',compression='zip')

In [None]:
# Number of rows and columns in cb_sectors
cb_sectors.shape

In [None]:
# Store the names in top_sectors as a JSON list in a dated file
with open(f'../data/processed/{today_str}_arxiv_creative_sector_names.json','w') as json_file:
    json_file.write(json.dumps(list(top_sectors)))

In [None]:
from data_getters.labs.core import download_file