## Clean Technologies 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
categories = pd.read_excel('../data/categories.xls')
# Forward-fill blank cells so every row carries its sector / group labels
# (presumably the sheet only fills them on the first row of each group — confirm).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
categories = categories.ffill()
categories.columns = ['sector','technology_group','technology','description','technology_term','source']
categories.head()

In [None]:
# Turn each comma-separated term string into a list of terms: every piece is
# stripped of surrounding whitespace first, then curly quotes are removed.
categories['technology_term'] = [
    [piece.strip().replace('“', '').replace('”', '') for piece in raw_terms.split(',')]
    for raw_terms in categories['technology_term']
]

---

In [None]:
#https://data.europa.eu/euodp/en/data/dataset/cordisH2020projects

In [None]:
cordish2020 = pd.read_excel('../data/cordis-h2020projects.xlsx')
#cordish2020 = pd.read_csv('../data/cordis-h2020projects.csv', sep=";")

In [None]:
cordish2020.columns

In [None]:
# The cost columns use a decimal comma; swap it for a dot and convert to float.
for money_col in ('totalCost', 'ecMaxContribution'):
    cordish2020[money_col] = [
        float(str(amount).replace(',', '.')) for amount in cordish2020[money_col]
    ]

In [None]:
# Convert the project start and end dates to pandas Timestamps.
for date_col in ('startDate', 'endDate'):
    cordish2020[date_col] = cordish2020[date_col].map(pd.Timestamp)

In [None]:
# Total number of projects
len(cordish2020)

Objective example

---

In [None]:
cordish2020.loc[13]['objective']

---

### Deterministic text labeling, Flashtext for easy regex

In [None]:
from flashtext import KeywordProcessor

In [None]:
def extract(vec, dictionary, info=False):
    """Run a keyword processor over every text in ``vec``.

    Parameters
    ----------
    vec : iterable
        Texts to scan, e.g. a pandas Series of project objectives.
    dictionary : object
        Anything exposing ``extract_keywords(text, span_info=...)``, such as
        a flashtext ``KeywordProcessor``.
    info : bool, default False
        Forwarded to ``extract_keywords`` as ``span_info``.

    Returns
    -------
    list
        One entry per element of ``vec``, in the same order, holding whatever
        ``extract_keywords`` returned for that text. Entries that are not
        strings (``None``, or ``NaN`` from an empty spreadsheet cell) produce
        an empty list instead of being handed to the processor.
    """
    return [
        dictionary.extract_keywords(line, span_info=info) if isinstance(line, str) else []
        for line in vec
    ]

In [None]:
# unique list of technology group
technology_group = list(categories['technology_group'].unique())
len(technology_group)

In [None]:
# Look for the technology-group names in every project objective.
dictionary1 = KeywordProcessor()
dictionary1.add_keywords_from_list(technology_group)
extracted1 = extract(cordish2020['objective'], dictionary1)
# Distinct matches per project ('' when nothing matched), then raw and unique counts.
cordish2020['matches_group'] = [list(set(found)) if found else '' for found in extracted1]
cordish2020['count_matches_group'] = list(map(len, extracted1))
cordish2020['count_unique_matches_group'] = [len(set(found)) for found in extracted1]

In [None]:
# unique list of technology term
technology_term = [y for x in categories['technology_term'] for y in x if y != '']
len(technology_term)

In [None]:
# Look for the individual technology terms in every project objective.
dictionary2 = KeywordProcessor()
dictionary2.add_keywords_from_list(technology_term)
extracted2 = extract(cordish2020['objective'], dictionary2)
# Distinct matches per project ('' when nothing matched), then raw and unique counts.
cordish2020['matches_technology'] = [list(set(found)) if found else '' for found in extracted2]
cordish2020['count_matches_technology'] = list(map(len, extracted2))
cordish2020['count_unique_matches_technology'] = [len(set(found)) for found in extracted2]

In [None]:
#cordish2020.head()

In [None]:
# Total matches per project = group-name matches + technology-term matches.
cordish2020['count_matches'] = cordish2020['count_matches_group'].add(
    cordish2020['count_matches_technology']
)
# Keep only the projects with at least one technology-term match.
has_technology_match = cordish2020['count_matches_technology'] > 0
cordish2020matches = cordish2020.loc[has_technology_match]

In [None]:
cordish2020matches.columns

In [None]:
len(cordish2020matches)

In [None]:
print(len(cordish2020matches)/len(cordish2020))

In [None]:
#this is the list we need to compare with the probabilistic
cordish2020matches.id[:20]

In [None]:
cordish2020matches.sort_values('count_matches', ascending=False).head(10)

There is one hypothesis here:  
The more matches a project has, the more accurately technology terms are detected in its objective text.  
This could be checked with statistical sampling: for each match count, draw a sample and verify the accuracy manually.  
http://www.marknagelberg.com/using-python-to-figure-out-sample-sizes-for-your-study/

EDA
1. Group by month and count over the time
2. Group by month and sum the total cost over the time
3. Group by country coordinator and count

In [None]:
# Number of matched projects per start year.
start_year = cordish2020matches['startDate'].map(lambda stamp: stamp.year)
dfy = cordish2020matches.groupby(start_year)['id'].count()

In [None]:
dfy.plot()

In [None]:
# Total EC contribution per start year. The funding column is selected before
# aggregating so only that numeric column is summed; summing the whole frame
# would also try to add up the text, list and date columns.
dfc = cordish2020matches.groupby(cordish2020matches['startDate'].map(lambda x: x.year))['ecMaxContribution'].sum()

In [None]:
dfc.plot()

In [None]:
# Number of matched projects per coordinator, most active first.
df_c = (
    cordish2020matches
    .groupby(['coordinator', 'coordinatorCountry'])['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
df_c.columns = ['coordinator', 'coordinatorCountry', 'count']

In [None]:
df_c.head(20)

In [None]:
# Total EC contribution per coordinator, largest first. The funding column is
# selected before aggregating so only that numeric column is summed; summing
# the whole frame would also try to add up the text, list and date columns.
df_c = (
    cordish2020matches
    .groupby(['coordinator', 'coordinatorCountry'])['ecMaxContribution']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
df_c.columns = ['coordinator','coordinatorCountry','sum']

In [None]:
df_c.head(20)

In [None]:
def get_terms_matrix(vector_of_terms, all_terms):
    """Build a 0/1 indicator table of which terms each row contains.

    Parameters
    ----------
    vector_of_terms : iterable of iterables
        One collection of matched terms per row. An empty collection (or the
        empty string) produces an all-zero row.
    all_terms : sequence
        The vocabulary; column ``k`` of the result corresponds to
        ``all_terms[k]``. Terms must be hashable.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``vector_of_terms`` and ``len(all_terms)``
        integer-labelled columns, holding 1 where the row contains the term
        and 0 otherwise. Terms that are not in ``all_terms`` are ignored.
    """
    # Map each term to every position it occupies so a lookup replaces the
    # scan over the whole vocabulary (duplicates in all_terms are all flagged).
    positions = {}
    for idx, term in enumerate(all_terms):
        positions.setdefault(term, []).append(idx)

    width = len(all_terms)
    rows = []
    for vector in vector_of_terms:
        indicator = [0] * width
        for v in vector:
            for idx in positions.get(v, ()):
                indicator[idx] = 1
        rows.append(indicator)
    return pd.DataFrame(rows)

In [None]:
# Vocabulary used for the indicator matrix: every technology-group name plus
# every individual technology term, de-duplicated.
all_technology_terms = list({*categories['technology_group'], *technology_term})

In [None]:
len(all_technology_terms)

In [None]:
all_technology_terms[:10]

In [None]:
# Build one 0/1 indicator table per match column. Each table has one column
# per entry of all_technology_terms (the author expected 225 of them).
matches_technology_table = get_terms_matrix(list(cordish2020matches['matches_technology']), all_technology_terms)
matches_group_table = get_terms_matrix(list(cordish2020matches['matches_group']), all_technology_terms)

In [None]:
# Merge the two indicator tables. A term found both as a group name and as a
# technology term would add up to 2, so cap every cell at 1 to keep the table
# a true 0/1 indicator and make the row sum a count of *distinct* terms.
matches_table = (matches_group_table + matches_technology_table).clip(upper=1)
matches_table.columns = all_technology_terms
matches_table['number_unique_terms'] = matches_table.sum(axis=1)

In [None]:
matches_table.head()

In [None]:
len(matches_table) == len(cordish2020matches)

In [None]:
# Put the project rows and their term-indicator rows side by side, then order
# by total matches and number of distinct terms (both descending).
# NOTE(review): neither reset_index() passes drop=True, so each call keeps the
# previous index as an extra leading column; positional column offsets used
# on this frame depend on those extra columns — confirm before changing.
full_table_tech_matches = pd.concat([cordish2020matches.reset_index(), matches_table], axis=1).sort_values(['count_matches','number_unique_terms'], ascending=False).reset_index()

In [None]:
len(full_table_tech_matches)

In [None]:
import seaborn as sns
sns.distplot(full_table_tech_matches['number_unique_terms'])

In [None]:
sns.distplot(full_table_tech_matches['count_matches'])

In [None]:
full_table_tech_matches.groupby('count_matches').count()['number_unique_terms']

In [None]:
#for i, j in zip(full_table_tech_matches[full_table_tech_matches['count_matches'] ==2]['matches_technology'],full_table_tech_matches[full_table_tech_matches['count_matches'] ==2]['matches_group']):
#    print(i, j)

In [None]:
#full_table_tech_matches.columns[:35]#.head()

In [None]:
#for i in full_table_tech_matches.columns:
#    print(i)
#count_matches: each time a term in the glossary of 225 terms appear
#number_unique_terms: 

In [None]:
full_table_tech_matches[['id','acronym','title','objective','startDate','endDate','ecMaxContribution','matches_group','matches_technology','number_unique_terms','count_matches']].to_csv('../data/tech.csv')

In [None]:
# Transpose the term-indicator columns so each row is one term and each column
# one project. The columns are selected by name rather than by a fixed
# positional offset, so the slice stays correct if columns are added or
# removed upstream.
termsT = full_table_tech_matches[all_technology_terms].T
# Number of projects in which each term was found.
termsT['sum'] = termsT.sum(axis=1)

In [None]:
#termsT.sort_values('sum', ascending=False)

In [None]:
len(termsT[termsT['sum']>0])/len(termsT)

In [None]:
#all_terms

In [None]:
#data = pd.read_csv('../data/tech.csv')

In [2]:
#len(data)

In [1]:
#data.tail(30)

---
### Syntactic probabilistic classification using spacy and fuzzywuzzy

In [None]:
import spacy
import numpy as np
from spacy import displacy
from collections import Counter
import en_core_web_sm
from fuzzywuzzy import process, fuzz
import re
nlp = en_core_web_sm.load()
import textdistance as tx
import unicodedata
from textdistance.algorithms import vector_based

In [None]:
def clean(string):
    """Return a hyphen-separated, punctuation-free version of ``string``.

    Steps, in order: NFKD-normalise, strip surrounding whitespace, drop every
    character that is not a word character, whitespace or a hyphen, then
    collapse each run of whitespace and/or hyphens into a single hyphen.
    """
    normalized = unicodedata.normalize('NFKD', string).strip()
    without_punctuation = re.sub(r'[^\w\s-]', '', normalized)
    return re.sub(r'[-\s]+', '-', without_punctuation)

Objective example

---

In [None]:
cordish2020.loc[13]['objective']

In [None]:
text = cordish2020['objective'][13]
doc = nlp(text)
docs = list(map(str, doc.noun_chunks))
docs[:6]

---

**Get all the synonyms from the short descriptions**

In [None]:
technologies = all_technology_terms

In [None]:
len(technologies)

In [None]:
def find_best_matching_tech(techs, doc):
    """Score every technology term against ``doc`` and pick the best ones.

    For each term in ``techs`` the function calls
    ``process.extract(term, doc, limit=1, scorer=fuzz.ratio)`` and reduces the
    returned ``(choice, score)`` pairs to one ``(joined_choices, mean_score)``
    pair.

    Parameters
    ----------
    techs : iterable
        Technology terms to look for.
    doc : collection
        Candidate strings handed to ``process.extract`` as the choices
        (the callers in this notebook pass a list of noun-chunk strings).

    Returns
    -------
    best_matches : list of (str, number)
        One ``(joined_choices, mean_score)`` pair per term, in ``techs`` order.
    best_tech : list
        The term(s) with the highest mean score (ties included). When fewer
        than three pairs were collected for the top score, ``val2`` is
        appended as well; it is ``""`` if no runner-up was recorded.
    best1 : list of (str, number)
        The pairs corresponding to ``best_tech``.
    """
    best_matches = []
    # Sentinel entries with score 0 so the first comparison always works.
    best1 = [("", 0)]
    best2 = [("", 0)]
    val2=""
    best_tech = []
    for val in techs:
        best_curs = process.extract(val, doc, limit=1, scorer=fuzz.ratio)
        # Join the matched choices with commas (the trailing comma is cut below).
        terms = ""
        for cat in best_curs:
            terms = terms + cat[0] + "," 
        terms = terms[:-1]
        # Mean score of the returned matches; 0 when nothing was returned.
        if len(best_curs)==0:
            avg = 0
        else:
            avg = sum(i for _, i in best_curs)/float(len(best_curs))
        best_cur = [(terms, avg)]
        best_matches.extend(best_cur)
        if best_cur[0][1] > best1[0][1]:
            # New top score: restart the list of winners.
            # NOTE(review): the previous winner is not demoted to best2 here,
            # so the runner-up only reflects terms seen after the final winner
            # took the lead (or that lost to an earlier one) — confirm intent.
            best1 = best_cur
            best_tech = [val]
        elif best_cur[0][1] == best1[0][1]:
            # Tie with the current top score: keep both.
            best1.extend(best_cur)
            best_tech.append(val)
        else:
            # Lower score: remember it as runner-up while there are < 3 winners.
            if best_cur[0][1] > best2[0][1] and len(best1)<3:
                best2 = best_cur
                val2 = val
    # With fewer than three winners, add the runner-up (possibly the sentinel).
    if len(best1)<3:            
        best1.extend(best2)
        best_tech.append(val2)
    return best_matches, best_tech, best1

In [None]:
#process.extract(each tech term, each noun chunk in each objective)

In [None]:
# Label every project with the technology term(s) whose best fuzzy match
# against the objective's noun chunks scores above the threshold; projects
# without a good enough match get the string 'None'.
MATCH_THRESHOLD = 77  # the rounded best score must exceed this value
category_labels = []
for text in cordish2020['objective']:
    docs = list(map(str, nlp(text).noun_chunks))
    _, best_tech, best_match = find_best_matching_tech(technologies, docs)
    if round(best_match[0][1]) > MATCH_THRESHOLD:
        category_labels.append(" ".join(best_tech))
    else:
        category_labels.append('None')
# Build the labels in a list and assign the column once. Writing through
# cordish2020['category'][idx] = ... is chained assignment, which pandas does
# not guarantee to update the frame, and a float placeholder column would
# have to be overwritten with strings.
cordish2020['category'] = category_labels

In [None]:
#category is the column to store the best matches with technology

In [None]:
cordish2020['category'].value_counts()['None']

In [None]:
cordish2020matches2 = cordish2020[cordish2020['category'] != "None" ]

In [None]:
print(len(cordish2020matches2)/len(cordish2020))

In [None]:
cordish2020matches2.id[:20]

In [None]:
cordish2020matches2.head(20)

In [None]:
text = cordish2020['objective'][13]
doc = nlp(text)
#doc = " ".join([token.lemma_ for token in doc])
#doc = nlp(doc)
docs = list(map(str, doc.noun_chunks))
_, best_tech, best_match = find_best_matching_tech(technologies, docs)
print(best_tech,best_match)