In [2]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 16.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 57.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 33.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYA

In [11]:
import os

import numpy as np
from sentence_transformers import SentenceTransformer

In [46]:
# Pretrained sentence-transformers checkpoint behind the `embedder.encode` calls in this notebook.
MODEL_NAME = 'paraphrase-multilingual-mpnet-base-v2'
embedder = SentenceTransformer(MODEL_NAME)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

In [62]:
# French keyword dictionaries, one list per ESG pillar.
# The strings are written without accents, exactly as they are fed to the encoder.

# E: environmental keywords.
E_dic = [
    "carbone", "durable", "pollution", "empreinte", "gaspillage",
    "co2", "energie fossile", "contamination", "environnement",
]

# S: social keywords.
S_dic = [
    "sante", "securite", "carriere", "donation", "diversite",
    "ecart salarial", "ecart revenus", "satisfaction", "union", "employe",
    "licenciements", "genre", "femme", "discrimination", "accidents",
    "dommages", "droits de l’homme", "egalite", "corruption", "alcool",
    "drogues", "sexe", "tabac", "armes", "salaire",
]

# G: governance keywords.
G_dic = [
    "gouvernance", "politique", "formation", "talents", "carriere",
    "board", "corporate", "conseil administration", "comite", "nomination",
    "executif", "audit", "structure", "performance", "experience",
    "PDG", "DG", "Directeur", "compte-rendu", "reunions",
    "CEO", "President", "election", "senior", "revenus",
    "remuneration", "actionnaires", "chef", "vote", "directeur",
    "gouvernement", "parachute dore", "confidentiel", "CSR", "investissement",
]

In [63]:
# Embed every environmental keyword, then average over the keyword axis
# to obtain a single centroid vector for the E pillar.
E_embeddings = np.array(embedder.encode(E_dic))
E_vector = E_embeddings.mean(axis=0)

In [64]:
# Embed every social keyword, then average over the keyword axis
# to obtain a single centroid vector for the S pillar.
S_embeddings = np.array(embedder.encode(S_dic))
S_vector = S_embeddings.mean(axis=0)

In [65]:
# Embed every governance keyword, then average over the keyword axis
# to obtain a single centroid vector for the G pillar.
G_embeddings = np.array(embedder.encode(G_dic))
G_vector = G_embeddings.mean(axis=0)

In [66]:
# Small hand-written French test set: a mix of ESG-flavoured and unrelated sentences.
sentences = [
    "L'usage du carbone est élevé",
    "Ma couleur préférée est le rouge",
    "J'aime bien peindre des portraits",
    "J'ai voyagé une fois en Allemagne",
    "Les salaires dans cette entreprise sont plutôt bas",
    "L'entreprise couvre nos factures d'assurance maladie",
    "Voici 10 choses à considérer avant de choisir un emploi",
]

In [67]:
# Encode all test sentences in one call and keep the result as an array
# with one row per sentence.
sentence_vectors = embedder.encode(sentences)
embedding_space = np.array(sentence_vectors)
# Last expression of the cell: displays (number of sentences, embedding size).
embedding_space.shape

(7, 768)

In [68]:
# Possible Measures:

#The cosine similarity:
def unit_vector(vector):
    """Return `vector` rescaled to unit Euclidean (L2) length.

    Parameters
    ----------
    vector : array_like
        Input vector.

    Returns
    -------
    numpy.ndarray
        `vector` divided by its norm. A vector whose norm is zero has no
        direction, so it is returned as zeros instead of NaN.
    """
    vector = np.asarray(vector)
    norm = np.linalg.norm(vector)
    # Guard the division: 0 / 0 would yield NaN and a RuntimeWarning.
    safe_norm = norm if norm != 0 else 1.0
    return vector / safe_norm


def cosine_similarity(v1, v2):
    """Return the cosine of the angle between `v1` and `v2`.

    Both vectors are normalised with `unit_vector` and then dotted, so the
    result lies in [-1, 1]. If either vector has zero norm the similarity
    is 0.0 rather than NaN.
    """
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    return np.dot(v1_u, v2_u)

# Euclidean distance:
def euc_distance(v1, v2):
    """Return the Euclidean (L2) distance between `v1` and `v2`."""
    difference = v1 - v2
    return np.linalg.norm(difference)

In [69]:
# Score every sentence embedding against the environmental (E) centroid.
# Cosine similarity: a larger value means a closer direction to E_vector.
E_sim = [cosine_similarity(vec, E_vector) for vec in embedding_space]

# Euclidean distance: a smaller value means the embedding lies nearer to E_vector.
E_euc = [euc_distance(vec, E_vector) for vec in embedding_space]

# Report: sentence, cosine similarity, distance, then a blank separator.
for idx, text in enumerate(sentences):
    print(text)
    print(E_sim[idx])
    print(E_euc[idx])
    print("\n")

L'usage du carbone est élevé
0.6912909
2.0152438


Ma couleur préférée est le rouge
0.19431779
3.513558


J'aime bien peindre des portraits
0.12549186
3.2767637


J'ai voyagé une fois en Allemagne
0.1531995
3.392332


Les salaires dans cette entreprise sont plutôt bas
0.14210193
3.1933975


L'entreprise couvre nos factures d'assurance maladie
0.12898466
3.1346362


Voici 10 choses à considérer avant de choisir un emploi
0.2058077
2.979122




In [70]:
# Score every sentence embedding against the social (S) centroid.
# Cosine similarity: a larger value means a closer direction to S_vector.
S_sim = [cosine_similarity(vec, S_vector) for vec in embedding_space]

# Euclidean distance: a smaller value means the embedding lies nearer to S_vector.
S_euc = [euc_distance(vec, S_vector) for vec in embedding_space]

# Report: sentence, cosine similarity, distance, then a blank separator.
for idx, text in enumerate(sentences):
    print(text)
    print(S_sim[idx])
    print(S_euc[idx])
    print("\n")

L'usage du carbone est élevé
0.28355354
2.7370152


Ma couleur préférée est le rouge
0.19417793
3.363163


J'aime bien peindre des portraits
0.19340807
2.9762962


J'ai voyagé une fois en Allemagne
0.19628355
3.1498623


Les salaires dans cette entreprise sont plutôt bas
0.32105872
2.7390802


L'entreprise couvre nos factures d'assurance maladie
0.30005497
2.6808252


Voici 10 choses à considérer avant de choisir un emploi
0.36974585
2.5533059




In [71]:
# Score every sentence embedding against the governance (G) centroid.
# Cosine similarity: a larger value means a closer direction to G_vector.
G_sim = [cosine_similarity(vec, G_vector) for vec in embedding_space]

# Euclidean distance: a smaller value means the embedding lies nearer to G_vector.
G_euc = [euc_distance(vec, G_vector) for vec in embedding_space]

# Report: sentence, cosine similarity, distance, then a blank separator.
for idx, text in enumerate(sentences):
    print(text)
    print(G_sim[idx])
    print(G_euc[idx])
    print("\n")

L'usage du carbone est élevé
0.28811413
2.7568743


Ma couleur préférée est le rouge
0.15089221
3.4527555


J'aime bien peindre des portraits
0.1792644
3.0298705


J'ai voyagé une fois en Allemagne
0.18680032
3.1933768


Les salaires dans cette entreprise sont plutôt bas
0.25194407
2.8648825


L'entreprise couvre nos factures d'assurance maladie
0.32312956
2.6714914


Voici 10 choses à considérer avant de choisir un emploi
0.36955035
2.57267




In [74]:
import tweepy as tw
import pandas as pd

In [75]:
#English:
# English counterparts of the French ESG keyword lists.
# These assignments rebind E_dic / S_dic / G_dic, so any centroid computed
# from the French lists must be recomputed before it reflects these words.

# E: environmental keywords.
E_dic = ["carbon", "sustainable", "pollution",
         "footprint", "waste", "co2", "fossil energy",
         "contamination", "environment"]

# S: social keywords.
# "layoffs" is the translation of the French "licenciements"; the earlier
# "licenses" was a mistranslation that pulled the centroid off-topic.
S_dic = ["health", "safety", "career",
         "donation", "diversity", "wage gap",
         "income gap", "satisfaction", "union", "employee",
         "layoffs", "gender", "woman", "discrimination",
         "accidents", "damage", "human rights", "equality",
         "corruption", "alcohol", "drugs", "sex", "tobacco", "weapons",
         "salary"]

# G: governance keywords.
# NOTE(review): "CEO" appears twice (the French list had both "PDG" and "CEO"),
# which double-weights it in an averaged centroid, and "DG" is left
# untranslated -- confirm whether both are intended.
G_dic = ["governance", "policy", "training", "talents",
         "career", "board", "corporate", "board of directors",
         "committee", "appointment", "executive", "audit", "structure",
         "performance", "experience", "CEO", "DG", "Director",
         "minutes", "meetings", "CEO", "Chairman",
         "election", "senior", "income", "remuneration", "shareholders",
         "leader", "vote", "director", "government", "golden parachute",
         "confidential", "CSR", "investment"]

In [76]:
# Twitter API credentials are read from the environment so that no secret is
# stored in the notebook. Set TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
# TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET before starting the
# kernel; a missing variable raises KeyError naming the variable.
# SECURITY: the literal keys that used to be hardcoded in this cell must be
# treated as compromised -- revoke and regenerate them.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

In [77]:
# Build the OAuth handler from the application (consumer) credentials ...
auth = tw.OAuthHandler(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
)
# ... attach the user-level access token pair ...
auth.set_access_token(access_token, access_token_secret)
# ... and wrap it in the API client used for the requests below.
api = tw.API(auth)

In [78]:
# Cursor over English-language search results for the "#Tesla" hashtag.
# tweet_mode="extended" is requested because the text is later read from `full_text`.
tesla_search = tw.Cursor(
    api.search,
    q="#Tesla",
    lang="en",
    tweet_mode="extended",
)
# Iterator capped at 1000 items.
tweets = tesla_search.items(1000)

<tweepy.cursor.ItemIterator at 0x7fe2950e2590>

In [79]:
# Accumulate the text of each item produced by the `tweets` iterator into a list.
tweet =[]
for i in tweets :
  # `full_text` is the attribute read from each item of the iterator.
  a = i.full_text
  tweet.append(a)