# OpenAI Word Embeddings, Semantic Search




## Installing the OpenAI

In [None]:
!pip install openai -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m71.7/73.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## Importing the OpenAI API key and some of the essential libraries

In [None]:
import openai
import pandas as pd
import numpy as np
from getpass import getpass

openai.api_key = getpass()

··········


## Importing the emails body dataset




In [None]:
df = pd.read_csv('words.csv')
print(df)

                                            text_clean
0                                             forecast
1    traveling business meeting take fun trip espec...
2                               test successful way go
3    randycan send schedule salary level everyone t...
4          greghow either next tuesday thursdayphillip
..                                                 ...
485  pattithis sound like opportunity land couple a...
486  would support matt lenharts promotion next lev...
487  nickthere specific program using recruit train...
488  forwarded phillip k allenhouect 01252001enron ...
489  lucy32 29 fine28 paid weekly 15 switched biwee...

[490 rows x 1 columns]


## Some Feature Engineering over here

In [None]:
df['text_clean'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 490 entries, 0 to 489
Series name: text_clean
Non-Null Count  Dtype 
--------------  ----- 
489 non-null    object
dtypes: object(1)
memory usage: 4.0+ KB


In [None]:
df['text_clean'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 490 entries, 0 to 489
Series name: text_clean
Non-Null Count  Dtype 
--------------  ----- 
490 non-null    object
dtypes: object(1)
memory usage: 4.0+ KB


In [None]:
df['text_clean']=df['text_clean'].astype(str)

## Calculating the Embeddings

In [None]:
from openai.embeddings_utils import get_embedding

df['embedding'] = df['text_clean'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
df.to_csv('word_embeddings.csv')

# Semantic Search

Semantic searching focuses on closeness of meaning, allowing it to match not just the exact query but also the meaning expressed in that specific query, setting it apart from traditional text searching methods.

## Importing the downloaded embedding for email body

In [None]:
df = pd.read_csv('word_embeddings.csv')
df['embedding'] = df['embedding'].apply(eval).apply(np.array)
df

Unnamed: 0.1,Unnamed: 0,text_clean,embedding
0,0,forecast,"[-0.002978444332256913, -0.029137851670384407,..."
1,1,traveling business meeting take fun trip espec...,"[-0.016611222177743912, -0.023307325318455696,..."
2,2,test successful way go,"[-0.011441954411566257, 0.013317566365003586, ..."
3,3,randycan send schedule salary level everyone t...,"[-0.02002922259271145, -0.01722276769578457, 0..."
4,4,greghow either next tuesday thursdayphillip,"[-0.026682931929826736, -0.0004355040728114545..."
...,...,...,...
485,485,pattithis sound like opportunity land couple a...,"[-0.033775750547647476, -0.032047294080257416,..."
486,486,would support matt lenharts promotion next lev...,"[-0.018882956355810165, 0.008640885353088379, ..."
487,487,nickthere specific program using recruit train...,"[-0.032885294407606125, -0.03331460803747177, ..."
488,488,forwarded phillip k allenhouect 01252001enron ...,"[-0.014968602918088436, -0.021389560773968697,..."


## Now we are giving the prompt over here that is "suspicious and unusual emails" and we have freedom to change this prompt accordingly as per the usecase that we have in hand

Like for detecting unusual emails we can also give the following prompts
1. Fraud related emails
2. business related emails
3. meeting related emails
4. vendor related emails
5. employee related emails
etc

In [None]:
search_term = input('Enter a search term: ')

Enter a search term: suspicious and unusual emails


## Now that we have a search term, let's calculate an embedding or vector for that search term using the OpenAI get_embedding function.

In [None]:
# semantic search
search_term_vector = get_embedding(search_term, engine="text-embedding-ada-002")
search_term_vector

[-0.010488229803740978,
 -0.02903611771762371,
 -0.007859500125050545,
 -0.01613265834748745,
 -0.00188981625251472,
 0.024205660447478294,
 -0.03391994908452034,
 -0.014918372966349125,
 0.0021400125697255135,
 -0.01860126294195652,
 0.014824965968728065,
 -0.011469000019133091,
 -0.009874414652585983,
 0.014664840884506702,
 -0.004333400167524815,
 0.01695997454226017,
 0.024979600682854652,
 0.0016171023016795516,
 0.010474885813891888,
 -0.009760992601513863,
 -0.02022920735180378,
 0.026500795036554337,
 -0.01740032061934471,
 0.000738913135137409,
 -0.014824965968728065,
 -0.01881476305425167,
 0.009253928437829018,
 -0.020122457295656204,
 -0.03456045314669609,
 -0.0052941543981432915,
 -0.014918372966349125,
 -0.0049572233110666275,
 -0.021510211750864983,
 0.0034960766788572073,
 -0.0214835237711668,
 0.010361463762819767,
 0.022791218012571335,
 -0.000842327659483999,
 0.009674257598817348,
 -0.006031399592757225,
 0.016092628240585327,
 0.020375989377498627,
 -0.014224494807

## Once we have a vector representing that word, we can see how similar it is to other words in our dataframe by calculating the cosine similarity of our search term's word vector to each word embedding in our dataframe.

In [None]:
from openai.embeddings_utils import cosine_similarity

df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector))

df

Unnamed: 0.1,Unnamed: 0,text_clean,embedding,similarities
0,0,forecast,"[-0.002978444332256913, -0.029137851670384407,...",0.761429
1,1,traveling business meeting take fun trip espec...,"[-0.016611222177743912, -0.023307325318455696,...",0.739294
2,2,test successful way go,"[-0.011441954411566257, 0.013317566365003586, ...",0.735613
3,3,randycan send schedule salary level everyone t...,"[-0.02002922259271145, -0.01722276769578457, 0...",0.738082
4,4,greghow either next tuesday thursdayphillip,"[-0.026682931929826736, -0.0004355040728114545...",0.736345
...,...,...,...,...
485,485,pattithis sound like opportunity land couple a...,"[-0.033775750547647476, -0.032047294080257416,...",0.757469
486,486,would support matt lenharts promotion next lev...,"[-0.018882956355810165, 0.008640885353088379, ...",0.713095
487,487,nickthere specific program using recruit train...,"[-0.032885294407606125, -0.03331460803747177, ...",0.746204
488,488,forwarded phillip k allenhouect 01252001enron ...,"[-0.014968602918088436, -0.021389560773968697,...",0.775133


# Sorting By Similarity

Now that we have calculated the similarities to each term in our dataframe, we simply sort the similarity values to find the terms that are most similar to the term we searched for. Notice how the foods are most similar to "hot dog". Not only that, it puts fast food closer to hot dog. Also some colors are ranked closer to hot dog than others. Let's go back and try the word "yellow" and walk through the results.

In [None]:
df.sort_values("similarities", ascending=False).head(20)

Unnamed: 0.1,Unnamed: 0,text_clean,embedding,similarities
263,263,maryreceived email 17 message please tryagainp...,"[-0.04584462568163872, -0.009893225505948067, ...",0.821358
265,265,mary write stage miss sent 2 emailsmaybe mary ...,"[-0.028678998351097107, -0.023966487497091293,...",0.81691
402,402,01 attachment free virus scan mail forwarded p...,"[-0.02215365506708622, -0.011948964558541775, ...",0.807583
204,204,stephanecan create email list distribute repor...,"[-0.022787166759371758, -0.0019001166801899672...",0.8034
180,180,dawni received email pls please continue send ...,"[-0.030858663842082024, -0.014560838229954243,...",0.801897
373,373,lucysomehow email account lost rentroll sent t...,"[-0.02481573447585106, -0.022108813747763634, ...",0.800594
86,86,lucyi got email attachment let work together t...,"[-0.03583185747265816, 0.00372899672947824, 0....",0.800235
77,77,frankermisenroncom jayreitmeyerenroncom forwar...,"[-0.017804693430662155, -0.0009674206376075745...",0.798474
476,476,forwarded phillip k allenhouect 01302001susan ...,"[-0.011269974522292614, -0.007798149716109037,...",0.797609
416,416,forwarded phillip k allenhouect 02212001yoderh...,"[-0.019259801134467125, 0.001299341325648129, ...",0.79704


## The following email bodies can be considered as suspicious

In [None]:
Top5=df.sort_values("similarities", ascending=False).head(5)

In [None]:
Top5['text_clean'][263]

'maryreceived email 17 message please tryagainphillip'

In [None]:
Top5['text_clean'][265]

'mary write stage miss sent 2 emailsmaybe mary stalking gary'

In [None]:
Top5['text_clean'][402]

'01 attachment free virus scan mail forwarded phillip k allenhouect 0305200101 attachment free virus scan mailsorry deadline passed enrons deal yesterdaywillbe included survey01attachment free virus scan mailwe send evening calc book probably aroundanne01 attachment free virus scan mailanneare planning send today bidweek deal soon need knowwhetherto transfer everything data basethanksliane kucher2023832147'

In [None]:
Top5['text_clean'][204]

'stephanecan create email list distribute report everyday thewest deskor put common drive report listphillip allenmike grigsbykeith holstfrank ermissteve southjanie tholttory kuykendallmatt lenhartrandy gaythanksphillip'

In [None]:
Top5['text_clean'][180]

'dawni received email pls please continue send dailythank youphillip'

# Adding Words Together

We can also add the two vectors over here let say we have added the following two email bodies then the result will have a semantic meaning of both the emails bodies

In [None]:
df['text_clean'][10]

'forwarded phillip k allenhouect 10092000richard burchfieldphillipbelow issue list go forward documenting therequirements consolidated physicalfinancial position transporttrade capture need focus first bullet allans listthe need single set requirement although meeting keithon wednesday informative solution creating infinitely dynamicconsolidated position screen extremely difficult timeconsuming throughout meeting wednesday keith alluded theinability get consensus amongst trader presentation theconsolidated position solution make trader canarrange position screen liking much like excel need tohappen monday 3 5 effort design desired layout theconsolidated position screen critical excludebuilding capability create flexible position presentation thefuture order create plan measured need firmrequirements also reiterate goal project projectplan consolidate physicalfinancial position transport trade capturethe issue raised capture project tothemselves need prioritised effort outside thisproje

In [None]:
df['text_clean'][19]

'think fletch good cpa still'

In [None]:
df2 = df.copy()

vector_9 = df2['embedding'][10]
vector_18 = df2['embedding'][19]

vector_9_vector_18 = vector_9 + vector_18
vector_9_vector_18

array([-0.03142461,  0.00869567,  0.00607754, ...,  0.00497611,
       -0.00257863, -0.06899126])

## Now, finding the similarity using cosine similarity

In [None]:
df2["similarities"] = df2['embedding'].apply(lambda x: cosine_similarity(x, vector_9_vector_18))
df2.sort_values("similarities", ascending=False)

Unnamed: 0.1,Unnamed: 0,text_clean,embedding,similarities
11,11,forwarded phillip k allenhouect 10092000richar...,"[-0.019025541841983795, 0.00516538554802537, 0...",0.926652
19,19,think fletch good cpa still,"[-0.012344620190560818, 0.0035061421804130077,...",0.926647
10,10,forwarded phillip k allenhouect 10092000richar...,"[-0.019079990684986115, 0.005189531948417425, ...",0.926647
214,214,forwarded phillip k allenhouect 03132000priceh...,"[-0.02599875070154667, -0.0026112592313438654,...",0.874241
418,418,johndoes next thursday 3pm fit schedule go roc...,"[-0.02813517488539219, -0.01764894835650921, -...",0.869273
...,...,...,...,...
162,162,check np gen load amw,"[-0.02001616172492504, -0.0030035388190299273,...",0.753235
66,66,richardcompare california production number 20...,"[-0.0056007131934165955, -0.010287025012075901...",0.752844
313,313,email insurance info tomorrow,"[-0.02736624702811241, -0.008114330470561981, ...",0.750491
334,334,location eb3210c,"[0.0023946897126734257, 0.00415245583280921, 0...",0.749345


# Using Emails subject


## Importing the emails subject dataset

In [None]:
EmailSubject_df = pd.read_csv('EmailSubject.csv')
EmailSubject_df

Unnamed: 0,Subject
0,Re: FW: fixed forward or other Collar floor ga...
1,Westgate
2,Re: Not business related..
3,Re: Original Sept check/closing
4,San Juan Index
...,...
671,Have tax problems?\n ...
672,"IDRC Texas, World Congress - Chapter Reception..."
673,Metro Briefs & Inside Real Estate
674,daily charts and matrices as hot links 10/5


## Taking care of missing values is present over here

In [None]:
EmailSubject_df.isnull().sum()

Subject         0
embedding       0
similarities    0
dtype: int64

## Computing the embeddings for the emails subjects over here using OpenAI

In [None]:
EmailSubject_df['embedding'] = EmailSubject_df['Subject'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
EmailSubject_df.to_csv('Subject-Embeddings.csv')

## Giving the prompt for performing semantic searching in the emails subjects over here

In [None]:
Subject_search = input("Search Subject for a sentence:")

Search earnings for a sentence:Suspicious Activity: Take Immediate Action to Secure Your Account


In [None]:
Subject_search_vector = get_embedding(Subject_search, engine="text-embedding-ada-002")
Subject_search_vector

[-0.00983609352260828,
 -0.019030561670660973,
 0.006865374743938446,
 -0.034083057194948196,
 -0.036906201392412186,
 0.03757349029183388,
 -0.035597290843725204,
 9.067948849406093e-05,
 0.026588890701532364,
 -0.023278111591935158,
 0.030002329498529434,
 -0.01596360094845295,
 -0.004671021364629269,
 0.010432803072035313,
 0.0076674045994877815,
 -0.003856159280985594,
 0.03862575441598892,
 -0.0032000988721847534,
 0.017465000972151756,
 -0.022084692493081093,
 -0.026242414489388466,
 0.016207417473196983,
 -0.01618175208568573,
 -0.009515280835330486,
 -0.006262248381972313,
 -0.024291878566145897,
 -0.0018575009889900684,
 -0.028975732624530792,
 0.00048482700367458165,
 -0.02483084239065647,
 0.0014645063783973455,
 -0.003791996743530035,
 0.0001927879056893289,
 0.003320403164252639,
 -0.004690269939601421,
 0.033723749220371246,
 0.019351374357938766,
 -0.02154572866857052,
 -0.012505248188972473,
 0.0027301092632114887,
 0.026768546551465988,
 0.02743583545088768,
 -0.009258

## Finding the similarity using the cosine similarity

In [None]:
EmailSubject_df["similarities"] = EmailSubject_df['embedding'].apply(lambda x: cosine_similarity(x, Subject_search_vector))

EmailSubject_df


Unnamed: 0,Subject,embedding,similarities
0,Re: FW: fixed forward or other Collar floor ga...,"[-0.01031581498682499, -0.001963562099263072, ...",0.695695
1,Westgate,"[-0.0059119402430951595, -0.012103075161576271...",0.721306
2,Re: Not business related..,"[-0.005840612575411797, -0.02709035947918892, ...",0.756154
3,Re: Original Sept check/closing,"[-0.018633292987942696, 0.0014383376110345125,...",0.741414
4,San Juan Index,"[8.525636076228693e-05, 0.013261760585010052, ...",0.718881
...,...,...,...
671,Have tax problems?\n ...,"[-0.007703292183578014, -0.013972178101539612,...",0.760107
672,"IDRC Texas, World Congress - Chapter Reception...","[-0.006740477401763201, 0.00010688947077142075...",0.705320
673,Metro Briefs & Inside Real Estate,"[-0.013395565561950207, -0.01418998558074236, ...",0.737012
674,daily charts and matrices as hot links 10/5,"[-0.033177152276039124, -0.013716374523937702,...",0.715447


## Sorting the subjects in descending order over here to see the ranking

In [None]:
EmailSubject_df.sort_values("similarities", ascending=False)

Unnamed: 0,Subject,embedding,similarities
529,Re: credit card,"[-0.02997935563325882, -0.0005494933575391769,...",0.791385
566,Re: 24x7 Emergency Services,"[0.0032429818529635668, -0.0035821059718728065...",0.789261
32,Re: Notices,"[-0.030852068215608597, -0.004822312388569117,...",0.787968
247,TECH ALERT: New-Media Update,"[-0.02872360683977604, 0.007840655744075775, 0...",0.785658
122,Re: Court Ordered Notice to Customers and Regi...,"[-0.023654503747820854, -0.007149614859372377,...",0.785490
...,...,...,...
151,Re: History of Lime and Cement,"[0.009560145437717438, 0.004758685361593962, 0...",0.684735
152,History of Lime and Cement,"[0.022018982097506523, 0.007863922044634819, 0...",0.681374
153,History of Lime and Cement,"[0.022018982097506523, 0.007863922044634819, 0...",0.681374
399,wv love story,"[-0.0032443280797451735, -0.009089140221476555...",0.675949


In [55]:
Top5_subjects=EmailSubject_df.sort_values("similarities", ascending=False).head(5)

In [56]:
Top5_subjects.head()

Unnamed: 0,Subject,embedding,similarities
529,Re: credit card,"[-0.02997935563325882, -0.0005494933575391769,...",0.791385
566,Re: 24x7 Emergency Services,"[0.0032429818529635668, -0.0035821059718728065...",0.789261
32,Re: Notices,"[-0.030852068215608597, -0.004822312388569117,...",0.787968
247,TECH ALERT: New-Media Update,"[-0.02872360683977604, 0.007840655744075775, 0...",0.785658
122,Re: Court Ordered Notice to Customers and Regi...,"[-0.023654503747820854, -0.007149614859372377,...",0.78549


## Our Prompt : Suspicious Activity: Take Immediate Action to Secure Your Account

## Finding the top 5 results

In [58]:
Top5_subjects['Subject'][529]

'Re: credit card'

In [59]:
Top5_subjects['Subject'][566]

'Re: 24x7 Emergency Services'

In [60]:
Top5_subjects['Subject'][32]

'Re: Notices'

In [61]:
Top5_subjects['Subject'][247]

'TECH ALERT: New-Media Update'

In [62]:
Top5_subjects['Subject'][122]

'Re: Court Ordered Notice to Customers and Registered Users of\n living. com Regarding Sale of Information'