In [None]:
import pandas as pd
import ast
from sklearn.ensemble import IsolationForest

In [None]:
# Read the CSV that holds the cleaned email text and its embedding into a DataFrame.
csv_file_path = 'word_embeddings_body.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [None]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,text_clean,embedding
0,0,forecast,"[-0.002978444332256913, -0.029137851670384407,..."
1,1,traveling business meeting take fun trip espec...,"[-0.016611222177743912, -0.023307325318455696,..."
2,2,test successful way go,"[-0.011441954411566257, 0.013317566365003586, ..."
3,3,randycan send schedule salary level everyone t...,"[-0.02002922259271145, -0.01722276769578457, 0..."
4,4,greghow either next tuesday thursdayphillip,"[-0.026682931929826736, -0.0004355040728114545..."


In [None]:
# The 'embedding' column is read from the CSV as text such as "[-0.0029, -0.0291, ...]".
# Parse each string into a Python list of numbers.
# Values that are already parsed (not str) are passed through unchanged, so this
# cell can be re-run without ast.literal_eval raising ValueError on a list.
df['embedding'] = df['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Pull the parsed vectors out of the frame as a plain Python list (one entry per row);
# this list is what gets handed to the model in the cells below.
embeddings = df['embedding'].to_list()

In [50]:
# Create the Isolation Forest model.
# random_state is pinned because the forest is built from random sub-samples and
# random splits; without it every kernel restart produces different anomaly scores,
# so a fixed score threshold would flag different rows on each run.
# NOTE(review): contamination only sets the cut-off used by predict()/decision_function();
# it does not change score_samples(), which is what this notebook thresholds manually.
isolation_forest = IsolationForest(contamination=0.05, random_state=42)  # Adjust contamination based on your data


In [51]:
# Train the forest on the full list of embedding vectors.
isolation_forest.fit(X=embeddings)

In [52]:
# Score every embedding with the fitted forest: one score per input row.
# Per the scikit-learn docs, lower scores mean more abnormal samples.
anomaly_scores = isolation_forest.score_samples(X=embeddings)


In [53]:
# Display the raw score array for a quick visual check of its range.
anomaly_scores

array([-0.46813892, -0.44683136, -0.44731273, -0.43783666, -0.43259021,
       -0.42031463, -0.48025548, -0.42119029, -0.42529053, -0.43346462,
       -0.43224631, -0.43224631, -0.47586686, -0.44930949, -0.41360623,
       -0.44886073, -0.43563738, -0.4106636 , -0.42209594, -0.46505123,
       -0.42104459, -0.42474715, -0.4435004 , -0.4354558 , -0.43747778,
       -0.41163131, -0.41747092, -0.46954998, -0.41053945, -0.40383528,
       -0.43484691, -0.46236845, -0.41754613, -0.41472934, -0.45084025,
       -0.42802953, -0.43381513, -0.43839206, -0.45506123, -0.41141602,
       -0.44900457, -0.4508678 , -0.4356752 , -0.45007917, -0.42148689,
       -0.43838941, -0.42652302, -0.47088819, -0.40644295, -0.42682749,
       -0.44517962, -0.47106831, -0.46639474, -0.41071094, -0.41137355,
       -0.41774734, -0.4458026 , -0.45043322, -0.42004734, -0.45920129,
       -0.43635137, -0.42975401, -0.4373771 , -0.42810812, -0.41323462,
       -0.43146696, -0.48101535, -0.43106036, -0.41864857, -0.43

In [54]:
# Attach the scores to the frame as an 'anomaly_score' column; the array is in the
# same row order as the frame because the embeddings were taken from it row by row.
df = df.assign(anomaly_score=anomaly_scores)


In [55]:
# Display the frame with the new 'anomaly_score' column.
# NOTE(review): the saved output under this cell already contains an 'is_anomaly'
# column, which is only created further down, and its flags do not match the
# current threshold of -0.45 (e.g. a score of -0.4468 is shown as True). The output
# appears to come from an earlier, out-of-order run; re-run the notebook top to
# bottom to refresh it.
df

Unnamed: 0.1,Unnamed: 0,text_clean,embedding,anomaly_score,is_anomaly
0,0,forecast,"[-0.002978444332256913, -0.029137851670384407,...",-0.468139,True
1,1,traveling business meeting take fun trip espec...,"[-0.016611222177743912, -0.023307325318455696,...",-0.446831,True
2,2,test successful way go,"[-0.011441954411566257, 0.013317566365003586, ...",-0.447313,True
3,3,randycan send schedule salary level everyone t...,"[-0.02002922259271145, -0.01722276769578457, 0...",-0.437837,True
4,4,greghow either next tuesday thursdayphillip,"[-0.026682931929826736, -0.0004355040728114545...",-0.432590,True
...,...,...,...,...,...
485,485,pattithis sound like opportunity land couple a...,"[-0.033775750547647476, -0.032047294080257416,...",-0.449006,False
486,486,would support matt lenharts promotion next lev...,"[-0.018882956355810165, 0.008640885353088379, ...",-0.467861,True
487,487,nickthere specific program using recruit train...,"[-0.032885294407606125, -0.03331460803747177, ...",-0.422990,True
488,488,forwarded phillip k allenhouect 01252001enron ...,"[-0.014968602918088436, -0.021389560773968697,...",-0.415299,False


In [76]:
# Cut-off applied to the 'anomaly_score' column: rows scoring strictly below it
# are flagged as anomalies in the next cell.
threshold = -0.45  # Adjust the threshold as needed


In [77]:
# Flag each row whose score falls strictly below the cut-off.
df['is_anomaly'] = df['anomaly_score'].lt(threshold)


In [78]:
# Keep only the flagged rows; they retain their original index labels.
anomalies = df.loc[df['is_anomaly']]


In [79]:
# Display the rows flagged as anomalies.
anomalies

Unnamed: 0.1,Unnamed: 0,text_clean,embedding,anomaly_score,is_anomaly
0,0,forecast,"[-0.002978444332256913, -0.029137851670384407,...",-0.468139,True
6,6,dont think required isp2 static ip address,"[-0.014152380637824535, 0.020037245005369186, ...",-0.480255,True
12,12,davehere name west desk member category origin...,"[-0.002665597712621093, -0.0013185596326366067...",-0.475867,True
19,19,think fletch good cpa still,"[-0.012344620190560818, 0.0035061421804130077,...",-0.465051,True
27,27,nymex expiration time frame please reschedule,"[-0.011050663888454437, -0.02314694970846176, ...",-0.469550,True
...,...,...,...,...,...
456,456,send ina rangel forward appropriate trader are...,"[-0.02564612776041031, -0.011885322630405426, ...",-0.462802,True
459,459,susanraised issue sally beck larry going spend...,"[-0.014596407301723957, -0.017808910459280014,...",-0.461931,True
469,469,8500 thats twice valuable car cant get used on...,"[-0.00241537275724113, 0.00860824715346098, 0....",-0.467006,True
472,472,jeffbefore write stage thing think about1 oper...,"[0.006602688692510128, 0.003724329173564911, 0...",-0.466027,True


## Some of the emails flagged as anomalies

In [80]:
# Cleaned text of the flagged email with index label 0.
anomalies.loc[0, 'text_clean']

'forecast'

In [81]:
# Cleaned text of the flagged email with index label 6.
anomalies.loc[6, 'text_clean']

'dont think required isp2 static ip address'

In [82]:
# Cleaned text of the flagged email with index label 12.
anomalies.loc[12, 'text_clean']

'davehere name west desk member category originationside sparsephillip'

In [83]:
# Cleaned text of the flagged email with index label 19.
anomalies.loc[19, 'text_clean']

'think fletch good cpa still'

## Some of the normal emails

In [70]:
# Cleaned text of the email with index label 1, shown as an example of a normal email.
df.loc[1, 'text_clean']

'traveling business meeting take fun trip especially prepare presentation would suggest holding business plan meeting take trip without formal business meeting would even try get honest opinion whether trip even desired necessaryas far business meeting think would productive try stimulate discussion across different group working often presenter speaks others quiet waiting turn meeting might better held round table discussion formatmy suggestion go austin play golf rent ski boat jet ski flying somewhere take much time'

In [71]:
# Cleaned text of the email with index label 2, shown as an example of a normal email.
df.loc[2, 'text_clean']

'test successful way go'