-
Notifications
You must be signed in to change notification settings - Fork 1
/
jaccard.py
78 lines (73 loc) · 2.71 KB
/
jaccard.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from nltk.corpus import stopwords
import nltk
def process(file):
    """Read a text file and return its stemmed, stop-word-filtered tokens.

    The file content is split on whitespace, every token is passed through
    NLTK's Porter stemmer, and stemmed tokens found in NLTK's English
    stop-word list are dropped.

    Args:
        file: Path of the text file to read (opened with the platform's
            default text encoding).

    Returns:
        A list of stemmed tokens in document order; duplicates are kept.
    """
    # Use a context manager so the handle is released promptly: this function
    # is called once per case file, i.e. thousands of times in a single run.
    with open(file) as handle:
        words = handle.read().split()

    porter = nltk.PorterStemmer()
    stemmed_tokens = [porter.stem(t) for t in words]
    stop_words = set(stopwords.words('english'))
    # NOTE(review): filtering is applied to the *stemmed* tokens, so a stop
    # word whose stem differs from its surface form would presumably slip
    # through -- confirm whether filtering before stemming was intended.
    return [w for w in stemmed_tokens if w not in stop_words]
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two token collections.

    Both inputs are reduced to sets of distinct items and the result is
    ``|A & B| / |A | B|``.

    Args:
        list1: First iterable of hashable tokens.
        list2: Second iterable of hashable tokens.

    Returns:
        A float in ``[0.0, 1.0]``. When both inputs are empty the union is
        empty and ``0.0`` is returned instead of dividing by zero.
    """
    set1, set2 = set(list1), set(list2)
    # The denominator must be the size of the set union. Using the raw list
    # lengths would count repeated tokens and deflate the score.
    union = len(set1 | set2)
    if not union:
        return 0.0
    # float() keeps this a true division even under Python 2 semantics.
    return len(set1 & set2) / float(union)
def findfile(a, out=None,
             prior_dir='/home/kayal/Desktop/Dataset/Task_2/Prior_Cases'):
    """Score one current case against prior cases 0001 through 2000.

    For each prior case ``prior_case_NNNN.txt`` in ``prior_dir`` the file is
    tokenised with ``process`` and one line ``"NNNN <score>\\n"`` is written
    to ``out``, where ``<score>`` is ``jaccard_similarity(a, tokens)``.
    A progress marker ``p<i>`` is printed for every prior case.

    Args:
        a: Token list of the current case, as returned by ``process``.
        out: Writable text file object. Defaults to the module-level ``f1``
            handle, which is what the script entry point relies on.
        prior_dir: Directory that holds the ``prior_case_NNNN.txt`` files.
    """
    if out is None:
        # Backward-compatible default: the handle opened by the script body.
        out = f1
    for i in range(1, 2001):
        # File names and output ids are zero-padded to four digits.
        case_id = str(i).zfill(4)
        b = process(prior_dir + '/prior_case_' + case_id + '.txt')
        print('p' + str(i))
        out.write(case_id + ' ' + str(jaccard_similarity(a, b)) + '\n')
if __name__ == '__main__':
    # Phase 1: for each current case 0001..0200 write a "," separator line
    # followed by the scores that findfile() emits for that case.
    # ``f1`` stays a module-level name because findfile() writes through it;
    # the ``with`` block guarantees the scores are flushed and the handle is
    # closed once all cases are done.
    with open('/home/kayal/Desktop/Dataset/Task_2/j_fullscore.txt', 'w+') as f1:
        for j in range(1, 201):
            print("for loop:" + str(j))
            f1.write(",\n")
            # Current-case file names are zero-padded to four digits.
            dict1 = ('/home/kayal/Desktop/Dataset/Task_2/Current_Cases/current_case_'
                     + str(j).zfill(4) + '.txt')
            a = process(dict1)
            findfile(a)

    # Phase 2: split a combined score file on "," and write chunk i to
    # J/j<i>.txt. Chunk 0 (the text before the first comma) is skipped.
    # NOTE(review): this reads .../project/jaccard/fullscore.txt, which is not
    # the file written above (.../Task_2/j_fullscore.txt) -- confirm whether
    # the file is moved/renamed by hand or one of the two paths is a mistake.
    with open('/home/kayal/Desktop/Dataset/Task_2/project/jaccard/fullscore.txt', 'r') as fo:
        # Read and split once instead of re-reading the file per iteration.
        t = fo.read().split(',')
    for i in range(1, 201):
        with open('/home/kayal/Desktop/Dataset/Task_2/project/jaccard/J/j' + str(i) + '.txt', 'w+') as f2:
            f2.write(str(t[i]))