-
Notifications
You must be signed in to change notification settings - Fork 0
/
Analyser.py
131 lines (110 loc) · 4.42 KB
/
Analyser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#import regex
import re
import csv
import nltk
from collections import OrderedDict
import Summary.make_unique as unique
# TODO: drop tweets that contain a photo or link entirely, rather than only stripping the photo/link from the tweet text.
# Regular expressions used by the helpers below, compiled once at import time
# rather than on every call.
_URL_RE = re.compile(r'(www\.\S+)|(https?://\S+)')
_RETWEET_RE = re.compile(r'\brt @\S+')
_MENTION_RE = re.compile(r'@\S+')
_HASHTAG_RE = re.compile(r'#(\S+)')
_WHITESPACE_RE = re.compile(r'\s+')
_REPEAT_RE = re.compile(r'(.)\1{1,}', re.DOTALL)
_WORD_RE = re.compile(r'^[a-zA-Z][a-zA-Z0-9]*$')


def processTweet(tweet):
    """Return a cleaned, lower-cased copy of the raw tweet text.

    Steps, in order: lower-case; drop ``www.*`` and ``http(s)://*`` URLs;
    drop retweet markers (``rt @user``); drop remaining ``@user`` mentions;
    drop the literal ``text:"`` prefix; turn ``#word`` into ``word``;
    collapse runs of whitespace into one space; trim surrounding quotes
    and spaces.
    """
    tweet = tweet.lower()
    # Match both http:// and https:// links, and escape the dot in "www.".
    tweet = _URL_RE.sub('', tweet)
    # Remove the whole user name, not only its first character.
    tweet = _RETWEET_RE.sub('', tweet)
    tweet = _MENTION_RE.sub('', tweet)
    # Remove the literal 'text:"' marker that precedes each tweet.
    tweet = tweet.replace('text:"', '')
    # Replace #word with word.
    tweet = _HASHTAG_RE.sub(r'\1', tweet)
    # Collapse whitespace last so gaps left by the removals above disappear.
    tweet = _WHITESPACE_RE.sub(' ', tweet)
    return tweet.strip('\'" ')


def replaceTwoOrMore(s):
    """Squeeze any run of two or more identical characters down to two.

    For example ``"hellooooo"`` becomes ``"helloo"``.
    """
    return _REPEAT_RE.sub(r'\1\1', s)


def getStopWordList(stopWordListFileName):
    """Read a stop-word file (one word per line) and return the words.

    Surrounding whitespace is stripped and blank lines are skipped. The
    file is closed even if reading fails.
    """
    with open(stopWordListFileName, 'r') as fp:
        stripped = (line.strip() for line in fp)
        return [word for word in stripped if word]


def getFeatureVector(tweet, stopWords):
    """Return the list of feature words found in a processed tweet.

    Each whitespace-separated token has repeated characters squeezed,
    surrounding ``'"?,.`` punctuation stripped and is lower-cased. Tokens
    that are in ``stopWords`` or that are not a letter followed only by
    letters/digits are dropped.

    tweet     -- tweet text, normally the output of processTweet().
    stopWords -- container of words to ignore (a set gives O(1) lookups).
    """
    featureVector = []
    for token in tweet.split():
        word = replaceTwoOrMore(token).strip('\'"?,.').lower()
        if word in stopWords or _WORD_RE.match(word) is None:
            continue
        featureVector.append(word)
    return featureVector


def extract_features(tweet, featureList):
    """Map every known feature word to whether the tweet contains it.

    tweet       -- iterable of words (a feature vector).
    featureList -- iterable of all feature words seen during training.

    Returns a dict keyed ``'contains(<word>)'`` with boolean values.
    """
    tweet_words = set(tweet)
    return {'contains(%s)' % word: (word in tweet_words)
            for word in featureList}


def toAscii(text):
    """Interpret backslash escapes in ``text`` and drop non-ASCII characters.

    Works on both byte strings and unicode strings and returns the native
    ``str`` type of the running interpreter. Text containing a malformed
    escape sequence is kept literally instead of raising.
    """
    if isinstance(text, bytes):
        raw = text
    else:
        # Preserve every character as bytes/escapes so that the
        # unicode_escape codec can decode it again below.
        raw = text.encode('latin-1', 'backslashreplace')
    try:
        decoded = raw.decode('unicode_escape')
    except UnicodeDecodeError:
        # e.g. a trailing backslash: fall back to the undecoded text.
        decoded = raw.decode('latin-1') if isinstance(text, bytes) else text
    asciiBytes = decoded.encode('ascii', 'ignore')
    return asciiBytes if str is bytes else asciiBytes.decode('ascii')


def Classify():
    """Train a Naive Bayes sentiment classifier and sort tweets by sentiment.

    All paths are relative to the current working directory:

    * ``stopwords.txt``         -- one stop word per line.
    * ``Training/Training.csv`` -- training rows; column 0 is the sentiment
      label and column 1 the tweet text (``,`` delimiter, ``|`` quote char).
    * ``text_tweets.txt``       -- tweets to classify, one per line.
    * ``Summary/positive.txt`` / ``Summary/negative.txt`` -- the cleaned,
      ASCII-only tweets are appended here according to their predicted
      label; tweets with any other label are not written.

    Finally ``unique.unique()`` is called (presumably to de-duplicate the
    written tweets -- see Summary.make_unique).
    """
    stopWords = set(getStopWordList('stopwords.txt'))

    # Build the labelled training data and the vocabulary in one pass.
    trainingTweets = []
    vocabulary = set()
    with open('Training/Training.csv', 'r') as trainingFile:
        rows = csv.reader(trainingFile, delimiter=',', quotechar='|')
        for row in rows:
            if len(row) < 2:
                # Blank or malformed line: nothing to learn from.
                continue
            sentiment, tweet = row[0], row[1]
            featureVector = getFeatureVector(processTweet(tweet), stopWords)
            vocabulary.update(featureVector)
            trainingTweets.append((featureVector, sentiment))
    featureList = list(vocabulary)

    def tweetFeatures(words):
        # Single-argument adapter, as required by nltk's apply_features.
        return extract_features(words, featureList)

    # Extract the feature vectors for all training tweets and train.
    training_set = nltk.classify.util.apply_features(tweetFeatures,
                                                     trainingTweets)
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

    # Classify every tweet and append it to the file matching its sentiment.
    # The output files are opened once instead of once per tweet.
    with open('text_tweets.txt', 'r') as source, \
            open('Summary/positive.txt', 'a') as positiveOut, \
            open('Summary/negative.txt', 'a') as negativeOut:
        outputs = {'positive': positiveOut, 'negative': negativeOut}
        for rawTweet in source:
            cleaned = toAscii(processTweet(rawTweet))
            if not cleaned:
                # Nothing left after cleaning (e.g. a blank input line).
                continue
            words = getFeatureVector(cleaned, stopWords)
            sentiment = NBClassifier.classify(tweetFeatures(words))
            target = outputs.get(sentiment)
            if target is not None:
                target.write(cleaned + "\n")

    # Output files are closed at this point, so the post-processing step
    # sees everything that was written.
    unique.unique()