-
Notifications
You must be signed in to change notification settings - Fork 0
/
preparation.py
45 lines (37 loc) · 1.07 KB
/
preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import nltk
from stop_words import get_stop_words
from nltk.corpus import stopwords
import re
import pymorphy2
# Combined Russian stop-word list: the entries from the `stop_words` package
# followed by NLTK's Russian list. Duplicates are not removed.
stop_words = list(get_stop_words('ru'))
nltk_words = list(stopwords.words('russian'))
stop_words.extend(nltk_words)

# Matches every character that is NOT a Latin letter, a Cyrillic letter or a
# space; used with .sub('') to delete digits, punctuation, newlines, etc.
# "ё"/"Ё" (U+0451/U+0401) lie outside the а-я / А-Я code-point ranges, so they
# are listed explicitly -- otherwise they would be deleted from inside words
# (e.g. "ещё" -> "ещ") before lemmatization.
rep = re.compile(r"[^a-zA-Zа-яА-ЯёЁ ]")

# Morphological analyzer used to reduce each token to its normal form.
morph = pymorphy2.MorphAnalyzer()
def _strip_noise(text):
    """Return the words of *text* that survive filtering, each followed by a space.

    *text* is split on single spaces and scanned left to right:

    * scanning stops at the first word equal to "RT";
    * a standalone "#" word and the word right after it are both dropped;
    * any word containing "pic.twitter.com", "http://", "https://" or ".com"
      is dropped.
    """
    kept = []
    # 0 -> keep words; 1 -> a lone "#" was just seen; 2 -> the word after it
    # was just dropped (reset to 0 at the end of that iteration).
    skip = 0
    for word in text.split(' '):
        if word == "RT":
            break
        if word == "#" or skip == 1:
            skip += 1
        if (
            skip == 0
            and "pic.twitter.com" not in word
            and "http://" not in word
            and "https://" not in word
            and ".com" not in word
        ):
            kept.append(word + ' ')
        if skip == 2:
            skip = 0
    return "".join(kept)


# Built once so the per-token membership test is O(1) instead of a list scan.
_stop_set = frozenset(stop_words)

# NOTE(review): both files use the platform default encoding, as before --
# confirm that this matches how data.txt is actually encoded.
with open("data.txt", "r") as data, open("prepared_data", "w") as file:
    tweets = data.readlines()
    if tweets:
        # Drop the very first character of the file -- presumably a BOM or an
        # opening delimiter; TODO confirm against data.txt. Guarded so that an
        # empty input file no longer raises IndexError.
        tweets[0] = tweets[0][1:]
    for tweet in tweets:
        # The first 17 characters of each line are copied through unchanged
        # (presumably an id/timestamp prefix -- verify against the data); only
        # the remainder is filtered, stripped of non-letters and lemmatized.
        cleaned = rep.sub('', _strip_noise(tweet[17:]))
        lemmas = [
            morph.parse(token)[0].normal_form + ' '
            for token in nltk.word_tokenize(cleaned)
            if token not in _stop_set
        ]
        # NOTE(review): no record separator is written here; a newline only
        # reaches the output when it is part of the 17-character prefix
        # (i.e. for lines shorter than 17 characters). Confirm this is intended.
        file.write(tweet[:17] + "".join(lemmas))