In [1]:
import os
import re
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from textdistance import jaccard

In [14]:
class CleanText:
    """Load a raw dump of posts and clean it in up to four ordered steps.

    The file at ``filepath`` is read as UTF-8, non-breaking spaces are turned
    into ordinary spaces, and the content is split into posts on the
    two-character separator ``'"\\n'``.  The resulting frame ``self.posts``
    has a ``text`` column and a ``length`` column (length of the raw chunk,
    measured before any cleaning step runs).

    Cleaning steps, in order:

    1. remove every double-quote character from the text;
    2. keep only posts that mention a month name / "завтра" marker and that
       contain none of the upper-case words ЗОДИАКА, ГОРОСКОП, ЗНАК;
    3. add a ``date`` column extracted from the text;
    4. drop posts whose ``length`` exceeds ``MAX_POST_LENGTH``.

    ``self.stage`` records how many of these steps have already been applied.
    """

    # Posts whose raw length is above this threshold are removed in step 4.
    MAX_POST_LENGTH = 3000

    def __init__(self, filepath):
        """Remember ``filepath`` and load the raw posts from it."""
        self.filepath = filepath
        self.posts = self._read_file()
        self.months = ['января', 'февраля', 'марта',
                       'апреля', 'мая', 'июня',
                       'июля', 'августа', 'сентября',
                       'октября', 'ноября', 'декабря']
        # Number of cleaning steps already applied to ``self.posts``.
        self.stage = 0

    def find_word(self, phrase):
        """Return True if ``phrase`` contains a month name or a "завтра" marker.

        The check is a case-sensitive substring test.
        """
        markers = self.months + ['на завтра', 'Завтра']
        return any(marker in phrase for marker in markers)

    def get_date(self, phrase):
        """Return the "<day> <month>" fragment found in ``phrase``, or None.

        Months are tried in calendar order and a later month overwrites an
        earlier hit, so when several months match, the last one in
        ``self.months`` wins.  For that month the first regex match is used.
        """
        date = None
        for month in self.months:
            if month in phrase:
                pattern = re.compile(r'[1-9]?[0-9][ \t]+' + month, flags=re.I)
                found = pattern.findall(phrase)
                if found:
                    date = found[0]
        return date

    def _read_file(self):
        """Read ``self.filepath`` and return a fresh frame of raw posts."""
        # The context manager guarantees the handle is closed even on error.
        with open(self.filepath, 'r', encoding='utf-8') as file_object:
            raw = file_object.read()
        raw = raw.replace(u'\xa0', u' ')
        chunks = raw.split('"\n')
        posts = pd.DataFrame(chunks, columns=['text'])
        posts['length'] = [len(chunk) for chunk in chunks]
        return posts

    def _replace_unnecessary_chars(self):
        """Step 1: strip every double-quote character from the post text."""
        self.posts.text = self.posts.text.map(lambda x: x.replace('"', ''))
        return self.posts

    def _drop_not_daily_horoscope(self):
        """Step 2: keep only posts that pass the marker and stop-word checks.

        A post is kept when ``find_word`` is true for it and it contains none
        of the upper-case stop words.  The index is renumbered afterwards.
        """
        text = self.posts.text
        keep = text.map(self.find_word).astype(bool)
        for stop_word in ('ЗОДИАКА', 'ГОРОСКОП', 'ЗНАК'):
            has_stop_word = text.map(
                lambda post, stop_word=stop_word: stop_word in post
            ).astype(bool)
            keep = keep & ~has_stop_word
        self.posts.drop(self.posts.index[~keep], inplace=True)
        self.posts.reset_index(inplace=True, drop=True)
        return self.posts

    def _make_date(self):
        """Step 3: add a ``date`` column computed by ``get_date``."""
        self.posts['date'] = self.posts.text.map(self.get_date)
        return self.posts

    def _drop_long_text(self):
        """Step 4: drop posts longer than ``MAX_POST_LENGTH`` and renumber."""
        too_long = self.posts.length > self.MAX_POST_LENGTH
        self.posts.drop(self.posts[too_long].index, inplace=True)
        self.posts.reset_index(inplace=True, drop=True)
        return self.posts

    def get_modified_df(self, num_steps='all'):
        """Return the posts after the first ``num_steps`` cleaning steps.

        Parameters
        ----------
        num_steps : int or 'all'
            How many steps (from the start of the pipeline) the returned
            frame should have gone through.  ``'all'`` means every step.

        Steps that were already applied are not repeated.  Asking for fewer
        steps than have been applied re-reads the file and starts over.
        """
        list_of_actions = [self._replace_unnecessary_chars,
                           self._drop_not_daily_horoscope,
                           self._make_date,
                           self._drop_long_text
                          ]
        if num_steps == 'all':
            num_steps = len(list_of_actions)
        if num_steps < self.stage:
            # Steps cannot be undone, so reload the raw data and restart.
            self.posts = self._read_file()
            self.stage = 0
        for i in range(self.stage, num_steps):
            list_of_actions[i]()
            # Advance after each success so a failure leaves a consistent stage.
            self.stage = i + 1

        return self.posts

In [15]:
def separate_types(posts):
    """Split every post into typed horoscope rows.

    Each input row can produce several output rows:

    * ``'business'`` - the line following a "Бизнес..." / "Финансовый..." header;
    * ``'love'`` - the line following a "Любовный..." header;
    * ``'simple'`` - the line following a header that contains a one- or
      two-digit number or starts with "Гороскоп на завтра", or the rest of a
      post that starts with "Завтра".  It is skipped when its header is the
      very same line as the business or love header;
    * ``'non'`` - emitted with ``text=None`` when nothing above matched.

    Parameters
    ----------
    posts : pandas.DataFrame
        The first column must hold the post text; the remaining columns are
        copied unchanged into every produced row.

    Returns
    -------
    pandas.DataFrame
        Columns of ``posts`` plus ``'type'`` and ``'index in posts'`` (the
        label of the source row), with a fresh 0..n-1 index.
    """
    columns = list(posts.columns) + ['type', 'index in posts']
    pattern_simple = re.compile(r'(?:(.*?[0-9]{1,2}.*|Гороскоп на завтра.*)\n+|^Завтра)(.+)')
    pattern_bus = re.compile(r'(Бизнес[^а-я].+|Финансовый.+)\n+(.+)')
    pattern_lov = re.compile(r'(Любовный.+)\n+(.+)')

    # Collect plain row lists and build the frame once at the end; appending
    # to a DataFrame row by row copies the data on every insertion.
    rows = []
    for i in posts.index:
        post = list(posts.loc[i])
        text, rest = post[0], post[1:]
        find_simple = pattern_simple.findall(text)
        find_bus = pattern_bus.findall(text)
        find_lov = pattern_lov.findall(text)

        rows_before = len(rows)
        if find_bus:
            rows.append([find_bus[0][1]] + rest + ['business', i])
        if find_lov:
            rows.append([find_lov[0][1]] + rest + ['love', i])
        if find_simple:
            header = find_simple[0][0]
            # The generic pattern also matches dated business/love headers;
            # in that case the body was already emitted above.
            same_as_bus = bool(find_bus) and header == find_bus[0][0]
            same_as_lov = bool(find_lov) and header == find_lov[0][0]
            if not (same_as_bus or same_as_lov):
                rows.append([find_simple[0][1]] + rest + ['simple', i])
        if len(rows) == rows_before:
            rows.append([None] + rest + ['non', i])

    return pd.DataFrame(rows, columns=columns)

In [2]:
# One (channel domain, zodiac emoji, sign name) record per channel.  The four
# parallel lists below are derived from it, so they always stay aligned.
_zodiac_channels = [
    ('ribyhoroscop', '♓', 'Рыбы'),
    ('devahoroscop', '♍', 'Дева'),
    ('levhoroscop', '♌', 'Лев'),
    ('rakhoroscop', '♋', 'Рак'),
    ('bliznetsihoroscop', '♊', 'Близнецы'),
    ('ovenhoroscop', '♈', 'Овен'),
    ('telechoroscop', '♉', 'Телец'),
    ('scorpionhoroscop', '♏', 'Скорпион'),
    ('vodoleihoroscop', '♒', 'Водолей'),
    ('kozeroghoroscop', '♑', 'Козерог'),
    ('vesyhoroscop', '♎', 'Весы'),
    ('strelechoroscop', '♐', 'Стрелец'),
]

# Channel identifiers used to build the data file names.
domains = [record[0] for record in _zodiac_channels]
# "<emoji> <sign>" header removed from the posts of each channel.
stop_emoji = [record[1] + ' ' + record[2] for record in _zodiac_channels]
# "<sign>. ❤" header of the love horoscope of each channel.
stop_love = [record[2] + '. ❤' for record in _zodiac_channels]
# Bare sign name of each channel.
stop_simple = [record[2] for record in _zodiac_channels]

In [75]:
%%time
# Load and fully clean the raw dump of every channel; posts_old[i] belongs to
# domains[i].
posts_old = []
for domain in domains:
    # os.path.join instead of a hard-coded '\\' keeps the path valid on
    # non-Windows systems too.
    cleaner = CleanText(os.path.join('Data', 'horoscope_' + domain + '.txt'))
    posts_old.append(cleaner.get_modified_df())
    print('Done ' + domain)

Done ribyhoroscop
Done devahoroscop
Done levhoroscop
Done rakhoroscop
Done bliznetsihoroscop
Done ovenhoroscop
Done telechoroscop
Done scorpionhoroscop
Done vodoleihoroscop
Done kozeroghoroscop
Done vesyhoroscop
Done strelechoroscop
Wall time: 1.79 s


In [76]:
def delete_text_using_domain(zip_values):
    """Strip channel-specific header lines from every post of one channel.

    ``zip_values`` is indexable as ``(posts, stop_emoji, stop_simple)`` where
    ``posts.text`` is an iterable of strings.  For each text, in this order:

    1. every occurrence of ``stop_emoji`` is removed;
    2. every occurrence of a newline followed by ``"<stop_simple>. ❤"`` is
       removed;
    3. a line holding only ``stop_simple`` (optional spaces, any case) is
       collapsed to a single newline;
    4. a line holding only "<number> <cyrillic word>" is collapsed to a single
       newline.

    Returns the cleaned texts as a list, in input order.
    """
    texts = zip_values[0].text
    emoji_header = zip_values[1]
    sign_name = zip_values[2]

    love_header = '\n' + sign_name + '. ❤'
    sign_line = re.compile('\n *'+sign_name+r' *\n', re.I)
    date_line = re.compile('\n *[1-9]?[0-9] +[а-я]+ *\n', re.I)

    cleaned = []
    for text in texts:
        text = text.replace(emoji_header, '')
        text = text.replace(love_header, '')
        text = sign_line.sub('\n', text)
        text = date_line.sub('\n', text)
        cleaned.append(text)
    return cleaned

In [77]:
%%time
# Clean the texts of every channel with its own emoji header and sign name,
# then write the cleaned texts back into the matching frame.
res = [delete_text_using_domain(channel_args)
       for channel_args in zip(posts_old, stop_emoji, stop_simple)]

for text, old in zip(res, posts_old):
    old['text'] = text

Wall time: 628 ms


In [78]:
def replace_space_newline(x):
    """Turn every line that holds a single space into an empty line.

    The substitution is repeated until no "newline, space, newline" sequence
    is left, because one pass can miss overlapping occurrences.
    """
    space_only_line, empty_line = '\n \n', '\n\n'
    while x.find(space_only_line) != -1:
        x = empty_line.join(x.split(space_only_line))
    return x

In [79]:
# Spot-check one row (label 933) of the first channel's frame after the
# header-stripping step above.
posts_old[0].loc[933]

text      Любовный гороскоп\nХороший день. Самое главное...
length                                                  235
date                                                13 июля
Name: 933, dtype: object

In [80]:
%%time
# Normalise space-only lines to empty lines in every channel.
for posts in posts_old:
    posts['text'] = [replace_space_newline(entry) for entry in posts['text']]

Wall time: 43.6 ms


In [81]:
%%time
# Split the posts of every channel into typed horoscope rows;
# posts_new[i] corresponds to domains[i].
posts_new = []
for domain, posts in zip(domains, posts_old):
    posts_new.append(separate_types(posts))
    print('Done ' + domain)

Done ribyhoroscop
Done devahoroscop
Done levhoroscop
Done rakhoroscop
Done bliznetsihoroscop
Done ovenhoroscop
Done telechoroscop
Done scorpionhoroscop
Done vodoleihoroscop
Done kozeroghoroscop
Done vesyhoroscop
Done strelechoroscop
Wall time: 3min 51s


In [82]:
# Cache both frames of every channel so the slow splitting step does not have
# to be repeated.  The directory is created on demand and the paths are built
# with os.path.join so the cell also works outside Windows.
os.makedirs('Data tmp', exist_ok=True)
for old, new, domain in zip(posts_old, posts_new, domains):
    old.to_csv(os.path.join('Data tmp', 'old_' + domain + '.csv'))
    new.to_csv(os.path.join('Data tmp', 'new_' + domain + '.csv'))

In [50]:
%%time
# Reload the cached frames of every channel (first CSV column is the index).
# Paths are built with os.path.join so the cell also works outside Windows.
posts_old = []
posts_new = []
for domain in domains:
    posts_old.append(pd.read_csv(os.path.join('Data tmp', 'old_' + domain + '.csv'),
                                 index_col=0))
    posts_new.append(pd.read_csv(os.path.join('Data tmp', 'new_' + domain + '.csv'),
                                 index_col=0))

Wall time: 1.82 s


In [51]:
def print_bad_rows(posts_new=None, posts_old=None, domains=None):
    """Print, per channel, the rows of the split frame whose text is missing.

    For every channel the position, the domain and the number of rows with a
    missing ``text`` are printed.  For each such row the source label
    (``'index in posts'``), the row label and the first 200 characters of the
    original post are shown.

    Parameters
    ----------
    posts_new, posts_old, domains : sequence, optional
        Parallel sequences of split frames, original frames and channel
        names.  An omitted argument is looked up in the module globals under
        the same name at call time, so the report always reflects the current
        notebook state rather than the objects that existed when this
        function was defined.
    """
    if posts_new is None:
        posts_new = globals()['posts_new']
    if posts_old is None:
        posts_old = globals()['posts_old']
    if domains is None:
        domains = globals()['domains']

    for ind, (new, old, domain) in enumerate(zip(posts_new, posts_old, domains)):
        bad_rows = new[new.text.isna()]
        print(str(ind) + ' ' + domain, ' - ', len(bad_rows))
        for i_old, i_new in zip(bad_rows['index in posts'], bad_rows.index):
            print('_______________________________________')
            print('old - ', i_old, ', new - ', i_new)
            print(old.loc[i_old].text[:200])
        print('****************************************************')
        print()

In [52]:
# List the rows whose text is missing, per channel, before any manual fix.
print_bad_rows()

0 ribyhoroscop  -  0
****************************************************

1 devahoroscop  -  0
****************************************************

2 levhoroscop  -  2
_______________________________________
old -  1187 , new -  2901
 
Если Лев будет на протяжении всего дня мнить себя королем положения, то отношение коллег к нему резко изменится в худшую сторону. 18 апреля 2017 года возможна странная ситуация в любовной сфере. Изб
_______________________________________
old -  1884 , new -  3781
ОСОБЕННОСТЬ ЛЬВОВ 👇

Львов невозможно не заметить: везде, где появляются представители знака, они оказываются в центре внимания. От простых смертных Львов отличает безупречный вкус, а также искренняя 
****************************************************

3 rakhoroscop  -  1
_______________________________________
old -  2566 , new -  4492
Знаки Зодиака и жилище

Наиболее склонный к домашней жизни знак. Умеет создать очень уютную и тёплую домашнюю обстановку. Жить предпочитает на первых этажах

## Manually format or drop the rows flagged by `print_bad_rows`

In [55]:
# Matches an inline date phrase such as "18 апреля 2017 года ": a one- or
# two-digit number, a space, any text, a 201x year, the word "года" and a
# trailing space.  Used below to strip that phrase from post text.
# NOTE(review): ".+" is greedy, so on a line with several such years the match
# runs up to the last one - confirm this is intended.
pattern_date = re.compile(r'[1-9]?[0-9] .+201[0-9] года ')

#### 2 levhoroscop

In [56]:
# Drop row 3781 (source post 1884, the "ОСОБЕННОСТЬ ЛЬВОВ" post in the report
# above).
# NOTE(review): labels are hard-coded and the index is renumbered at the end of
# this cell, so running the cell a second time would address different rows.
posts_new[2].drop([3781], inplace=True)

# Rebuild the text of row 2901 from source post 1187: remove the inline date
# phrase, then keep the second non-empty line of what is left.
post = re.sub(pattern_date, '', posts_old[2].loc[1187].text)
post = re.findall(r'.+', post)[1]
posts_new[2].at[2901, 'text'] = post
posts_new[2].at[2901, 'type'] = 'simple'

# Renumber only after the label-based edits above.
posts_new[2].reset_index(inplace=True, drop=True)

#### 3 rakhoroscop

In [57]:
# Drop row 4492 (source post 2566, the "Знаки Зодиака и жилище" post in the
# report above).
# NOTE(review): hard-coded label plus the renumbering below make this cell
# non-repeatable.
posts_new[3].drop([4492], inplace=True)

posts_new[3].reset_index(inplace=True, drop=True)

#### 4 bliznetsihoroscop

In [58]:
# Drop rows 3017 and 3676 by label.
# NOTE(review): the report for this channel is not visible here - presumably
# these labels were read off print_bad_rows(); verify before re-running.
posts_new[4].drop([3017, 3676], inplace=True)

# Rebuild the text of row 2901 from source post 1187: remove the inline date
# phrase, then keep the second non-empty line of what is left.
post = re.sub(pattern_date, '', posts_old[4].loc[1187].text)
post = re.findall(r'.+', post)[1]
posts_new[4].at[2901, 'text'] = post
posts_new[4].at[2901, 'type'] = 'simple'

# Same repair for row 2911, built from source post 1195.
post = re.sub(pattern_date, '', posts_old[4].loc[1195].text)
post = re.findall(r'.+', post)[1]
posts_new[4].at[2911, 'text'] = post
posts_new[4].at[2911, 'type'] = 'simple'


# Renumber only after the label-based edits above.
posts_new[4].reset_index(inplace=True, drop=True)

#### 5 ovenhoroscop

In [59]:
# Rebuild the text of row 3436 from source post 1541: remove the inline date
# phrase, then keep the FIRST non-empty line (the other channels take the
# second one).
# NOTE(review): presumably this post has no leading blank/space line - confirm.
post = re.sub(pattern_date, '', posts_old[5].loc[1541].text)
post = re.findall(r'.+', post)[0]
posts_new[5].at[3436, 'text'] = post
posts_new[5].at[3436, 'type'] = 'simple'

#### 7 scorpionhoroscop

In [60]:
# Drop rows 3022 and 3944 by label, then renumber.
# NOTE(review): hard-coded labels plus the renumbering below make this cell
# non-repeatable.
posts_new[7].drop([3022, 3944], inplace=True)

posts_new[7].reset_index(inplace=True, drop=True)

#### 8 vodoleihoroscop

In [61]:
# Rebuild the text of row 2904 from source post 1188: remove the inline date
# phrase, then keep the second non-empty line of what is left.
post = re.sub(pattern_date, '', posts_old[8].loc[1188].text)
post = re.findall(r'.+', post)[1]
posts_new[8].at[2904, 'text'] = post
posts_new[8].at[2904, 'type'] = 'simple'

#### 9 kozeroghoroscop

In [62]:
# Drop row 3662 by label, then renumber.
# NOTE(review): hard-coded label plus the renumbering below make this cell
# non-repeatable.
posts_new[9].drop([3662], inplace=True)

posts_new[9].reset_index(inplace=True, drop=True)

#### 10 vesyhoroscop

In [63]:
# Drop rows 3672, 3836 and 3891 by label.
# NOTE(review): hard-coded labels plus the renumbering at the end make this
# cell non-repeatable.
posts_new[10].drop([3672, 3836, 3891], inplace=True)

# Rebuild the text of row 2901 from source post 1187: remove the inline date
# phrase, then keep the second non-empty line of what is left.
post = re.sub(pattern_date, '', posts_old[10].loc[1187].text)
post = re.findall(r'.+', post)[1]
posts_new[10].at[2901, 'text'] = post
posts_new[10].at[2901, 'type'] = 'simple'

# Renumber only after the label-based edits above.
posts_new[10].reset_index(inplace=True, drop=True)

#### 11 strelechoroscop

In [64]:
# Drop rows 3913, 4347, 4497 and 4521 by label.
# NOTE(review): hard-coded labels plus the renumbering at the end make this
# cell non-repeatable.
posts_new[11].drop([3913, 4347, 4497, 4521], inplace=True)

# Rebuild the text of row 2905 from source post 1187: remove the inline date
# phrase, then keep the second non-empty line of what is left.
post = re.sub(pattern_date, '', posts_old[11].loc[1187].text)
post = re.findall(r'.+', post)[1]
posts_new[11].at[2905, 'text'] = post
posts_new[11].at[2905, 'type'] = 'simple'

# Renumber only after the label-based edits above.
posts_new[11].reset_index(inplace=True, drop=True)

In [65]:
# Re-run the report after the manual fixes; every channel should now show 0.
print_bad_rows()

0 ribyhoroscop  -  0
****************************************************

1 devahoroscop  -  0
****************************************************

2 levhoroscop  -  0
****************************************************

3 rakhoroscop  -  0
****************************************************

4 bliznetsihoroscop  -  0
****************************************************

5 ovenhoroscop  -  0
****************************************************

6 telechoroscop  -  0
****************************************************

7 scorpionhoroscop  -  0
****************************************************

8 vodoleihoroscop  -  0
****************************************************

9 kozeroghoroscop  -  0
****************************************************

10 vesyhoroscop  -  0
****************************************************

11 strelechoroscop  -  0
****************************************************



# Delete spaces

In [66]:
%%time
# Spaces at the very start or very end of the whole text (no MULTILINE flag,
# so inner line boundaries are not affected).
pattern_bad_spaces_1 = re.compile(r'(^ +)|( +$)')
# Runs of two or more spaces, or a single tab.
pattern_bad_spaces_2 = re.compile(r'(  +)|(\t)')
for posts in posts_new:
    # Trim the edges first, then squeeze inner whitespace to one space.
    posts['text'] = [
        pattern_bad_spaces_2.sub(' ', pattern_bad_spaces_1.sub('', entry))
        for entry in posts['text']
    ]

Wall time: 3.1 s


In [69]:
# Recompute the length column from the cleaned text; a missing text counts as
# length 0.  pd.isna is used instead of `x is np.nan`, because an identity test
# misses None and NaN objects other than the np.nan singleton.
for posts in posts_new:
    posts['length'] = [0 if pd.isna(entry) else len(entry) for entry in posts['text']]

In [70]:
# Save the cleaned, still non-deduplicated rows of every channel.  The path is
# built with os.path.join so the cell also works outside Windows.
for new, domain in zip(posts_new, domains):
    new.to_csv(os.path.join('Data', 'with_duplicates_' + domain + '.csv'))