In [1]:
import re
from itertools import combinations

import pandas as pd
import textdistance
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
class CleanText:
    """Load a text dump of posts and clean it in four ordered steps.

    The file is read as UTF-8, non-breaking spaces are replaced with plain
    spaces, and the content is split into posts on a double quote followed
    by a newline.

    Cleaning steps, in the order ``get_modified_df`` applies them:
        1. ``_replace_unnecessary_chars`` - strip double quotes from the text.
        2. ``_drop_not_daily_horoscope`` - keep only posts that mention a
           month name or "tomorrow" and contain none of the excluded markers.
        3. ``_make_date`` - add a ``date`` column extracted from the text.
        4. ``_drop_long_text`` - drop posts longer than ``MAX_POST_LENGTH``.

    Attributes:
        filepath: path of the file the posts are read from.
        posts: DataFrame with a ``text`` column and a ``length`` column
            (character count of the raw post, measured before any cleaning).
        months: Russian month names in the form they take inside dates.
        stage: number of cleaning steps already applied to ``posts``.
    """

    # Posts whose raw length exceeds this many characters are dropped.
    MAX_POST_LENGTH = 3000

    # Upper-case markers; a post containing any of them is dropped by
    # _drop_not_daily_horoscope.
    EXCLUDED_MARKERS = ('ЗОДИАКА', 'ГОРОСКОП', 'ЗНАК')

    def __init__(self, filepath):
        self.filepath = filepath
        self.posts = self._read_file()
        self.months = ['января', 'февраля', 'марта',
                       'апреля', 'мая', 'июня',
                       'июля', 'августа', 'сентября',
                       'октября', 'ноября', 'декабря']
        self.stage = 0

    def find_word(self, phrase):
        """Return True if ``phrase`` contains a month name or a "tomorrow" phrase.

        The check is a case-sensitive substring test.
        """
        return any(word in phrase
                   for word in self.months + ['на завтра', 'Завтра'])

    def get_date(self, phrase):
        """Extract a "<day> <month>" string from ``phrase``.

        Returns the first "<one or two digits><spaces/tabs><month>" match for
        a month present in ``phrase``, or None when there is none. If several
        months match, the one that comes last in ``self.months`` wins.
        """
        date = None
        for month in self.months:
            # Cheap case-sensitive pre-check before running the regex.
            if month not in phrase:
                continue
            match = re.search(r'[1-9]?[0-9][ \t]+' + month, phrase, flags=re.I)
            if match:
                date = match.group(0)
        return date

    def _read_file(self):
        """Read ``self.filepath`` and return a fresh, uncleaned posts frame."""
        # The context manager closes the handle even if reading fails.
        with open(self.filepath, 'r', encoding='utf-8') as file_object:
            raw = file_object.read()
        chunks = raw.replace(u'\xa0', u' ').split('"\n')
        posts = pd.DataFrame(chunks, columns=['text'])
        posts['length'] = posts['text'].map(len)
        return posts

    def _replace_unnecessary_chars(self):
        """Remove every double quote from the ``text`` column."""
        self.posts['text'] = self.posts['text'].map(
            lambda text: text.replace('"', ''))
        return self.posts

    def _drop_not_daily_horoscope(self):
        """Keep posts that pass ``find_word`` and contain no excluded marker."""
        def is_daily(text):
            if not self.find_word(text):
                return False
            return not any(marker in text for marker in self.EXCLUDED_MARKERS)

        keep = self.posts['text'].map(is_daily).astype(bool)
        self.posts = self.posts[keep].reset_index(drop=True)
        return self.posts

    def _make_date(self):
        """Add a ``date`` column holding ``get_date`` of every post."""
        self.posts['date'] = self.posts['text'].map(self.get_date)
        return self.posts

    def _drop_long_text(self):
        """Drop posts whose raw length exceeds ``MAX_POST_LENGTH``."""
        short_enough = self.posts['length'] <= self.MAX_POST_LENGTH
        self.posts = self.posts[short_enough].reset_index(drop=True)
        return self.posts

    def get_modified_df(self, num_steps='all'):
        """Return the posts after the first ``num_steps`` cleaning steps.

        Args:
            num_steps: number of steps to have applied, or ``'all'`` for
                every step.

        Returns:
            The ``posts`` DataFrame in the requested state.

        Steps that were already applied are not repeated. Asking for fewer
        steps than are currently applied reloads the file and starts over.
        """
        list_of_actions = [self._replace_unnecessary_chars,
                           self._drop_not_daily_horoscope,
                           self._make_date,
                           self._drop_long_text]
        if num_steps == 'all':
            num_steps = len(list_of_actions)
        if num_steps < self.stage:
            # Steps cannot be undone, so go back to the raw file content.
            self.posts = self._read_file()
            self.stage = 0
        for i in range(self.stage, num_steps):
            list_of_actions[i]()
            # Record progress after each step so a later call resumes here.
            self.stage = i + 1
        return self.posts

In [118]:
def separate_types(posts):
    """Split each post into one row per horoscope type found in its text.

    Args:
        posts: DataFrame whose first column holds the post text. All other
            columns are copied unchanged into every row produced for a post.

    Returns:
        A new DataFrame with the columns of ``posts`` plus ``type`` and
        ``index in posts``. The first column holds the text that follows the
        matched header line. ``type`` is ``'business'``, ``'love'`` or
        ``'simple'``; a post that matches nothing yields a single ``'non'``
        row with None as its text. ``index in posts`` is the index label of
        the originating post.
    """
    pattern_simple = re.compile(r'(?:(.*?[0-9]{1,2}.*|Гороскоп на завтра.*)\n{1,2}|^Завтра)(.+)')
    pattern_bus = re.compile(r'(Бизнес[^а-я].+|Финансовый.+)\n{1,2}(.+)')
    pattern_lov = re.compile(r'(Любовный.+)\n{1,2}(.+)')

    columns = list(posts.columns) + ['type', 'index in posts']
    # Rows are collected in a plain list and the frame is built once at the
    # end; enlarging a DataFrame one row at a time copies it on every append.
    rows = []
    for record in posts.itertuples(index=True, name=None):
        label, text, rest = record[0], record[1], list(record[2:])

        # Only the first occurrence of each pattern is used.
        simple = pattern_simple.search(text)
        business = pattern_bus.search(text)
        love = pattern_lov.search(text)

        found = []
        if business:
            found.append((business.group(2), 'business'))
        if love:
            found.append((love.group(2), 'love'))
        if simple:
            # The generic pattern also matches business/love headers that
            # contain a number; skip it when it hit the very same header line.
            specific_headers = {match.group(1)
                                for match in (business, love) if match}
            if simple.group(1) not in specific_headers:
                found.append((simple.group(2), 'simple'))
        if not found:
            found.append((None, 'non'))

        rows.extend([body] + rest + [kind, label] for body, kind in found)

    return pd.DataFrame(rows, columns=columns)

In [122]:
# One source name per zodiac sign; every name is the sign's transliterated
# prefix followed by the common 'horoscop' suffix.
domains = [
    sign + 'horoscop'
    for sign in (
        'riby', 'deva', 'lev',
        'rak', 'bliznetsi', 'oven',
        'telec', 'scorpion', 'vodolei',
        'kozerog', 'vesy', 'strelec',
    )
]

In [162]:
%%time
posts_old = []
posts_new = []
for domain in domains:
    posts = CleanText('Data\\horoscope_' + domain + '.txt')
    posts = posts.get_modified_df()
    posts_old += [posts]
    posts_new += [separate_types(posts)]
    print('Done ' + domain)

Done ribyhoroscop
Done devahoroscop
Done levhoroscop
Done rakhoroscop
Done bliznetsihoroscop
Done ovenhoroscop
Done telechoroscop
Done scorpionhoroscop
Done vodoleihoroscop
Done kozeroghoroscop
Done vesyhoroscop
Done strelechoroscop
Wall time: 3min 59s
