In [153]:
import matplotlib.pyplot as plt
import numpy as np
import re
import jieba
import jieba.posseg as pseg
import jieba.analyse
from collections import Counter
import math
import operator
%matplotlib inline

class DataAnalysisHelper:
    """Bucketing, charting and text-mining helpers for a list of post dicts.

    The methods below read the following keys from each post dict:
    'post_comment_num', 'target_gender' and 'message_text'.
    getTopTFIDFWordsForBothGoodAndBadPostDatas additionally writes a
    'text_words' key into every post.
    """

    # Part-of-speech flags (as yielded by jieba.posseg) that are dropped by
    # BreakWordsAndCleanUp.
    # NOTE(review): the meaning of each flag is defined by jieba -- confirm
    # against its tag table before editing this list.
    _MEANINGLESS_WORD_TAGS = ('x', 'm', 'r', 'c', 'ul', 'uj', 'd', 'f', 'p', 'uz', 'eng', 'q', 'zg')

    def bucketize(self, min_value, max_value, bin_size, values):
        """Count `values` in equal-width buckets between min_value and max_value.

        Args:
            min_value: lower edge of the first bucket.
            max_value: upper edge of the last regular bucket.
            bin_size: number of regular buckets (not their width).
            values: iterable of numbers to count.

        Returns:
            (bucket_names, bucket_values): two parallel lists of length
            bin_size + 1. Regular buckets are half-open, [lower, upper).
            The extra final bucket is labelled '> max_value' and counts every
            value >= max_value. Values below min_value are not counted.

        Raises:
            ZeroDivisionError: if bin_size is 0.
        """
        step = (max_value - min_value) / bin_size
        # Edges start at min_value (the previous version started at 0 and so
        # was only right for min_value == 0). The last edge is max_value itself
        # so float rounding cannot open a gap before the overflow bucket.
        edges = [min_value + step * i for i in range(bin_size)] + [max_value]

        bucket_names = []
        bucket_values = []
        for lower, upper in zip(edges, edges[1:]):
            bucket_names.append(str(int(lower)) + ' - ' + str(int(upper)))
            bucket_values.append(sum(1 for v in values if lower <= v < upper))

        # Overflow bucket.
        bucket_names.append('> ' + str(max_value))
        bucket_values.append(sum(1 for v in values if v >= max_value))

        return (bucket_names, bucket_values)

    def bisectCommentsForPostData(self, posts_data, postDataFilterFunCase):
        """Split the posts' comment counts by whether each post passes a filter.

        Args:
            posts_data: sequence of post dicts.
            postDataFilterFunCase: name of the filter to apply, one of
                'isTargetedFemale', 'hasEmailInText' or 'hasLongText'.
                Any other value matches no post, so every count lands in the
                second list.

        Returns:
            (num_comments_pass_filter, num_comments_fail_filter): two lists of
            'post_comment_num' values, each in the original post order.
        """
        filters_by_name = {
            'isTargetedFemale': self.isTargetedFemale,
            'hasEmailInText': self.hasEmailInText,
            'hasLongText': self.hasLongText,
        }
        post_filter = filters_by_name.get(postDataFilterFunCase)

        num_comments_pass_filter = []
        num_comments_fail_filter = []
        for post in posts_data:
            passes = post_filter is not None and post_filter(post)
            target = num_comments_pass_filter if passes else num_comments_fail_filter
            target.append(post['post_comment_num'])
        return (num_comments_pass_filter, num_comments_fail_filter)

    def drawBarChart(self, bucket_names, bucket_values, y_max=50):
        """Show a bar chart with one bar per bucket.

        Args:
            bucket_names: tick labels, one per bar.
            bucket_values: bar heights, parallel to bucket_names.
            y_max: upper limit of the y axis (the lower limit is 0).
        """
        x_pos = np.arange(len(bucket_names))
        plt.figure(figsize=(8, 5))
        plt.bar(x_pos, bucket_values, align='center', alpha=0.5)
        plt.xticks(x_pos, bucket_names)
        plt.ylim([0, y_max])
        plt.show()

    def drawPieChart(self, bucket_names, bucket_values):
        """Show a pie chart of the buckets with whole-number percentage labels."""
        plt.figure(figsize=(8, 5))
        plt.pie(bucket_values, labels=bucket_names, shadow=True, autopct='%1.0f%%', startangle=90)
        plt.axis('equal')  # keep the pie circular
        plt.show()

    def getListStatistics(self, values):
        """Return {'mean': ..., 'median': ...} computed with numpy."""
        return {
            'mean': np.mean(values),
            'median': np.median(values),
        }

    def printAllStats(self, bucket_names, bucket_values):
        """Draw the bar and pie charts, then print mean/median of bucket_values.

        Note that the statistics are taken over the bucket counts that were
        passed in, not over the raw values that were bucketed.
        """
        self.drawBarChart(bucket_names, bucket_values)
        self.drawPieChart(bucket_names, bucket_values)
        # Use self rather than a notebook-level `data_helper` variable so the
        # method also works on a fresh kernel or under another instance name.
        print(self.getListStatistics(bucket_values))

    def isTargetedFemale(self, postData):
        """Return True when the post's 'target_gender' equals 'Female'."""
        return postData['target_gender'] == 'Female'

    def hasEmailInText(self, postData):
        """Return True when 'message_text' contains something shaped like name@host."""
        return re.search(r'[\w\.-]+@[\w\.-]+', postData['message_text']) is not None

    def hasLongText(self, postData, min_length=320):
        """Return True when 'message_text' has at least min_length characters."""
        return len(postData['message_text']) >= min_length

    def getMeaninglessWordTags(self):
        """Return a fresh list of the part-of-speech flags to discard."""
        return list(self._MEANINGLESS_WORD_TAGS)

    def BreakWordsAndCleanUp(self, text):
        """Segment `text` with jieba.posseg and drop words with a discarded flag.

        Returns:
            list of the remaining words, in their original order.
        """
        # Build the lookup once instead of once per token.
        discarded_tags = set(self.getMeaninglessWordTags())
        return [word for word, flag in pseg.cut(text) if flag not in discarded_tags]

    def filterWordsBasedOnTags(self, words, tags):
        """Unfinished: prints the pseg.cut(...) result for each (word, tag) pair.

        Nothing is filtered and the method returns None.
        NOTE(review): implement or remove -- the intended direction of the
        filter (keep vs. drop the given tags) cannot be told from this code.
        """
        for word in words:
            for tag in tags:
                tmp_word = pseg.cut(word)
                print(tmp_word)

    def getTopTFIDFWordsForBothGoodAndBadPostDatas(self, posts_data, postCommentThreshold=20):
        """Find top TF-IDF words that are exclusive to well- or poorly-commented posts.

        Side effect: stores the segmented words of each post under
        post['text_words'] (the TFIDF class reads that key).

        Args:
            posts_data: list of post dicts with 'message_text' and
                'post_comment_num'.
            postCommentThreshold: posts with strictly more comments than this
                are "good"; all others are "bad".

        Returns:
            (good_diff_words, bad_diff_words): two lists of (word, score)
            tuples sorted by descending score. The first holds top words of
            the good posts that are absent from the bad posts' top words, and
            the second the reverse.
        """
        for post in posts_data:
            post['text_words'] = self.BreakWordsAndCleanUp(post['message_text'])

        postDataIndexWithGoodComments = []
        postDataIndexWithBadComments = []
        for index, post in enumerate(posts_data):
            if post['post_comment_num'] > postCommentThreshold:
                postDataIndexWithGoodComments.append(index)
            else:
                postDataIndexWithBadComments.append(index)

        # TFIDF is the class defined in a separate cell of this notebook.
        tfidf = TFIDF(posts_data)
        good_comment_post_words = tfidf.getTopWordsInDocuments(postDataIndexWithGoodComments, 40, 100)
        bad_comment_post_words = tfidf.getTopWordsInDocuments(postDataIndexWithBadComments, 40, 100)

        # Plain comprehensions replace copy.deepcopy + del: `copy` was never
        # imported, and the values are plain numbers so no deep copy is needed.
        good_only = {
            word: score
            for word, score in good_comment_post_words.items()
            if word not in bad_comment_post_words
        }
        bad_only = {
            word: score
            for word, score in bad_comment_post_words.items()
            if word not in good_comment_post_words
        }

        by_score = operator.itemgetter(1)
        good_diff_words = sorted(good_only.items(), key=by_score, reverse=True)
        bad_diff_words = sorted(bad_only.items(), key=by_score, reverse=True)

        return (good_diff_words, bad_diff_words)

    def wordsDiff(self, a_words, b_words):
        """Return the set of words present in a_words but not in b_words.

        Both arguments are iterables whose items carry the word at index 0,
        e.g. the (word, score) tuples returned by
        getTopTFIDFWordsForBothGoodAndBadPostDatas.
        """
        return {item[0] for item in a_words} - {item[0] for item in b_words}

In [152]:
class TFIDF:
    """TF-IDF scores for a list of already-tokenised documents.

    For a word w in document d, out of N documents:
        tf(w, d) = count of w in d / total number of words in d
        idf(w)   = log10(N / number of documents containing w)
        score    = tf * idf
    Words that occur in only one document get tf = 0, so they score 0.

    Attributes:
        tfidf_matrix_dict: {document index: {word: score}}. The words of each
            inner dict are in first-occurrence order within the document.
    """

    def __init__(self, postDatas):
        """Compute the score table.

        Args:
            postDatas: sequence of dicts, each with a 'text_words' list of
                words.
        """
        num_of_documents = len(postDatas)

        # Raw per-document counts, built once per document.
        documents_word_counters = [
            Counter(postDatas[i]['text_words']) for i in range(num_of_documents)
        ]

        # Document frequency: in how many documents each word occurs. Each
        # document contributes its distinct words exactly once.
        document_frequency = Counter()
        for word_counter in documents_word_counters:
            document_frequency.update(word_counter.keys())

        tfidf_matrix_dict = {}
        for i, word_counter in enumerate(documents_word_counters):
            total_words_in_document = sum(word_counter.values())
            scores = {}
            for word, count in word_counter.items():
                exist_document = document_frequency[word]
                # Words confined to a single document are deliberately zeroed.
                if exist_document > 1:
                    tf = count / total_words_in_document
                else:
                    tf = 0
                # The document frequency is looked up rather than recounted by
                # scanning every document for every word.
                idf = math.log(num_of_documents / exist_document, 10)
                scores[word] = tf * idf
            tfidf_matrix_dict[i] = scores

        self.tfidf_matrix_dict = tfidf_matrix_dict

    def getTopWordsInDocument(self, documentIndex, numOfTop):
        """Return the numOfTop highest-scoring words of one document.

        Returns:
            dict {word: score} ordered by descending score. Ties keep the
            words' first-occurrence order because the sort is stable.
        """
        ranked = sorted(
            self.tfidf_matrix_dict[documentIndex].items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
        return dict(ranked[:numOfTop])

    def getTopWordsInDocuments(self, documentIndexes, numOfTopInDocument, numOfTop):
        """Sum per-document top scores over several documents and rank the words.

        Args:
            documentIndexes: indexes of the documents to combine.
            numOfTopInDocument: how many top words to take from each document.
            numOfTop: how many words to keep in the combined ranking.

        Returns:
            dict {word: summed score} ordered by descending summed score.
        """
        words_with_score = {}
        for index in documentIndexes:
            top_in_document = self.getTopWordsInDocument(index, numOfTopInDocument)
            for word, score in top_in_document.items():
                words_with_score[word] = words_with_score.get(word, 0) + score
        ranked = sorted(words_with_score.items(), key=operator.itemgetter(1), reverse=True)
        return dict(ranked[:numOfTop])

In [131]:
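# Test case (disabled): uncomment to smoke-test TFIDF on two tiny documents.
# Note: with only these two documents every score comes out as 0 -- words shared
# by both documents get idf = log10(2/2) = 0, and words found in just one
# document have their tf zeroed by TFIDF.__init__.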
# postDatas = [
#     {'text_words': ['this', 'is', 'a', 'sample', 'sample']},
#     {'text_words': ['this', 'is', 'another', 'another', 'example', 'example', 'example']},
# ]
# tfidf = TFIDF(postDatas)
# print(tfidf.tfidf_matrix_dict)
# print(tfidf.getTopWordsInDocuments([0,1], 1000, 1000))