In [9]:
import datetime, os
import json
import warnings
from collections import Counter

import gensim
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

In [10]:
# Locate the "data" folder that sits next to the notebook's working directory
# (i.e. <parent of cwd>/data). os.path is used instead of splitting on "\\" so
# the parent directory is resolved correctly on every platform, not only Windows.
directory_path = os.path.join(os.path.dirname(os.getcwd()), "data")
# Sanity check: the cell output shows whether the folder is actually there.
os.path.exists(directory_path)

True

In [11]:
class TrainingConfig:
    """Constants for the training loop, exposed as class attributes."""

    # Number of passes over the training data.
    epochs = 5
    # Presumably the step interval between evaluations / checkpoints -- the
    # consuming training code is not part of this notebook excerpt; confirm there.
    evaluateEvery = 100
    checkpointEvery = 100
    # Optimiser learning rate.
    learningRate = 0.001


class ModelConfig:
    """Constants describing the model, exposed as class attributes."""

    # Dimensionality of each word vector; Dataset uses it to size the PAD/UNK rows.
    embeddingSize = 200
    hiddenSizes = 128
    dropoutKeepProb = 0.5
    # NOTE(review): the name looks like a typo for an L2-regularisation weight;
    # it is kept unchanged because other code may read it by this name.
    leRegLambda = 0.0
    epsilon = 5

    
class Config(object):
    """Top-level settings: data locations, batching and nested sub-configs.

    File locations are built with ``os.path.join`` rather than by gluing
    ``"\\"`` separators onto ``directory_path``, so they resolve on any
    platform (the resulting strings are unchanged on Windows).
    """

    sequenceLength = 200
    batchSize = 128
    # Labelled training reviews (CSV) and the stop-word list, both under the data folder.
    dataSource = os.path.join(directory_path, "preProcess", "labeledTrain.csv")
    stopWordSource = os.path.join(directory_path, "english")

    # Set to 1 for binary classification; use any other number for multi-class.
    numClasses = 1
    # Presumably the train/eval split ratio -- not used in this excerpt; confirm with caller.
    rate = 0.8
    training = TrainingConfig()
    model = ModelConfig()


# Shared configuration instance used by the cells below.
config = Config()

In [12]:
class Dataset(object):
    """Reads labelled reviews and derives vocabulary statistics from them.

    The public surface is unchanged: the constructor takes the notebook's
    ``config`` object and ``_ensemble()`` runs the pipeline that fills
    ``_index_frequence`` (the number of reviews each vocabulary word occurs in).
    """

    # Words occurring fewer times than this in the whole corpus are left out
    # of the vocabulary.
    _MIN_WORD_COUNT = 5

    def __init__(self, config):
        """Copy the settings that the methods below need from ``config``."""
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource
        self._sequenceLength = config.sequenceLength
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate

        self._stopWordDict = dict()
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []

        self.wordEmbedding = None

        # Number of reviews each vocabulary word appears in, by vocabulary index.
        # Both names are filled by _get_word_index_frequence().
        self.indexFreqs = []
        self._index_frequence = []
        self.labelList = []

    def _readData(self, file_path):
        """Load tokenised reviews and their labels from a CSV file.

        Args:
            file_path: anything ``pandas.read_csv`` accepts. The file must have
                a ``review`` column plus ``sentiment`` (when
                ``config.numClasses == 1``) or ``rate`` (when it is > 1).

        Returns:
            ``(reviews, labels)`` where ``reviews`` is a list of
            whitespace-split token lists and ``labels`` is a plain list.

        Raises:
            ValueError: if ``config.numClasses`` is smaller than 1. Previously
                this case fell through both branches and failed with an
                unbound-variable error.
        """
        numClasses = self.config.numClasses
        if numClasses == 1:
            labelColumn = "sentiment"
        elif numClasses > 1:
            labelColumn = "rate"
        else:
            raise ValueError(
                "numClasses must be >= 1, got {!r}".format(numClasses))

        df = pd.read_csv(file_path)
        labels = df[labelColumn].tolist()
        reviews = [line.strip().split() for line in df["review"].tolist()]

        return reviews, labels

    def _readStopWord(self, stopWordPath):
        """Read one stop word per line into ``self._stopWordDict`` (word -> line index)."""
        with open(stopWordPath, "r") as f:
            stopWordList = f.read().splitlines()
        self._stopWordDict = {word: idx for idx, word in enumerate(stopWordList)}

    def _filterWords(self, reviews):
        """Return the candidate vocabulary words for ``reviews``.

        Stop words are removed, the remaining tokens are counted over the whole
        corpus, and words seen at least ``_MIN_WORD_COUNT`` times are returned
        ordered from most to least frequent.
        """
        wordCount = Counter(
            word
            for review in reviews
            for word in review
            if word not in self._stopWordDict
        )
        # most_common() sorts by count, descending, keeping first-seen order for ties.
        return [word for word, count in wordCount.most_common()
                if count >= self._MIN_WORD_COUNT]

    def _genVocabulary(self, reviews, labels):
        """Build the word/label index maps and persist them as JSON.

        Side effects: sets ``self.wordEmbedding``, ``self.labelList`` and the
        per-word review counts, and writes ``word2idx.json`` / ``label2idx.json``
        into the ``wordJson`` folder under ``directory_path``.

        Returns:
            ``(word2idx, label2idx)`` dictionaries.
        """
        words = self._filterWords(reviews)

        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding

        word2idx = dict(zip(vocab, range(len(vocab))))
        # The original called a non-existent ``_getWordIndexFreqs``; the method
        # that actually computes these counts is ``_get_word_index_frequence``.
        self._get_word_index_frequence(vocab, reviews, word2idx)

        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, range(len(uniqueLabel))))
        self.labelList = list(range(len(uniqueLabel)))

        jsonDir = os.path.join(directory_path, "wordJson")
        with open(os.path.join(jsonDir, "word2idx.json"), "w", encoding="utf-8") as f:
            json.dump(word2idx, f)
        with open(os.path.join(jsonDir, "label2idx.json"), "w", encoding="utf-8") as f:
            json.dump(label2idx, f)

        return word2idx, label2idx

    def _getWordEmbedding(self, words):
        """Look up pre-trained vectors for ``words``.

        Loads the binary word2vec file ``word2vec/word2Vec.bin`` under
        ``directory_path``. The returned vocabulary always starts with ``"PAD"``
        (all-zero vector) and ``"UNK"`` (random vector); words missing from the
        pre-trained model are reported and skipped.

        Returns:
            ``(vocab, wordEmbedding)`` -- parallel lists of words and vectors.
        """
        wordVec = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(directory_path, "word2vec", "word2Vec.bin"), binary=True)

        vocab = ["PAD", "UNK"]
        wordEmbedding = [
            np.zeros(self._embeddingSize),
            np.random.randn(self._embeddingSize),
        ]

        for word in words:
            try:
                # Index the KeyedVectors directly: the ``.wv`` indirection is not
                # available on KeyedVectors in current gensim releases, and the
                # old bare ``except`` would have hidden that failure and
                # silently dropped every word.
                vector = wordVec[word]
            except KeyError:
                print("{} is not exist...".format(word))
                continue
            vocab.append(word)
            wordEmbedding.append(vector)

        return vocab, wordEmbedding

    def _get_word_index_frequence(self, vocab, reviews, word2idx):
        """Count, for every vocabulary word, how many reviews contain it.

        The result is a list with one entry per item of ``vocab``; the count for
        a word is stored at position ``word2idx[word]``. It is saved on
        ``self._index_frequence`` and mirrored on ``self.indexFreqs``.
        """
        # One pass over the corpus: each review contributes at most 1 per
        # distinct token, which yields the same counts as testing every
        # vocabulary word against every review.
        reviewCount = Counter()
        for review in reviews:
            reviewCount.update(set(review))

        index_frequence = [0] * len(vocab)
        for word in vocab:
            index_frequence[word2idx[word]] = reviewCount[word]

        self._index_frequence = index_frequence
        self.indexFreqs = index_frequence

    def _ensemble(self):
        """Run the pipeline: stop words -> reviews -> vocabulary -> review counts."""
        self._readStopWord(self._stopWordSource)
        reviews, _labels = self._readData(self._dataSource)

        words = self._filterWords(reviews)

        vocab, _wordEmbedding = self._getWordEmbedding(words)
        word2idx = dict(zip(vocab, range(len(vocab))))
        self._get_word_index_frequence(vocab, reviews, word2idx)
    
    
# Build the dataset helper from the shared config and run its pipeline:
# load stop words and reviews, build the vocabulary from the pre-trained
# vectors, then count how many reviews contain each vocabulary word.
my_data = Dataset(config)
my_data._ensemble()

In [16]:
import json


def write_to_file():
    """Save ``my_data._index_frequence`` as JSON.

    The list is written to ``my_file/index_frequence.txt`` inside
    ``directory_path``. The path is assembled with ``os.path.join`` so it is
    valid on any platform, and the ``my_file`` folder is created when missing
    instead of letting ``open`` fail.
    """
    output_dir = os.path.join(directory_path, "my_file")
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, "index_frequence.txt"), "w", encoding="utf-8") as f:
        json.dump(my_data._index_frequence, f)


write_to_file()

In [17]:
# Display the per-word review counts, ordered by vocabulary index
# (positions 0 and 1 belong to the "PAD" and "UNK" placeholder entries).
my_data._index_frequence

[0,
 0,
 14888,
 13487,
 13616,
 11372,
 9325,
 8263,
 7987,
 7569,
 8168,
 7337,
 7892,
 6936,
 3828,
 6698,
 6686,
 6295,
 6006,
 5676,
 6130,
 6196,
 6166,
 6094,
 5953,
 5840,
 5419,
 5003,
 5771,
 5157,
 5372,
 5246,
 5014,
 4767,
 4544,
 5171,
 4965,
 4808,
 5176,
 4749,
 4322,
 4794,
 4679,
 3220,
 4188,
 4613,
 4474,
 4328,
 4292,
 4264,
 3545,
 3660,
 3784,
 4036,
 3911,
 3745,
 3547,
 3622,
 3573,
 3730,
 3570,
 3514,
 3435,
 3447,
 3597,
 3165,
 3404,
 3525,
 3338,
 3365,
 3447,
 3292,
 3142,
 3348,
 3311,
 2969,
 3265,
 3213,
 2842,
 3066,
 3157,
 3005,
 3000,
 3081,
 2991,
 3083,
 2887,
 2888,
 2678,
 3005,
 2923,
 2905,
 1180,
 2935,
 2917,
 2616,
 2795,
 2932,
 2661,
 2701,
 1951,
 2784,
 2289,
 2720,
 2681,
 2772,
 2613,
 2645,
 1795,
 2565,
 2724,
 2707,
 2745,
 2622,
 2620,
 2674,
 2619,
 2614,
 2361,
 2643,
 2118,
 2544,
 2475,
 2508,
 2036,
 2195,
 2232,
 2492,
 2408,
 2427,
 2031,
 2195,
 2421,
 2487,
 2401,
 2110,
 2428,
 2414,
 2424,
 2295,
 2211,
 2252,
 2246,
 