一、函数

In [1]:
import os
import nltk
import re
import numpy as np
import pandas as pd
from collections import Counter

import sklearn
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVR, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
#nltk.download()

In [42]:
def gettext(filename):
    handle = open(filename, 'r', encoding='UTF-8')
    text = handle.read()
    handle.close()
    return text


def wordlemmatize(tags):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    words_l = []
    for tag in tags:
        if tag[1] and tag[1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            words_l.append(lemmatizer.lemmatize(tag[0], pos='n'))
        elif tag[1] and tag[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            words_l.append(lemmatizer.lemmatize(tag[0], pos='v'))
        elif tag[1] and tag[1] in ['JJ', 'JJR', 'JJS']:
            words_l.append(lemmatizer.lemmatize(tag[0], pos='a'))
        elif tag[1] and tag[1] in ['RB', 'RBR', 'RBS']:
            words_l.append(lemmatizer.lemmatize(tag[0], pos='r'))
        else:
            words_l.append(tag[0])
    return words_l


def F_measure(tags_s):
    Flist = {
        'noum': ['NN', 'NNS', 'NNP', 'NNPS'],
        'adj': ['JJ', 'JJR', 'JJS'],
        'prep': ['IN'],
        'art': ['CD'],
        'pron': ['PRP', 'WP'],
        'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
        'adv': ['RB', 'RBR', 'RBS'],
        'int': ['UH']
    }
    tagshapelist = [0] * len(tags_s)

    for i, tag in enumerate(tags_s):
        tagshapelist[i] = tag[1]
    ff = dict(Counter(tagshapelist))

    a = 0
    b = 0
    for i, poslist in enumerate(Flist.values()):
        if i < 4:
            for pos in poslist:
                if pos in ff.keys():
                    a += ff[pos]
        if i >= 4:
            for pos in poslist:
                if pos in ff.keys():
                    b += ff[pos]
    F_feature = 0.5 * (a - b + 100)
    return F_feature


def Gender_Preferential_Features(words_l):
    GPFlist = ['able', 'al', 'ful', 'ible', 'ic', 'ive', 'less', 'ly', 'ous']
    GPF_feature = [0] * (len(GPFlist) + 1)

    for i, trigger in enumerate(GPFlist):
        flag = [0] * len(words_l)
        for j, word in enumerate(words_l):
            flag[j] = word.endswith(trigger)
        GPF_feature[i] = sum(flag)
    GPF_feature[-1] = words.count('sorry') + words.count('sry')
    if sum(GPF_feature) != 0:
        GPF_feature = np.array(GPF_feature) / sum(GPF_feature)
    return GPF_feature


def Word_Classes_Feature(words_l):
    classlist = {
        'Conversation': [
            'know', 'people', 'think', 'person', 'tell', 'feel', 'friends',
            'talk', 'new', 'talking', 'mean', 'ask', 'understand', 'feelings',
            'care', 'thinking', 'friend', 'relationship', 'realize',
            'question', 'answer', 'saying'
        ],
        'AtHome': [
            'woke', 'home', 'sleep', 'today', 'eat', 'tired', 'wake', 'watch',
            'watched', 'dinner', 'ate', 'bed', 'day', 'house', 'tv', 'early',
            'boring', 'yesterday', 'watching', 'sit'
        ],
        'Family': [
            'years', 'family', 'mother', 'children', 'father', 'kids',
            'parents', 'old', 'year', 'child', 'son', 'married', 'sister',
            'dad', 'brother', 'moved', 'age', 'young', 'months', 'three',
            'wife', 'living', 'college', 'four', 'high', 'five', 'died', 'six',
            'baby', 'boy', 'spend', 'christmas'
        ],
        'Time': [
            'friday', 'saturday', 'weekend', 'week', 'sunday', 'night',
            'monday', 'tuesday', 'thursday', 'wednesday', 'morning',
            'tomorrow', 'tonight', 'evening', 'days', 'afternoon', 'weeks',
            'hours', 'july', 'busy', 'meeting', 'hour', 'month', 'june'
        ],
        'Work': [
            'work', 'working', 'job', 'trying', 'right', 'met', 'figure',
            'meet', 'start', 'better', 'starting', 'try', 'worked', 'idea'
        ],
        'PastActions': [
            'said', 'asked', 'told', 'looked', 'walked', 'called', 'talked',
            'wanted', 'kept', 'took', 'sat', 'gave', 'knew', 'felt', 'turned',
            'stopped', 'saw', 'ran', 'tried', 'picked', 'left', 'ended'
        ],
        'Games': [
            'game', 'games', 'team', 'win', 'play', 'played', 'playing', 'won',
            'season', 'beat', 'final', 'two', 'hit', 'first', 'video',
            'second', 'run', 'star', 'third', 'shot', 'table', 'round', 'ten',
            'chance', 'club', 'big', 'straight'
        ],
        'Internet': [
            'site', 'email', 'page', 'please', 'website', 'web', 'post',
            'link', 'check', 'blog', 'mail', 'information', 'free', 'send',
            'comments', 'comment', 'using', 'internet', 'online', 'name',
            'service', 'list', 'computer', 'add', 'thanks', 'update', 'message'
        ],
        'Location': [
            'street', 'place', 'town', 'road', 'city', 'walking', 'trip',
            'headed', 'front', 'car', 'beer', 'apartment', 'bus', 'area',
            'park', 'building', 'walk', 'small', 'places', 'ride', 'driving',
            'looking', 'local', 'sitting', 'drive', 'bar', 'bad', 'standing',
            'floor', 'weather', 'beach', 'view'
        ],
        'Fun': [
            'fun', 'im', 'cool', 'mom', 'summer', 'awesome', 'lol', 'stuff',
            'pretty', 'ill', 'mad', 'funny', 'weird'
        ],
        'Food/Clothes': [
            'food', 'eating', 'weight', 'lunch', 'water', 'hair', 'life',
            'white', 'wearing', 'color', 'ice', 'red', 'fat', 'body', 'black',
            'clothes', 'hot', 'drink', 'wear', 'blue', 'minutes', 'shirt',
            'green', 'coffee', 'total', 'store', 'shopping'
        ],
        'Poetic': [
            'eyes', 'heart', 'soul', 'pain', 'light', 'deep', 'smile',
            'dreams', 'dark', 'hold', 'hands', 'head', 'hand', 'alone', 'sun',
            'dream', 'mind', 'cold', 'fall', 'air', 'voice', 'touch', 'blood',
            'feet', 'words', 'hear', 'rain', 'mouth'
        ],
        'Books/Movies': [
            'book', 'read', 'reading', 'books', 'story', 'writing', 'written',
            'movie', 'stories', 'movies', 'film', 'write', 'character', 'fact',
            'thoughts', 'title', 'short', 'take', 'wrote'
        ],
        'Religion': [
            'god', 'jesus', 'lord', 'church', 'earth', 'world', 'word',
            'lives', 'power', 'human', 'believe', 'given', 'truth', 'thank',
            'death', 'evil', 'own', 'peace', 'speak', 'bring', 'truly'
        ],
        'Romance': [
            'forget', 'forever', 'remember', 'gone', 'true', 'face', 'spent',
            'times', 'love', 'cry', 'hurt', 'wish', 'loved'
        ],
        'Swearing': [
            'shit', 'fuck', 'fucking', 'ass', 'bitch', 'damn', 'hell', 'sucks',
            'stupid', 'hate', 'drunk', 'crap', 'kill', 'guy', 'gay', 'kid',
            'sex', 'crazy'
        ],
        'Politics': [
            'bush', 'president', 'Iraq', 'kerry', 'war', 'american',
            'political', 'states', 'america', 'country', 'government', 'john',
            'national', 'news', 'state', 'support', 'issues', 'article',
            'michael', 'bill', 'report', 'public', 'issue', 'history', 'party',
            'york', 'law', 'major', 'act', 'fight', 'poor'
        ],
        'Music': [
            'music', 'songs', 'song', 'band', 'cd', 'rock', 'listening',
            'listen', 'show', 'favorite', 'radio', 'sound', 'heard', 'shows',
            'sounds', 'amazing', 'dance'
        ],
        'School': [
            'school', 'teacher', 'class', 'study', 'test', 'finish', 'english',
            'students', 'period', 'paper', 'pass'
        ],
        'Business': [
            'system', 'based', 'process', 'business', 'control', 'example',
            'personal', 'experience', 'general'
        ],
        'Positive': [
            'absolutely', 'abundance', 'ace', 'active', 'admirable', 'adore',
            'agree', 'amazing', 'appealing', 'attraction', 'bargain',
            'beaming', 'beautiful', 'best', 'better', 'boost', 'breakthrough',
            'breeze', 'brilliant', 'brimming', 'charming', 'clean', 'clear',
            'colorful', 'compliment', 'confidence', 'cool', 'courteous',
            'cuddly', 'dazzling', 'delicious', 'delightful', 'dynamic', 'easy',
            'ecstatic', 'efficient', 'enhance', 'enjoy', 'enormous',
            'excellent', 'exotic', 'expert', 'exquisite', 'flair', 'free',
            'generous', 'genius', 'great', 'graceful', 'heavenly', 'ideal',
            'immaculate', 'impressive', 'incredible', 'inspire', 'luxurious',
            'outstanding', 'royal', 'speed', 'splendid', 'spectacular',
            'superb', 'sweet', 'sure', 'supreme', 'terrific', 'treat',
            'treasure', 'ultra', 'unbeatable', 'ultimate', 'unique', 'wow',
            'zest'
        ],
        'Negative': [
            'wrong', 'stupid', 'bad', 'evil', 'dumb', 'foolish', 'grotesque',
            'harm', 'fear', 'horrible', 'idiot', 'lame', 'mean', 'poor',
            'heinous', 'hideous', 'deficient', 'petty', 'awful', 'hopeless',
            'fool', 'risk', 'immoral', 'risky', 'spoil', 'spoiled', 'malign',
            'vicious', 'wicked', 'fright', 'ugly', 'atrocious', 'moron',
            'hate', 'spiteful', 'meager', 'malicious', 'lacking'
        ],
        'Emotion': [
            'aggressive', 'alienated', 'angry', 'annoyed', 'anxious',
            'careful', 'cautious', 'confused', 'curious', 'depressed',
            'determined', 'disappointed', 'discouraged', 'disgusted',
            'ecstatic', 'embarrassed', 'enthusiastic', 'envious', 'excited',
            'exhausted', 'frightened', 'frustrated', 'guilty', 'happy',
            'helpless', 'hopeful', 'hostile', 'humiliated', 'hurt',
            'hysterical', 'innocent', 'interested', 'jealous', 'lonely',
            'mischievous', 'miserable', 'optimistic', 'paranoid', 'peaceful',
            'proud', 'puzzled', 'regretful', 'relieved', 'sad', 'satisfied',
            'shocked', 'shy', 'sorry', 'surprised', 'suspicious', 'thoughtful',
            'undecided', 'withdrawn'
        ]
    }
    WC_feature = [0] * len(classlist)

    for i, ws in enumerate(classlist.values()):
        for w in ws:
            WC_feature[i] += words_l.count(w)
    if sum(WC_feature) != 0:
        WC_feature = np.array(WC_feature) / sum(WC_feature)
    return WC_feature


def CorpusPOS(sentences):
    posTagList = [
        'NN', 'CC', 'LS', 'PDT', 'POS', 'SYM', 'NNS', 'NNP', 'NNPS', 'FW',
        'CD', 'JJ', 'JJR', 'JJS', 'IN', 'TO', 'DT', 'EX', 'PRP', 'PRP$', 'WDT',
        'WP', 'WP$', 'MD', 'VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'RB',
        'RBR', 'RBS', 'RP', 'WRB', 'UH', '.'
    ]
    outfile = open('CorpusPOS.txt', 'w')
    for sentence in sentences:
        tagSentence = ""
        tokensWord = nltk.word_tokenize(sentence)
        textToken = nltk.Text(tokensWord)
        tags = nltk.pos_tag(tokensWord)

        for a, b in tags:
            if b in posTagList:
                tagSentence = tagSentence + b + " "
        tagSentence = tagSentence + "\n"
        outfile.write(tagSentence)

    outfile.close()


def calc_probabilities(cPOS):
    from nltk import ngrams

    unigram_p = {}
    bigram_p = {}
    trigram_p = {}
    fourgram_p = {}
    fivegram_p = {}
    sixgram_p = {}
    sevengram_p = {}

    unigram = {}
    bigram = {}
    trigram = {}
    fourgram = {}
    fivegram = {}
    sixgram = {}
    sevengram = {}
    uni_count = biCount = triCount = fourCount = fiveCount = sixCount = sevenCount = 0

    for sentence in cPOS:
        tokens = sentence.split()

        for word in tokens:
            uni_count += 1

            if word in unigram:
                unigram[word] += 1
            else:
                unigram[word] = 1

        bigram_tuples = tuple(nltk.bigrams(tokens))
        for item in bigram_tuples:
            biCount += 1
            if item in bigram:
                bigram[item] += 1
            else:
                bigram[item] = 1

        trigram_tuples = tuple(nltk.trigrams(tokens))
        for item in trigram_tuples:
            triCount += 1
            if item in trigram:
                trigram[item] += 1
            else:
                trigram[item] = 1

        fourgram_tuples = ngrams(tokens, 4)
        for item in fourgram_tuples:
            fourCount += 1
            if item in fourgram:
                fourgram[item] += 1
            else:
                fourgram[item] = 1

        fivegram_tuples = ngrams(tokens, 5)
        for item in fivegram_tuples:
            fiveCount += 1
            if item in fivegram:
                fivegram[item] += 1
            else:
                fivegram[item] = 1

        sixgram_tuples = ngrams(tokens, 6)
        for item in sixgram_tuples:
            sixCount += 1
            if item in sixgram:
                sixgram[item] += 1
            else:
                sixgram[item] = 1

        sevengram_tuples = ngrams(tokens, 7)
        for item in sevengram_tuples:
            sevenCount += 1
            if item in sevengram:
                sevengram[item] += 1
            else:
                sevengram[item] = 1

    # calculate unigram probability
    for word in unigram:
        temp = [word]
        unigram_p[tuple(temp)] = (float(unigram[word]) / uni_count)

    # calculate bigram probability
    for word in bigram:
        bigram_p[tuple(word)] = (float(bigram[word]) / biCount)

    # calculate trigram probability
    for word in trigram:
        trigram_p[tuple(word)] = (float(trigram[word]) / triCount)

    # calculate fourgram probability
    for word in fourgram:
        fourgram_p[tuple(word)] = (float(fourgram[word]) / fourCount)

    # calculate fivegram probability
    for word in fivegram:
        fivegram_p[tuple(word)] = (float(fivegram[word]) / fiveCount)

    for word in sixgram:
        sixgram_p[tuple(word)] = (float(sixgram[word]) / sixCount)

    for word in sevengram:
        sevengram_p[tuple(word)] = (float(sevengram[word]) / sevenCount)

    return unigram_p, bigram_p, trigram_p, fourgram_p, fivegram_p, sixgram_p, sevengram_p


def q1_output(unigrams, bigrams, trigrams, fourgrams, fivegrams, sixgrams,
              sevengrams):
    #output probabilities
    outfile = open('probabilities.txt', 'w')
    for unigram in unigrams:
        outfile.write(unigram[0] + ':' + str(unigrams[unigram]) + '\n')
    for bigram in bigrams:
        outfile.write(bigram[0] + ' ' + bigram[1] + ':' +
                      str(bigrams[bigram]) + '\n')
    for trigram in trigrams:
        outfile.write(trigram[0] + ' ' + trigram[1] + ' ' + trigram[2] + ':' +
                      str(trigrams[trigram]) + '\n')

    for fourgram in fourgrams:
        outfile.write(fourgram[0] + ' ' + fourgram[1] + ' ' + fourgram[2] +
                      ' ' + fourgram[3] + ':' + str(fourgrams[fourgram]) +
                      '\n')

    for fivegram in fivegrams:
        outfile.write(fivegram[0] + ' ' + fivegram[1] + ' ' + fivegram[2] +
                      ' ' + fivegram[3] + ' ' + fivegram[4] + ':' +
                      str(fivegrams[fivegram]) + '\n')

    for sixgram in sixgrams:
        outfile.write(sixgram[0] + ' ' + sixgram[1] + ' ' + sixgram[2] + ' ' +
                      sixgram[3] + ' ' + sixgram[4] + ' ' + sixgram[5] + ':' +
                      str(sixgrams[sixgram]) + '\n')

    for sevengram in sevengrams:
        outfile.write(sevengram[0] + ' ' + sevengram[1] + ' ' + sevengram[2] +
                      ' ' + sevengram[3] + ' ' + sevengram[4] + ' ' +
                      sevengram[5] + ' ' + sevengram[6] + ':' +
                      str(sevengrams[sevengram]) + '\n')

    outfile.close()


def prob(sequence):
    if sequence in Prob.keys():
        return Prob[sequence]
    else:
        return 0


def fairSCP(sequence):
    numerator = prob(sequence) * prob(sequence)
    sequence = sequence.split()

    denominator = 0

    for j in range(1, len(sequence)):
        seq1 = ""
        seq2 = ""
        cnt = 1

        for tag in sequence:
            if cnt <= j:
                seq1 = seq1 + tag + " "
                cnt += 1
            else:
                seq2 = seq2 + tag + " "

        seq2 = seq2[:-1]
        seq1 = seq1[:-1]

        denominator += prob(seq1) * prob(seq2)

    denominator = denominator * 1.0 / (len(sequence) - 1)

    if denominator == 0:
        return 0.0

    SCP = numerator * 1.0 / denominator

    return SCP


def candidateGen(Fk):
    Ck = []

    for item in Fk:
        for tag in tagList:
            itemTemp = item + " " + tag
            Ck.append(itemTemp)

    return Ck


def minePOSPats(cPOS):
    minSup = 0.3
    minAdherence = 0.2
    C = [{} for i in range(7)]
    F = [[] for i in range(7)]
    SP = [[] for i in range(7)]
    Cand = [[] for i in range(7)]

    Doc = cPOS
    n = len(Doc)

    for post in Doc:
        for tag in tagList:
            if tag in post:
                if tag in C[0].keys():
                    C[0][tag] += 1
                else:
                    C[0][tag] = 1

    for a in C[0]:
        if C[0][a] * 1.0 / n >= minSup:
            F[0].append(a)

    SP[0] = F[0]
    temp = {}
    for k in range(1, 7):
        Cand[k] = candidateGen(F[k - 1])
        for post in Doc:
            for candidate in Cand[k]:
                if candidate in post:
                    if candidate in C[k].keys():
                        C[k][candidate] += 1
                    else:
                        C[k][candidate] = 1

        for a in C[k]:
            if C[k][a] * 1.0 / n >= minSup:
                F[k].append(a)

        for a in F[k]:
            if fairSCP(a) >= minAdherence:
                SP[k].append(a)

    SPFinal = []
    SPFinal = SP[0] + SP[1] + SP[2] + SP[3] + SP[4] + SP[5] + SP[6]

    return SPFinal

二、POS Sequence Pattern 挖掘

In [13]:
tagList = [
    'NN', 'CC', 'LS', 'PDT', 'POS', 'SYM', 'NNS', 'NNP', 'NNPS', 'FW', 'CD',
    'JJ', 'JJR', 'JJS', 'IN', 'TO', 'DT', 'EX', 'PRP', 'PRP$', 'WDT', 'WP',
    'WP$', 'MD', 'VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'RB', 'RBR', 'RBS',
    'RP', 'WRB', 'UH', '.'
]

for gender in [0, 1]:
    if gender == 0:
        txtDir = './postprocess_blogs/blogs/female/'
    else:
        txtDir = './postprocess_blogs/blogs/male/'

    print("Processing gender: {}".format(txtDir))
    blogs_gender = os.listdir(txtDir)
    for i in range(0, len(blogs_gender)):
        m = blogs_gender[i]
        print("Processing: ", m)
        print("Files:", len(os.listdir(txtDir + m)))
        for file in os.listdir(txtDir + m):
            text = gettext(txtDir + m + '/' + file)
            sentences = nltk.sent_tokenize(text)
            CorpusPOS(sentences)

infile = open('CorpusPOS.txt', 'r')
cPOS = infile.readlines()
infile.close()
(a, b, c, d, e) = calc_probabilities(cPOS)
q1_output(a, b, c, d, e)

Prob = {}
infile = open('probabilities.txt', 'r')
prob_text = infile.readlines()

for sentence in prob_text:
    keyValPair = sentence.split(":")
    Prob[keyValPair[0]] = float(keyValPair[1][:-1])

infile.close()

posFeatures = minePOSPats(cPOS)

Processing gender: ./postprocess_blogs/blogs/female/
Processing:  Chemicals
Files: 23
Processing:  Fashion
Files: 73
Processing:  Banking
Files: 47
Processing:  Construction
Files: 22
Processing:  Student
Files: 2479
Processing:  InvestmentBanking
Files: 10
Processing:  Engineering
Files: 70
Processing:  Arts
Files: 419
Processing:  Technology
Files: 180
Processing:  Consulting
Files: 73
Processing:  Internet
Files: 101
Processing:  Sports-Recreation
Files: 36
Processing:  Religion
Files: 43
Processing:  BusinessServices
Files: 81
Processing:  Automotive
Files: 17
Processing:  Tourism
Files: 54
Processing:  Non-Profit
Files: 194
Processing:  Museums-Libraries
Files: 33
Processing:  indUnk
Files: 3961
Processing:  Communications-Media
Files: 209
Processing:  Military
Files: 32
Processing:  Telecommunications
Files: 40
Processing:  Advertising
Files: 75
Processing:  Biotech
Files: 20
Processing:  Science
Files: 84
Processing:  Environment
Files: 14
Processing:  Government
Files: 95
Proce

In [14]:
posFeatures

['NN',
 'CC',
 'PDT',
 'NNS',
 'FW',
 'CD',
 'JJ',
 'JJR',
 'JJS',
 'IN',
 'TO',
 'DT',
 'EX',
 'PRP',
 'PRP$',
 'WDT',
 'WP',
 'WP$',
 'MD',
 'VB',
 'VBZ',
 'VBP',
 'VBD',
 'VBN',
 'VBG',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'WRB',
 'UH',
 'TO VB',
 'RBR DT NN JJ EX']

三、 feature engineering

In [21]:
names = []
F_features = []
GRF_features = []
WC_features = []
POS_features = []
labels = []

for gender in [0, 1]:
    if gender == 0:
        txtDir = './postprocess_blogs/blogs/female/'
    else:
        txtDir = './postprocess_blogs/blogs/male/'

    print("Processing gender: {}".format(txtDir))
    blogs_gender = os.listdir(txtDir)
    for i in range(0, len(blogs_gender)):
        m = blogs_gender[i]
        print("Processing: ", m)
        print("Files:", len(os.listdir(txtDir + m)))
        for file in os.listdir(txtDir + m):
            name = txtDir + m + '/' + file
            text = gettext(name)
            words = getwords(text)
            sentences = nltk.sent_tokenize(text)
            words_s = split(words)
            tags = [nltk.pos_tag(word) for word in words]
            tags_s = split(tags)
            words_l = wordlemmatize(tags_s)

            F_feature = F_measure(tags_s)
            GRF_feature = Gender_Preferential_Features(words_l)
            WC_feature = Word_Classes_Feature(words_l)

            textTags = ""
            for word, tag in tags_s:
                if tag in tagList:
                    textTags = textTags + tag + " "

            POS_feature = []

            for feature in posFeatures:
                if feature in textTags:
                    POS_feature.append(1)
                else:
                    POS_feature.append(0)

            names.append(name)
            F_features.append(F_feature)
            GRF_features.append(GRF_feature)
            WC_features.append(WC_feature)
            POS_features.append(POS_feature)
            labels.append(gender)

Processing gender: ./postprocess_blogs/blogs/female/
Processing:  Chemicals
Files: 23
Processing:  Fashion
Files: 73
Processing:  Banking
Files: 47
Processing:  Construction
Files: 22
Processing:  Student
Files: 2479
Processing:  InvestmentBanking
Files: 10
Processing:  Engineering
Files: 70
Processing:  Arts
Files: 419
Processing:  Technology
Files: 180
Processing:  Consulting
Files: 73
Processing:  Internet
Files: 101
Processing:  Sports-Recreation
Files: 36
Processing:  Religion
Files: 43
Processing:  BusinessServices
Files: 81
Processing:  Automotive
Files: 17
Processing:  Tourism
Files: 54
Processing:  Non-Profit
Files: 194
Processing:  Museums-Libraries
Files: 33
Processing:  indUnk
Files: 3961
Processing:  Communications-Media
Files: 209
Processing:  Military
Files: 32
Processing:  Telecommunications
Files: 40
Processing:  Advertising
Files: 75
Processing:  Biotech
Files: 20
Processing:  Science
Files: 84
Processing:  Environment
Files: 14
Processing:  Government
Files: 95
Proce

In [32]:
def getsingle(features, n):
    single = []
    for item in features:
        single.append(item[n])
    return single


WC_features_l = []
for i in range(23):
    n = i
    WC_features_l.append(getsingle(WC_features, n))

GRF_features_l = []
for i in range(10):
    n = i
    GRF_features_l.append(getsingle(GRF_features, n))

POS_features_l = []
for i in range(33):
    n = i
    POS_features_l.append(getsingle(POS_features, n))

In [40]:
map1 = {'name': names, 'label': labels, 'F_feature': F_features}

for i in range(23):
    key = 'WC_' + str(i + 1)
    value = WC_features_l[i]
    map1[key] = value

for i in range(10):
    key = 'GRF_' + str(i + 1)
    value = GRF_features_l[i]
    map1[key] = value

for i in range(33):
    key = 'POS_' + str(i + 1)
    value = POS_features_l[i]
    map1[key] = value

allofall = pd.DataFrame(map1)

In [47]:
F_features_u = np.array(F_features)
F_features_u = (F_features_u - np.mean(F_features_u)) / np.std(F_features_u)
allofall['F_feature'] = F_features_u

In [4]:
#allofall.to_csv('allofall.csv',index = False)
allofall = pd.read_csv('allofall.csv')
df_per_txt = pd.read_csv('blogs_genderbias.csv')

In [5]:
allofall['word ratio'], allofall['bias'] = df_per_txt['word ratio'], df_per_txt['bias']
allofall

Unnamed: 0,name,label,F_feature,WC_1,WC_2,WC_3,WC_4,WC_5,WC_6,WC_7,...,POS_26,POS_27,POS_28,POS_29,POS_30,POS_31,POS_32,POS_33,word ratio,bias
0,./postprocess_blogs/blogs/female/Chemicals/344...,0,-0.169566,0.107692,0.112821,0.042735,0.104274,0.061538,0.010256,0.034188,...,1,1,0,1,1,0,1,0,5.000000e-01,0.289545
1,./postprocess_blogs/blogs/female/Chemicals/407...,0,-0.293140,0.149425,0.022989,0.080460,0.011494,0.034483,0.000000,0.011494,...,1,0,0,1,1,0,1,0,1.000000e+00,0.000000
2,./postprocess_blogs/blogs/female/Chemicals/340...,0,1.948334,0.112903,0.085253,0.086790,0.035330,0.052227,0.009985,0.035330,...,1,1,1,1,1,1,1,0,6.502463e-01,0.940899
3,./postprocess_blogs/blogs/female/Chemicals/339...,0,-0.295692,0.105000,0.115000,0.075000,0.065000,0.035000,0.015000,0.055000,...,1,0,0,1,1,0,1,0,4.347826e-01,0.810930
4,./postprocess_blogs/blogs/female/Chemicals/398...,0,-0.282569,0.158273,0.100719,0.028777,0.043165,0.043165,0.014388,0.000000,...,1,1,0,1,1,0,1,0,5.000000e-01,0.133531
5,./postprocess_blogs/blogs/female/Chemicals/364...,0,0.948802,0.134003,0.038848,0.067220,0.034046,0.046704,0.004801,0.074640,...,1,1,1,1,1,1,1,0,5.824561e-01,0.541787
6,./postprocess_blogs/blogs/female/Chemicals/409...,0,-0.241013,0.157480,0.051181,0.043307,0.094488,0.055118,0.019685,0.039370,...,1,1,0,1,1,1,1,0,7.419355e-01,1.165503
7,./postprocess_blogs/blogs/female/Chemicals/716...,0,-0.285121,0.117647,0.176471,0.035294,0.117647,0.023529,0.000000,0.035294,...,1,0,1,1,1,0,1,0,7.500001e-01,0.000000
8,./postprocess_blogs/blogs/female/Chemicals/405...,0,-0.265072,0.139738,0.122271,0.013100,0.052402,0.030568,0.000000,0.034934,...,1,0,1,1,1,0,1,0,6.000000e-01,0.810930
9,./postprocess_blogs/blogs/female/Chemicals/380...,0,-0.253772,0.114379,0.104575,0.068627,0.173203,0.055556,0.009804,0.032680,...,1,1,1,1,1,1,1,0,5.500000e-01,0.154151


In [57]:
data, target = allofall.drop(
    columns=['name', 'label']).iloc[:].values, allofall['label'].values
train_X, test_X, train_y, test_y = train_test_split(data,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    shuffle=True)
data, target = pd.concat(
    (pd.DataFrame(train_X), pd.DataFrame(test_X))), pd.concat(
        (pd.DataFrame(train_y), pd.DataFrame(test_y)))
data1 = data.iloc[:, :63]

In [58]:
from sklearn.model_selection import cross_val_score

rnd_clf = RandomForestClassifier(n_estimators=800,
                                 max_leaf_nodes=50,
                                 n_jobs=-1)
scores_rnd_clf_cv = cross_val_score(rnd_clf,data1.values,target1.values.reshape(-1,),cv=5)
print(scores_rnd_clf_cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rnd_clf_cv.mean(), scores_rnd_clf_cv.std() * 2))

[0.7025 0.705  0.7225 0.755  0.7525]
Accuracy: 0.73 (+/- 0.05)


In [None]:
rnd_clf = LogisticRegression(solver="liblinear", random_state=42)
scores_rnd_clf_cv = cross_val_score(rnd_clf,data1.values,target.values.reshape(-1,),cv=5)
print(scores_rnd_clf_cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rnd_clf_cv.mean(), scores_rnd_clf_cv.std() * 2))

In [64]:
rnd_clf = SVC(kernel='rbf', gamma=0.1, probability=True, C=1000)
scores_rnd_clf_cv = cross_val_score(rnd_clf,data1.values,target.values.reshape(-1,),cv=5)
print(scores_rnd_clf_cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rnd_clf_cv.mean(), scores_rnd_clf_cv.std() * 2))

[0.685 0.735 0.72  0.735 0.7  ]
Accuracy: 0.71 (+/- 0.04)


In [66]:
rnd_clf = sklearn.tree.DecisionTreeClassifier(max_leaf_nodes=50)
scores_rnd_clf_cv = cross_val_score(rnd_clf,data.values,target.values.reshape(-1,),cv=5)
print(scores_rnd_clf_cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rnd_clf_cv.mean(), scores_rnd_clf_cv.std() * 2))

[0.69   0.715  0.7025 0.755  0.7125]
Accuracy: 0.71 (+/- 0.04)


In [21]:
rnd_clf = RandomForestClassifier(n_estimators=1000,
                                 max_leaf_nodes=32,
                                 n_jobs=-1)
rnd_clf.fit(train_X, train_y)

'''svr_clf = SVR(kernel='rbf', gamma=2.0, C=10)
svr_clf.fit(train_X, train_y)

svc_clf = SVC(kernel='rbf', gamma=2.0, probability=True, C=10)
svc_clf.fit(train_X, train_y)

svc_poly_clf = SVC(kernel="poly", probability=True, degree=3, coef0=1, C=5)
svc_poly_clf.fit(train_X, train_y)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(train_X, train_y)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('svc', svcclf),
                                          ('svc_poly', svc_poly_clf)],
                              voting='soft')
voting_clf.fit(train_X, train_y)'''

'svr_clf = SVR(kernel=\'rbf\', gamma=2.0, C=10)\nsvr_clf.fit(train_X, train_y)\n\nsvc_clf = SVC(kernel=\'rbf\', gamma=2.0, probability=True, C=10)\nsvc_clf.fit(train_X, train_y)\n\nsvc_poly_clf = SVC(kernel="poly", probability=True, degree=3, coef0=1, C=5)\nsvc_poly_clf.fit(train_X, train_y)\n\nlog_clf = LogisticRegression(solver="liblinear", random_state=42)\nlog_clf.fit(train_X, train_y)\n\nvoting_clf = VotingClassifier(estimators=[(\'lr\', log_clf), (\'svc\', svcclf),\n                                          (\'svc_poly\', svc_poly_clf)],\n                              voting=\'soft\')\nvoting_clf.fit(train_X, train_y)'

In [22]:
y_pred = rnd_clf.predict(test_X)
accuracy_score(test_y, y_pred)

0.775

In [22]:
param_grid = [
    {
        'kernel': ['poly'],
        'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'degree': [2, 3, 4, 5, 6]
    },
    {
        'kernel': ['rbf'],
        'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]
    },
]

svc_clf_gs = SVC()
grid_search = GridSearchCV(svc_clf_gs,
                           param_grid,
                           cv=10,
                           scoring='accuracy',
                           verbose=2,
                           n_jobs=-1)
grid_search.fit(data, target)

Fitting 10 folds for each of 77 candidates, totalling 770 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 285 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 568 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 770 out of 770 | elapsed:   13.4s finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'degree': [2, 3, 4, 5, 6], 'kernel': ['poly']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=2)

In [23]:
cvres = grid_search.cv_results_
for score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(score, params)

0.6835 {'C': 1.0, 'degree': 2, 'kernel': 'poly'}
0.672 {'C': 1.0, 'degree': 3, 'kernel': 'poly'}
0.6575 {'C': 1.0, 'degree': 4, 'kernel': 'poly'}
0.64 {'C': 1.0, 'degree': 5, 'kernel': 'poly'}
0.6245 {'C': 1.0, 'degree': 6, 'kernel': 'poly'}
0.706 {'C': 3.0, 'degree': 2, 'kernel': 'poly'}
0.703 {'C': 3.0, 'degree': 3, 'kernel': 'poly'}
0.6885 {'C': 3.0, 'degree': 4, 'kernel': 'poly'}
0.6715 {'C': 3.0, 'degree': 5, 'kernel': 'poly'}
0.6575 {'C': 3.0, 'degree': 6, 'kernel': 'poly'}
0.728 {'C': 10.0, 'degree': 2, 'kernel': 'poly'}
0.7195 {'C': 10.0, 'degree': 3, 'kernel': 'poly'}
0.709 {'C': 10.0, 'degree': 4, 'kernel': 'poly'}
0.7 {'C': 10.0, 'degree': 5, 'kernel': 'poly'}
0.69 {'C': 10.0, 'degree': 6, 'kernel': 'poly'}
0.73 {'C': 30.0, 'degree': 2, 'kernel': 'poly'}
0.726 {'C': 30.0, 'degree': 3, 'kernel': 'poly'}
0.7275 {'C': 30.0, 'degree': 4, 'kernel': 'poly'}
0.717 {'C': 30.0, 'degree': 5, 'kernel': 'poly'}
0.708 {'C': 30.0, 'degree': 6, 'kernel': 'poly'}
0.734 {'C': 100.0, 'degree'

In [43]:
svc_best = [0.6989130434782609, {'C': 1000.0, 'degree': 2, 'kernel': 'poly'}]


train_X, test_X= train_X[:,:44], test_X[:,:44]


svc_poly_clf = SVC(kernel="poly", probability=True, degree=2, coef0=1, C=1000)
svc_poly_clf.fit(train_X, train_y)



SVC(C=1000, cache_size=200, class_weight=None, coef0=1,
    decision_function_shape='ovr', degree=2, gamma='auto_deprecated',
    kernel='poly', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [44]:
y_pred = svc_poly_clf.predict(test_X)
accuracy_score(test_y, y_pred)

0.7039337474120083

In [None]:
param_grid = [
    {
        'kernel': ['poly'],
        'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'degree': [2, 3, 4, 5, 6]
    },
    {
        'kernel': ['rbf'],
        'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]
    },
]

svc_clf_gs = SVC()
grid_search = GridSearchCV(svc_clf_gs,
                           param_grid,
                           cv=10,
                           scoring='accuracy',
                           verbose=2,
                           n_jobs=-1)
grid_search.fit(data, target)

In [7]:
data.shape

(19320, 69)

In [9]:
import keras
import keras.backend as K
from keras import models, layers, regularizers, optimizers

K.clear_session()

model = models.Sequential()
model.add(layers.Dense(8, activation='relu', input_shape=(65,),))
model.add(layers.Dense(1, activation='sigmoid',))
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 537
Trainable params: 537
Non-trainable params: 0
_________________________________________________________________


In [10]:
callbacks_list = [
                keras.callbacks.EarlyStopping(
                monitor='val_acc',
                patience=10,
                ),
keras.callbacks.ModelCheckpoint(
                filepath='my_model.h5',
                monitor='val_loss',
                save_best_only=True,
                )
]

history = model.fit(data, target, batch_size=128, epochs=100, verbose = 1, 
                    validation_split=0.1, callbacks=callbacks_list, shuffle=True)

Train on 1800 samples, validate on 200 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


In [16]:
data1 = data[:,:44] 
data2 = data[:,-2:]

In [18]:
data = np.concatenate((data1,data2),axis=1)

In [3]:
df_per_txt = pd.read_csv('blogs_genderbias.csv')
df_per_txt

Unnamed: 0,bias,gender,word ratio
0,0.289545,0.0,5.000000e-01
1,0.000000,0.0,1.000000e+00
2,0.940899,0.0,6.502463e-01
3,0.810930,0.0,4.347826e-01
4,0.133531,0.0,5.000000e-01
5,0.541787,0.0,5.824561e-01
6,1.165503,0.0,7.419355e-01
7,0.000000,0.0,7.500001e-01
8,0.810930,0.0,6.000000e-01
9,0.154151,0.0,5.500000e-01


In [20]:
data.shape

(19320, 46)

In [25]:
allofall.to_csv('blogs_features.csv')

In [None]:
tagList = [
    'NN', 'CC', 'LS', 'PDT', 'POS', 'SYM', 'NNS', 'NNP', 'NNPS', 'FW', 'CD',
    'JJ', 'JJR', 'JJS', 'IN', 'TO', 'DT', 'EX', 'PRP', 'PRP$', 'WDT', 'WP',
    'WP$', 'MD', 'VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'RB', 'RBR', 'RBS',
    'RP', 'WRB', 'UH', '.'
]

for gender in [0, 1]:
    if gender == 0:
        txtDir = './2000novel/female/'
    else:
        txtDir = './2000novel/male/'

    print("Processing gender: {}".format(txtDir))
    blogs_gender = os.listdir(txtDir)
    print("Files:", len(blogs_gender))
    for m in blogs_gender:
        text = gettext(txtDir + m)
        sentences = nltk.sent_tokenize(text)
        CorpusPOS(sentences)

infile = open('CorpusPOS.txt', 'r')
cPOS = infile.readlines()
infile.close()

(a, b, c, d, e, f, g) = calc_probabilities(cPOS)
q1_output(a, b, c, d, e, f, g)

Prob = {}
infile = open('probabilities.txt', 'r')
prob_text = infile.readlines()

for sentence in prob_text:
    keyValPair = sentence.split(":")
    Prob[keyValPair[0]] = float(keyValPair[1][:-1])

infile.close()

posFeatures = minePOSPats(cPOS)

Processing gender: ./2000novel/female/
Files: 1000


In [9]:
names = []
F_features = []
GRF_features = []
WC_features = []
POS_features = []
labels = []

for gender in [0, 1]:
    if gender == 0:
        txtDir = './postprocess_2000novel/female/'
    else:
        txtDir = './postprocess_2000novel/male/'

    print("Processing gender: {}".format(txtDir))
    blogs_gender = os.listdir(txtDir)
    print("Files:", len(blogs_gender))
    for m in blogs_gender:
        name = txtDir + m
        text = gettext(name)
        words = nltk.word_tokenize(text)
        sentences = nltk.sent_tokenize(text)
        tags = nltk.pos_tag(words)
        words_l = wordlemmatize(tags)

        F_feature = F_measure(tags)
        GRF_feature = Gender_Preferential_Features(words_l)
        WC_feature = Word_Classes_Feature(words_l)

        textTags = ""
        for word, tag in tags_s:
            if tag in tagList:
                textTags = textTags + tag + " "

        POS_feature = []

        for feature in posFeatures:
            if feature in textTags:
                POS_feature.append(1)
            else:
                POS_feature.append(0)
        names.append(name)
        F_features.append(F_feature)
        GRF_features.append(GRF_feature)
        WC_features.append(WC_feature)
        POS_features.append(POS_feature)
        labels.append(gender)
        

Processing gender: ./postprocess_2000novel/female/
Files: 1000
Processing gender: ./postprocess_2000novel/male/
Files: 1000


In [10]:
def getsingle(features, n):
    single = []
    for item in features:
        single.append(item[n])
    return single


WC_features_l = []
for i in range(len(WC_features[0])):
    n = i
    WC_features_l.append(getsingle(WC_features, n))

GRF_features_l = []
for i in range(len(GRF_features[0])):
    n = i
    GRF_features_l.append(getsingle(GRF_features, n))

POS_features_l = []
for i in range(len(POS_features[0])):
    n = i
    POS_features_l.append(getsingle(POS_features, n))

In [14]:
map1 = {'name': names, 'label': labels, 'F_feature': F_features}

for i in range(len(WC_features[0])):
    key = 'WC_' + str(i + 1)
    value = WC_features_l[i]
    map1[key] = value

for i in range(len(GRF_features[0])):
    key = 'GRF_' + str(i + 1)
    value = GRF_features_l[i]
    map1[key] = value

for i in range(len(POS_features[0])):
    key = 'POS_' + str(i + 1)
    value = POS_features_l[i]
    map1[key] = value

allofall = pd.DataFrame(map1)

F_features_u = np.array(F_features)
F_features_u = (F_features_u - np.mean(F_features_u)) / np.std(F_features_u)
allofall['F_feature'] = F_features_u

allofall.to_csv('allofall_novel.csv',index = False)
#allofall = pd.read_csv('allofall.csv')
#df_per_txt = pd.read_csv('blogs_genderbias.csv')

In [18]:
df_per_txt

Unnamed: 0,bias,gender,word ratio
0,0.542212,0.0,0.454310
1,0.585030,0.0,0.607524
2,0.485678,0.0,0.571576
3,0.600768,0.0,0.370205
4,0.495262,0.0,0.569502
5,0.433581,0.0,0.449752
6,0.549476,0.0,0.423285
7,1.027705,0.0,0.714715
8,1.102950,0.0,0.757576
9,0.768278,0.0,0.537118


In [17]:
df_per_txt = pd.read_csv('novel_genderbias.csv')

In [19]:
allofall['bias'], allofall['word ratio'] = df_per_txt['bias'], df_per_txt['word ratio']

In [4]:
#allofall.to_csv('novel_features.csv',index=False)
allofall = pd.read_csv('novel_features.csv')