In [481]:
import sqlite3
from sqlite3 import Error

# For Linguistics Computation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
import math
import re

# Relevant stopwords
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [482]:
def create_connection(db_file):
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)
    return conn

In [483]:
def get_fanfiction(conn):
    sql_select = """SELECT id, tags, characters FROM fanfiction LIMIT 100"""
    cur = conn.cursor()
    fanfictions_execute = cur.execute(sql_select)
    fanfictions = fanfictions_execute.fetchall()
    conn.commit()
    return fanfictions

In [484]:
def get_text(conn, id):
    sql_select = """SELECT body FROM fanfiction WHERE id = '{}'""".format(id)
    cur = conn.cursor()
    texts_execute = cur.execute(sql_select)
    texts = texts_execute.fetchall()
    conn.commit()
    return texts[0][0]

In [485]:
def clean_text(string):
    string = re.sub("\n", "", string)
    string = re.sub("[0-9]", " ", string)
    string = re.sub("\.\.", " ", string)
    string = re.sub("\(.*?\)", " ", string)
    string = re.sub("[\*| , |<| - |+]~_", " ", string)
    string = re.sub(" [a-z|A-Z] ", " ", string)
    string = re.sub("-", "", string)
    string = re.sub("'", "", string)
    string = re.sub("/", " ", string)
    string = re.sub("\|", " ", string)
    string = re.sub("[a-z|A-Z]\.[a-z|A-Z]\.", " ", string)
    string = re.sub("[hH][Tt]{1,3}[pP]", " ", string)
    string = re.sub("(\.)(\S)", "\g<1> \g<2>", string)
    string = re.sub("\b[HhAaOoIiEeUuKkSs]{3,}\b", " ", string)
        
    countnew = 10
    for i in range(1, countnew):
        string = re.sub(" \. ", " ", string)
    for s in range(1, countnew):
        string = re.sub("  ", " ", string)
    
    return string

In [486]:
def gen_normtokens(text, characters):
    lemma = WordNetLemmatizer()
    return_tokens = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        sentence_pos = nltk.pos_tag(nltk.word_tokenize(sentence))
        for token, pos in sentence_pos:
            if "NN" in pos:
                return_tokens.append(lemma.lemmatize(token.lower(), pos = "n"))
    tokens_no_characters = list(set(return_tokens) - set(characters))
    return tokens_no_characters

In [487]:
database = r"fanfiction.db"
conn = create_connection(database)
all_tokens = nltk.FreqDist()
dictonary = {}
fanfictions = get_fanfiction(conn)

In [488]:
# Number of Fanfictions
print(f'There are {len(fanfictions)} Fanfiction is the database.')

There are 100 Fanfiction is the database.


In [489]:
for id, tags, characters in fanfictions:
    print("Processing Fanfiction {}/{}".format(id, len(fanfictions)), end='\r') #
    authortags = tags.split(', ')
    characters_singletokens = re.split(', | ', characters)
    characters = characters.split(', ')
    text = get_text(conn, id)
    text = clean_text(text)
    tokens = gen_normtokens(text, characters_singletokens)
    all_tokens.update(tokens)
    tokens_freq = nltk.FreqDist()
    tokens_freq.update(tokens)
    dictonary[id] = tokens_freq

Processing Fanfiction 100/100

In [490]:
# Length of the Dictionary
print(f'All Tokens = {all_tokens.N()}')

# Preview of the Dictionary
print(f'Most Common Token = {all_tokens.max()}')

All Tokens = 124523
Most Common Token = eye


In [491]:
nr_tokens_ref = all_tokens.N()

for id, tokens_freq in dictonary.items():
    nr_tokens_documents = tokens_freq.N()
    c = float(nr_tokens_ref) / float(nr_tokens_documents)
    word_rel_freq = {lemma: math.log(c * tokens_freq.get(lemma) / all_tokens.get(lemma,1)) for lemma in tokens_freq}
    word_rel_freq = sorted(word_rel_freq.items(), key=lambda x: x[1], reverse=True)
    result_list = []
    for key, value in word_rel_freq[0:50]:
        result_list.append(key)
    print("The main terms for the Fanfiction {}: {}".format(id, result_list))

The main terms for the Fanfiction 1: ['bilbo', 'endearing', 'stressor', 'sag', 'twentyeightyearold', 'sheshes', 'pentup', 'delayed', 'abide', 'wandlighting', 'howell', 'teabags', 'sstay', 'patdown', 'fullmoon', 'waning', 'janey', 'howells', 'wouldbeassailant', 'queenie', 'snarling', 'xxxx', 'palpitation', 'meatloaf', 'jerry', 'darlin', 'rolloff', 'weatherbeaten', 'underwriter', 'vexing', 'ssomeone', 'misfortunate', 'toothpick', 'twobedroom', 'offbalance', 'butterknife', 'tinas', 'whitehot', 'tramp', 'scarylooking', 'rainfall', 'painfullooking', 'modernday', 'warbling', 'pseudokindness', 'chancea', 'acidity', 'halfsob', 'eek', 'lipgloss']
The main terms for the Fanfiction 2: ['scattered', 'clunk', 'aerodynamics', 'janes', 'papercut', 'spiderweb', 'wham', 'burner', 'hoax', 'committing', 'oar', 'dumbridge', 'coon', '_lane', 'ribbit', 'testimonial', 'umbiti', 'anticoagulant', 'adultery', 'birch', 'amiright', 'vicious', 'aragog', 'speedy', 'ak', 'inquisitor', 'titus', 'dumbles', 'maine', '~

The main terms for the Fanfiction 39: ['blog', 'invito', 'welcomingly', 'patermal', 'septimus', 'enamel', 'graviditate', 'parseltounge', 'parentis', 'governers', 'lyrs', 'resides', 'torturing', 'relieving', 'pointedness', 'depop', 'vowing', 'greyclaw', 'lyra', 'sandcastles', 'username', 'placenta', 'prepared', 'drownings', 'hailey', 'painrelieving', 'malfoyweasley', 'memorias', 'parselmagic', 'labour', 'sell', 'adjoining', 'honourable', 'matrimonium', 'uuncle', 'unharmed', 'reponere', 'haileys', 'adopted', 'expectant', 'aapparation', 'pmakepeace', 'daugther', 'hidam', 'abortion', 'screenplay', 'creative', 'exfriends', 'invincible', 'grandfathter']
The main terms for the Fanfiction 40: ['virgin', 'ccf', 'sahara', 'confide', 'app', 'meatball', 'tebe', 'coldest', 'sstop', 'ivanovich', 'ia', 'derry', 'nikolai', 'breathy', 'consiglieres', 'mafia', 'ga_submit_new=', 'bunnelbby', 'zambini', 'uum', 'suffers', 'bulkas', 'gauthier', 'maf', 'tupperware', 'homophobe', 'dobroe', 'ravenette', 'raven

The main terms for the Fanfiction 61: ['zoe', 'fredos', 'poetic', 'zendaya', 'musky', 'rye', 'oma', 'ceramic', 'kali', 'sec', 'stepdad', 'continued', 'soundtrack', 'faffing', 'lilyrose', 'chloe', 'original', 'chuck', 'checkout', 'fivetruthchapter', 'lestrangeevans', 'gossiping', 'letas', 'downsizing', 'danny', 'noneno', 'piercings', 'thoma', 'etc', 'grandaughter', 'instead', 'leta', 'certified', 'ooowoahhhhhh', 'mushy', 'squinting', 'commencing', 'gnocchi', 'mri', 'blender', 'thomasis', 'buellers', 'depp', 'creating', 'anot', 'ft', 'nan', 'g', 'dulcet', 'dionnes']
The main terms for the Fanfiction 62: ['allstars', 'los', 'elevenhours', 'pavilion', 'angeles', 'marley', 'marlee', 'indiscreet', 'tulle', 'diego', 'spiderman', 'studio', 'hugger', 'instagram', 'san', 'marlenes', 'twitter', 'pinpoint', 'cab', 'comic', 'parker', 'lils', 'itch', 'noticing', 'mckinnon', 'convention', 'firsthand', 'jane', 'hairstyle', 'backpack', 'ledge', 'con', 'costume', 'speaker', 'along', 'panel', 'holy', 'ma

The main terms for the Fanfiction 95: ['seein', 'henish', 'thea', 'yalls', 'timealso', 'xxxxx', 'joked', 'greene', 'thoughtokie', 'monotonous', 'yalloh', 'moldyshorts', 'cubby', 'comadream', 'yee', 'schooler', 'reeked', 'meme', 'ahhhh', 'mraz', 'cringey', 'slifai', 'budum', 'momento', 'dissentrium', 'snd', 'theodosia', 'ummmm', 'vyr', 'overthinky', 'celsius', 'dayglowbest', 'uwus', 'starbursts', 'fanfics', 'mac', 'fox', 'brainpower', 'protagonist', 'universal', 'byeeetake', 'lakehouse', 'maestro', 'spintwitches', 'waiver', 'brutally', 'warsaw', 'joyful', 'rapist', 'riveting']
The main terms for the Fanfiction 96: ['ramc', 'zaitzevs', 'surreal', 'soothers', 'gilt', 'holmes', 'mugglebuilt', 'smallwood', 'gunpowder', 'cancel', 'hawkes', 'legwork', 'edwin', 'adage', 'syndrome', 'vignette', 'quandary', 'whooohoot', 'horseshit', 'antheas', 'chauffeur', 'tailoring', 'despatch', 'adler', 'talbot', 'iceman', 'plummeting', 'deduces', 'ilvermony', 'dan', 'zaitzev', 'polyester', 'flatshare', 'matr