In [199]:
import numpy as np
import pandas as pd
from simpletransformers.language_modeling import LanguageModelingModel
from simpletransformers.language_generation import LanguageGenerationModel
import warnings
import pickle
import re
warnings.filterwarnings('ignore')

In [200]:
def clean(post):
    post_split = post.split('|||')
    # split the kaggle data set posts by |||
    post_split_split = [x.split(' ') for x in post_split]
    
    # removes any 'words' that have http:// or https:// in them
    return_list = [[item for item in sentence if ('http://' not in item and 'https://' not in item)] for sentence in post_split_split]
    
    # returns a list of posts if they are not empty after removing the links
    return [' '.join(sentence) for sentence in return_list if sentence]

In [230]:
df = pd.read_csv('../data/mbti_1.csv')

In [231]:
df_dict = {'INTJ':0, 'INTP':0, 'ENTJ':1, 'ENTP':1, 
           'INFJ':0, 'INFP':0, 'ENFJ':1, 'ENFP':1, 
           'ISTJ':0, 'ISFJ':0, 'ESTJ':1, 'ESFJ':1, 
           'ISTP':0, 'ISFP':0, 'ESTP':1, 'ESFP':1}

In [232]:
for dataframe in df_dict.keys():
    if(df[df['type'] == dataframe].shape[0] < 200):
        df_dict[dataframe] = df[df['type'] == dataframe]
    else:
        df_dict[dataframe] = df[df['type'] == dataframe].sample(n=200)
    df_dict[dataframe]['post_split'] = df_dict[dataframe].posts.apply(clean)
    print(dataframe, df_dict[dataframe].shape)

INTJ (200, 2)
INTP (200, 2)
ENTJ (200, 2)
ENTP (200, 2)
INFJ (200, 2)
INFP (200, 2)
ENFJ (190, 2)
ENFP (200, 2)
ISTJ (200, 2)
ISFJ (166, 2)
ESTJ (39, 2)
ESFJ (42, 2)
ISTP (200, 2)
ISFP (200, 2)
ESTP (89, 2)
ESFP (48, 2)


In [234]:
df_dict['INTJ'].head()

Unnamed: 0,type,posts,post_split
167,INTJ,"'Hi, are you really manic?|||greetings.|||Yes,...","['Hi, are you really manic?, greetings., Yes, ..."
7500,INTJ,This is good...I was asking for a group analys...,[This is good...I was asking for a group analy...
790,INTJ,"'throughtheroses I cannot help you with this,...",['throughtheroses I cannot help you with this...
4514,INTJ,'I got 23|||YAYYYY!! XD Can I have a cookie in...,"['I got 23, YAYYYY!! XD Can I have a cookie in..."
6810,INTJ,'Psychology is a science. You probably feel un...,['Psychology is a science. You probably feel u...


In [235]:
for dataframe in df_dict.keys():
    df_dict[dataframe] = df_dict[dataframe].post_split.tolist()

In [270]:
def write_posts_to_file(posts, file):
    for post in posts:
        for ind_post in post:
            if(re.search('[a-zA-Z]', ind_post)):
                if(ind_post.endswith('...')or ind_post.endswith("...'")):
                    if(re.search('.*[\.?!]\s', ind_post)):
                        file.writelines(re.search('.*[\.?!]\s', ind_post).group(0).strip() + "\n")
                else:
                    file.writelines(ind_post)

In [237]:
for type_ in df_dict.keys():
    with open(f"../data/2.0_GPT-2_text_gen_posts/{type_}_posts_train.txt", "w") as f:
        write_posts_to_file(df_dict[type_][:-15], f)

    with open(f"../data/2.0_GPT-2_text_gen_posts/{type_}_posts_test.txt", "w") as f:
        write_posts_to_file(df_dict[type_][-15:], f)

In [238]:
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "train_batch_size": 64,
    "num_train_epochs": 3,
    "mlm": False,
}

results = []

In [306]:
type_ = 'ISFP'
model = LanguageModelingModel('gpt2', 'gpt2', args=train_args, use_cuda=False)

model.train_model(train_file = f"../data/2.0_GPT-2_text_gen_posts/{type_}_posts_train.txt", 
                  eval_file = f"../data/2.0_GPT-2_text_gen_posts/{type_}_posts_test.txt", 
                  output_dir = f"2.0_gen_lang_models/{type_}_lang_model/")

# pickle.dump(model, open(f'models/{type_}_lang_model', 'wb'))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3820.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1713.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 3'), FloatProgress(value=0.0, max=27.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 1 of 3'), FloatProgress(value=0.0, max=27.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 2 of 3'), FloatProgress(value=0.0, max=27.0), HTML(value='')))





(81, 4.000250972347495)

In [263]:
results.append((type_, model.eval_model(f"../data/2.0_GPT-2_text_gen_posts/{type_}_posts_test.txt")))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=260.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=142.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [264]:
results

[('ESFP', {'eval_loss': 4.146227061748505, 'perplexity': tensor(63.1951)}),
 ('ESTP', {'eval_loss': 4.114682740635342, 'perplexity': tensor(61.2328)})]

In [283]:
gen_model_ESFP = LanguageGenerationModel("gpt2", f"2.0_gen_lang_models/ESFP_lang_model/", args={"max_length": 16}, use_cuda=False)

In [284]:
gen_model_ESTP = LanguageGenerationModel("gpt2", f"2.0_gen_lang_models/ESTP_lang_model/", args={"max_length": 16}, use_cuda=False)

In [301]:
prompts = [
    "My outlook on the world is",
    "When I think back on my youth, overwhelming feeling is",
    "Other people in general make me feel"
]

In [305]:
ESFP_words= []
ESTP_words = []

for _ in range(20):
    for prompt in prompts:
        # Generate text using the model. Verbose set to False to prevent logging generated sequences.
        generated = ''
        while len(generated) <= len(prompt):
            generated = gen_model_ESFP.generate(prompt, verbose=False, args={"max_length": 16})
            generated = '.'.join(generated[0].split('.')[:-1]) + '.'
#         ESFP_words.append(generated[len(prompt):])
        print(generated)

#         generated = '' 
#         while len(generated) <= len(prompt):
#             generated = gen_model_ESTP.generate(prompt, verbose=False, args={"max_length": 16})
#             generated = '.'.join(generated[0].split('.')[:-1]) + '.'
#         ESTP_words.append(generated[len(prompt):])
#         print(generated)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is uncertain as well.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is the most difficult thing for me.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel terrible and I have to make choices because I am trying to make them.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is very good and it's the best place for me to be.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is just something that I've been doing in the past.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel more positive when people think we're good.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is not just uncertain, but more uncertain as well.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is what has made me happiest.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel a little bit nervous and I'm always trying to prove them wrong.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is more pessimistic.  I'm much more interested in the past than the future.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is what I have. I feel happy and strong.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel like the one who is too stupid for my taste to understand.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is changing all of a sudden, and I am quite excited about it.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is never good and not good at all, it's a terrible mistake to make.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel like that.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is going to change.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is always a little bit of a challenge.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel bad, or are too quick to blame people when I've gone wrong.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is in danger and we must keep our heads down.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is pretty much what I'm always going to feel.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel like they're making something for me.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is just that of me.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is something I've known all my life.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel uncomfortable.I would like to think that my behavior is okay.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is a little different than I actually would have been if I were a guy.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is something I never experienced before.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel sorry for myself.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is much better than for myself as well.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is what I want or need to feel.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel better knowing that they were actually interested in me for a while.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is going to change when I see where I've done the worst of things.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is what made me strong enough to make it.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel like a douchebag.I don't feel ashamed for being stupid.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is not bad.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is the strongest emotion I have about life.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel extremely lucky to be alive.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is the same.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is how I got into a school.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel bad about myself because I'm a victim. I'm usually just the victim.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is very bleak.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is hard to come by, no matter how nice you think that sounds.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel a certain way when I see people using an image to explain how things are.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is getting worse.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is a good thing. So I think that was the lesson I learned.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel like I am stupid or something (or worse that).


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is different than my real life. I just thought I had a different mindset.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is a huge part of life.

I grew up in the Bronx.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel good.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is pretty much the same.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is the most important thing.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel bad.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is completely different from being in school.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is one of the most valuable things that a human being can have.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel good with my body/soul.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is generally not positive as I often look at things from a foreign perspective.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is like a tsunami.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel awkward about it.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My outlook on the world is different now. The old generation of people are happy.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When I think back on my youth, overwhelming feeling is more like a loss than positive feedback.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Other people in general make me feel like I am in jail for being an asshole.


In [303]:
ESFP_words

[' more favorable, as far as I could tell.',
 ' always a good thing.',
 ' comfortable talking to them.',
 " somewhat bleak, but I'm pretty sure I won't get a job.",
 ' the only thing that keeps me stuck.',
 " like they really don't care.",
 ' one of optimism, uncertainty, hope and hope.',
 ' very powerful," says Caine.',
 ' bad for saying this, though.',
 ' pretty bleak.',
 " very good. It's not that my life changes. I never have.",
 ' that I need a lot more in order to cope.',
 ' less clear," he says.',
 " that I didn't have any right to be this way.",
 ' bad for them.',
 ' very positive. I can accept things more easily than I can admit I feel.',
 " when you are tired and you can't move on.",
 ' uncomfortable, or at the very least uncomfortable.',
 ' completely different from what I think it is.',
 ' the only thing I can feel.',
 ' horrible.',
 ' much closer to normal.',
 ' something which I can relate to.',
 ' guilty for doing that. It was just a really stupid comment.',
 ' pretty si

In [304]:
ESTP_words

[' somewhat different from those I had before.',
 ' always present to me.',
 ' good and happy I guess.',
 ' very different.',
 " a good thing, it's not just about feeling happy.",
 ' sorry for myself.',
 " one that might change drastically. Maybe my views won't change.",
 ' always the one that keeps me sane.',
 ' good about myself. No real problems.',
 ' very pessimistic.',
 ' a thing.',
 " bad for being an adult in my early 20's.",
 ' one of fearlessness and uncertainty.',
 ' what I do when I have a stressful experience or something.',
 ' bad about that.',
 ' still pretty murky at best.',
 ' always a good thing and my parents were so good.',
 " angry and sad. Well I'm trying to get out of it.",
 ' very bleak, mostly to put into words.',
 " like I don't have enough.",
 ' sorry for her and you are the only one who feels that way.',
 ' extremely different. I would say my focus is definitely on being a successful programmer.',
 ' always an indication of mental health issues.',
 ' sorry fo