## Imoprt modules

In [1]:
import re

# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd

# We are also going to do some basic viz
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

## Initialize environment

In [2]:
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

## Functions

In [3]:
def pg_fetch_all(conn, script):
    cursor = conn.cursor()
    try:
        cursor.execute(script)
        conn.commit()
        contents = cursor.fetchall()
    except (Exception, pg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        contents = 1
    cursor.close()

    return contents

## Database connection

In [4]:
# Postgres info to connect
connection_args = {
    'host': 'localhost',  # We are connecting to our _local_ version of psql
    'dbname': 'myersbriggs',    # DB that we are connecting to
    'port': 5432          # port we opened on AWS
}

connection = pg.connect(**connection_args)  # What is that "**" there??

In [5]:
postgreSQL_select_Query = "SELECT * FROM raw_data;"

myers_briggs = pg_fetch_all(connection, postgreSQL_select_Query)

In [6]:
get_posts_by_type = "SELECT posts FROM raw_data WHERE type = 'ENTJ';"

posts_by_type = pg_fetch_all(connection, get_posts_by_type)

In [7]:
mb_df = pd.DataFrame(myers_briggs)

post_list = [re.split('\|\|\|+', post) for post in mb_df[1]]
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df[0])

In [8]:
connection.close()

### For a baseline - create initial word vector.

In [12]:
vectorizer = CountVectorizer()

In [14]:
X = vectorizer.fit_transform(mb_df[1])

In [15]:
print(vectorizer.get_feature_names())

oos', 'yvmcm3ik2dc', 'yvo4kvvpmqs', 'yvonne', 'yvqmdqxisqm', 'yvvtzgwywvo', 'yvxstzhzkxk', 'yvy', 'yvyunpl1l', 'yw', 'yw1eiqd', 'yw3chifny0g', 'yw4fpp8sazm', 'yw4maxwncke', 'yw4sivh', 'ywabgqbnzko', 'ywc4x5x', 'ywclw_xg7x4', 'ywdd1yeehlg', 'ywdhycstsui', 'ywk6dqodri0', 'ywkemo18ndq', 'ywp', 'ywszvhqf9qm', 'ywt', 'ywu27t2dhcw', 'ywx', 'yx0hyjo9auo', 'yx1l7a5wfxy', 'yx5xpo0lzvq', 'yx_kez_xwjs', 'yxbwawq9ew5ld3m7ct04ntt3ptyzma', 'yxdbmh', 'yxe4xbxvemg', 'yxeueuqip3q', 'yxev1gl0adk', 'yxfdawjdj2c', 'yxhf5zc', 'yxj0df7laye', 'yxnozsrayu0', 'yxqasa0', 'yxrwosw', 'yxs4lqppz6y', 'yxsr8fyfumg', 'yxubxatre7m', 'yxwblwjhwqo', 'yxwq0sdgmiw', 'yxxbhkjnrr8', 'yxxxn', 'yxypy2nia3g', 'yxz', 'yy', 'yy104', 'yy162', 'yy216', 'yy247', 'yy4cn9dvpii', 'yy67', 'yy96htb8wgi', 'yyaasss', 'yyagchm8rww', 'yyarnbza', 'yycyaclrpns', 'yydmetmjpdu', 'yyelz5q0z9w', 'yyeo', 'yyeyee', 'yygvfvvzlbi', 'yyhhhheeyyy', 'yyknbtm_yym', 'yyod5camrv8', 'yyoooouuu', 'yyou', 'yypek', 'yyr2geouemm', 'yyrlr_gv67q', 'yyuyfo2rd3q', 

In [25]:
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

In [28]:
ENFJ = mb_df[mb_df['type'] == 'ENFJ']
ENFP = mb_df[mb_df['type'] == 'ENFP']
ENTJ = mb_df[mb_df['type'] == 'ENTJ']
ENTP = mb_df[mb_df['type'] == 'ENTP']
ESFJ = mb_df[mb_df['type'] == 'ESFJ']
ESFP = mb_df[mb_df['type'] == 'ESFP']
ESTJ = mb_df[mb_df['type'] == 'ESTJ']
ESTP = mb_df[mb_df['type'] == 'ESTP']
INFJ = mb_df[mb_df['type'] == 'INFJ']
INFP = mb_df[mb_df['type'] == 'INFP']
INTJ = mb_df[mb_df['type'] == 'INTJ']
INTP = mb_df[mb_df['type'] == 'INTP']
ISFJ = mb_df[mb_df['type'] == 'ISFJ']
ISFP = mb_df[mb_df['type'] == 'ISFP']
ISTJ = mb_df[mb_df['type'] == 'ISTJ']
ISTP = mb_df[mb_df['type'] == 'ISTP']

In [27]:
types

['ENFJ',
 'ENFP',
 'ENTJ',
 'ENTP',
 'ESFJ',
 'ESFP',
 'ESTJ',
 'ESTP',
 'INFJ',
 'INFP',
 'INTJ',
 'INTP',
 'ISFJ',
 'ISFP',
 'ISTJ',
 'ISTP']