This file is similar to mbti_ohe.ipynb, in that it does one-hot encoding for the mbti data. However: <br>
- We now have a dataframe entry for each tweet, rather than for each Twitter account. (Previously, we aggregated each account's ~50 tweets into a single bag of words.)
- The one-hot-encoding table only has words that show up in *both* the aggression data and the mbti data.

Input: 
- mbti_1_1.csv (raw mbti data)
- common_cols.json (a list of words that exist in both the mbti data and the aggression data)
  - (generated by loading the one-hot-encoding tables for the mbti data and the aggression data into memory, then computing `common_cols = [col for col in set(df1.columns).intersection(df.columns)]`)

In [None]:
import pandas as pd
import numpy as np
import feather
from sklearn.feature_extraction.text import CountVectorizer
import re
import enchant
import json

# Vocabulary shared by the mbti and aggression data; these words become the
# one-hot-encoding columns further down.
with open('common_cols.json', 'r') as filehandle:
    common_cols = json.load(filehandle)

# Load the raw mbti data. `df` was previously used without ever being
# defined in this cell; the file name is the one listed under "Input" in the
# notes at the top of this notebook. The code below requires 'type' and
# 'posts' columns.
df = pd.read_csv('mbti_1_1.csv')

# Per-account token lists with punctuation stripped. Nothing else in this
# cell reads this value; it is kept in case later cells rely on it.
posts_column = df['posts'].replace(to_replace=r'[^\w\s]', value='', regex=True).str.split()

# Each account's posts are stored as one string delimited by '|||'.
# The pattern is a raw-string regex so the pipes are escaped explicitly.
df['posts'] = df['posts'].str.split(r'\|\|\|')

# Flatten to one row per individual tweet, carrying the account's type along.
rows_list = [
    {'type': mbti_type, 'tweet': tweet}
    for mbti_type, mbti_post in zip(df['type'], df['posts'])
    for tweet in mbti_post
]
df_new = pd.DataFrame(rows_list)

def words_in_texts(words, texts, verbose=True):
    '''
    Build a 0/1 indicator matrix telling which words occur in which texts.

    Membership is tested with the `in` operator, so for string texts a word
    counts as present whenever it appears as a substring.

    Args:
        words (list-like): words to find
        texts (Series): strings to search in
        verbose (bool): when True (the default), print the progress and
            debug output this function has always produced; pass False to
            run silently.
    Returns:
        NumPy array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words. An empty `words`
        yields an (n, 0) array instead of raising.
    '''
    words = list(words)
    # Materialise once so every word scans the same sequence, even if the
    # caller passed a one-shot iterable.
    texts = list(texts)
    total_counter = len(words)

    if verbose:
        print('executing words_in_texts') #debug

    # Allocate the final (n, p) layout up front and fill it column by column.
    indicators = np.zeros((len(texts), total_counter), dtype=int)
    for column, word in enumerate(words):
        indicators[:, column] = [1 if word in text else 0 for text in texts]
        if verbose:
            print('{0} / {1}'.format(column + 1, total_counter)) #debug

    if verbose:
        print(str(total_counter) + ' by ' + str(len(texts)))
        # Shown word-major (p by n), matching the shape line printed above.
        print(indicators.T)
    return indicators

def clean_words(word):
    '''
    Keep only the candidates that look like ordinary English words.

    A candidate is kept when it is 3 to 11 characters long, is accepted by
    the enchant 'en_US' dictionary, and consists solely of ASCII letters.

    Args:
        word (iterable of str): candidate words to filter.
    Returns:
        list of str: the surviving words, in their original order. Empty
        when nothing passes the filters.
    '''
    print('cleaning')
    d = enchant.Dict('en_US')
    letters_only = re.compile(r"^[a-zA-Z]+$")

    cleaned = []
    for candidate in word:
        # Length is tested before the dictionary lookup, so enchant is
        # only consulted for candidates of acceptable size.
        if not (len(candidate) < 12 and len(candidate) > 2 and d.check(candidate)):
            continue
        # Matching with `re` directly avoids a round trip through a pandas
        # Series, whose `.str` accessor can fail when the Series is empty.
        match = letters_only.match(candidate)
        if match:
            cleaned.append(match.group())
    return cleaned

# The shared vocabulary supplies the indicator columns.
words = common_cols

# One row per tweet, one 0/1 column per word.
array_data = words_in_texts(words, df_new['tweet'])

print('Converting to dataframe')
data = pd.DataFrame(array_data, columns=words)

# Attach the label of the account each tweet came from.
# NOTE(review): if `words` itself contains 'type', this assignment replaces
# that word's indicator column with the labels - confirm against common_cols.
data['type'] = df_new['type']

print('outputting to feather')
feather.write_dataframe(data, 'ohe_data_refilter.feather')
print(data)