## Preliminaries

In [1]:
import pandas as pd
import nltk
import re
import numpy as np
import pprint
from nltk import Tree
from collections import Counter

## Load Data

In [2]:
# Load the survey data. The text columns read from this frame further down are
# 'What Helps Women?', 'What Are Barriers To Women?' and
# 'What Helped You The Most?'.
df = pd.read_csv('clean_data/survey_data.csv')

## Function To Identify Noun Phrases

In [3]:
# Chunk grammar for noun phrases, written in NLTK tag-pattern syntax.
# In a tag pattern '*' is an ordinary regex quantifier, so '<NN*>' matches
# only the literal tag 'NN'; '<NN.*>' is the form that matches every noun tag
# (NN, NNS, NNP, NNPS).
#   Rule 1: optional adjectives followed by one or more nouns.
#   Rule 2: same shape, additionally allowing conjunctions between the nouns.
# NOTE(review): rules are applied in order and only to tokens that are not
# already chunked, so rule 1 appears to claim every noun before rule 2 runs --
# confirm whether rule 2 was meant to come first.
patterns = """
    NP: {<JJ>*<NN.*>+}
    {<JJ>*<NN.*><CC>*<NN.*>+}
    """

# Shared chunk parser used by the noun-phrase helpers below.
NPChunker = nltk.RegexpParser(patterns)

def prepare_text(input):
    """Split a text into sentences and tokenize, POS-tag and chunk each one.

    Args:
        input: Raw text to process.

    Returns:
        A list holding one ``NPChunker.parse`` result per sentence, in the
        order the sentences occur in the text.
    """
    chunked = []
    for sentence in nltk.sent_tokenize(input):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        chunked.append(NPChunker.parse(tagged))
    return chunked


def parsed_text_to_NP(sentences):
    """Collect the text of every noun-phrase chunk in the given sentences.

    Args:
        sentences: Iterable of sentences. Each item is either an already
            chunked tree (any object exposing ``subtrees()``, such as the
            values returned by ``prepare_text``) or a POS-tagged token list,
            which is chunked with ``NPChunker`` here.

    Returns:
        A list of strings, one per subtree labelled ``'NP'``, each built from
        the chunk's words joined by single spaces, in traversal order.
    """
    nps = []
    for sent in sentences:
        # prepare_text has already chunked its sentences; only run the chunker
        # on input that is not a tree yet instead of parsing everything twice.
        tree = sent if hasattr(sent, 'subtrees') else NPChunker.parse(sent)
        nps.extend(
            ' '.join(word for word, _tag in subtree.leaves())
            for subtree in tree.subtrees()
            if subtree.label() == 'NP'
        )
    return nps

def find_nps(text):
    """Return the noun phrases found in ``text`` as a list of strings.

    The text is run through ``prepare_text`` and the result is handed to
    ``parsed_text_to_NP``.
    """
    return parsed_text_to_NP(prepare_text(text))

## Function To Output A List Of Noun Phrases

In [4]:
def list_of_noun_phrases(series):
    """Build a title-cased list of the 30 most frequent noun phrases.

    Args:
        series: pandas Series of texts; missing values are dropped first.

    Returns:
        A flat list in which each of the (up to) 30 most common lower-cased
        noun phrases appears, title-cased, once per occurrence, ordered from
        the most to the least frequent phrase.
    """
    phrase_counts = Counter()
    for response in series.dropna():
        # Count case-insensitively by lower-casing every phrase first.
        phrase_counts.update(phrase.lower() for phrase in find_nps(response))

    repeated = []
    for phrase, occurrences in phrase_counts.most_common(30):
        repeated.extend([phrase.title()] * occurrences)
    return repeated

## Create List Of Noun Phrases

In [5]:
def _noun_phrase_frame(column):
    """Return a one-column frame of the noun phrases found in ``df[column]``."""
    frame = pd.DataFrame()
    frame[column] = list_of_noun_phrases(df[column])
    return frame


# One frame per survey question, each holding that question's noun phrases.
df_helped_women = _noun_phrase_frame('What Helps Women?')

df_barriers = _noun_phrase_frame('What Are Barriers To Women?')

df_helped_you = _noun_phrase_frame('What Helped You The Most?')

## Export

In [7]:
# Write each noun-phrase frame to its own CSV file under clean_data/.
for frame, path in (
    (df_helped_women, 'clean_data/helped_women_data.csv'),
    (df_barriers, 'clean_data/barriers_data.csv'),
    (df_helped_you, 'clean_data/helped_you_data.csv'),
):
    frame.to_csv(path)