In [2]:
# import section

import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns




Importing the full GSS datafile over all years does not work, so we needed to import every year separately. The files of all available years are in one directory on our computers, and LoadData function below loops through every year in the directory and appends it to the total data file. Initially we worked in Spyder as we could collaborate using Github, but there we had to load the data everytime we ran the full program, which took a lot of time. We made a separate data loading program before we realized that Jupyter works with blocks. Luckily we could incorporate some of our previous code, and we now work on this notebook through Github, although the updated file has to be uploaded to the server each time. 

In [3]:
# Data loading function

def LoadData():
    year = 1972
    total_data = {}
    years = []
    for i in range(28):
        if year == 1979 or year == 1981 or year == 1984 or year == 1986 or year ==1992:
            year += 1
        years.append(str(year))
        print year,
        data = pd.io.stata.read_stata("/Users/jorrenbosga/Desktop/AUC/Masterclass/MasterclassBDProject/GSS" + str(year) + ".DTA")
        total_data[str(year)] = data
        year += 1
        if year > 1994:
            year += 1
        
    print "Data Loaded"
    return total_data

total_data = LoadData()

1972 1973 1974 1975 1976 1977 1978 1980 1982 1983 1985 1987 1988 1989 1990 1991 1993 1994 1996 1998 2000 2002 2004 2006 2008 2010 2012 2014 Data Loaded


In [4]:
def AllAnswers(category):
    for year in total_data.keys():
        if category not in total_data[year].keys():
            continue
        else:
            cat = getattr(total_data[year], category)
            cont_subcats = []
            for subcat in cat:
                if subcat not in cont_subcats:
                    cont_subcats.append(subcat)
                    
    return cont_subcats



The block below is dedicated to calculating proportions of answers within a certain category in one year. This is useful for getting a general idea of distributions within a single year. The first function, Count, counts the frequencies of the answers to a survey question (or category), and returns them in a dictionary. The second function, CategoryCompletion, appends the answers that were not chosen or included in the options for a survey question of a single year to the output of the Counted function with a very small value. The last function, Proportions, transforms the frequencies into proportions and returns them in a list.

In [27]:
# Counting function
# Returns dictionary with frequency of all answers in a chosen category

def Count(year, category):
    if category not in total_data[year].keys():
        return "Error: category nonexistent"
        
    else:
        filtered = getattr(total_data[year], category)
        counted = {}
        for answer in filtered:
            if answer not in counted:
                counted[answer] = 1
            else:
                counted[answer] = counted[answer] + 1   
        return counted

def CategoryCompletion(year, category):
    incomplete = Count(year, category)
    if type(incomplete) == str:
        return incomplete
    totalcats = AllAnswers(category)
    for category in totalcats:
        if category not in incomplete.keys():
            incomplete[category] = 1e-16
    return incomplete
    
# Proportion function
# Returns list with proportions based on counting function
def Proportions(year, category):  # input should be output of Count function
    counted = CategoryCompletion(year, category)
    if type(counted) == str:
        return counted
    numbervals = counted.values()
    totalval = 0
    proportions = []
    for item in numbervals:
        totalval += item
    for item in numbervals:
        proportions.append(float(item) / float(totalval))
        
    return proportions

a = total_data["1972"]
print Proportions("1972", "grass")



Error: category nonexistent


While the block above focuses on single categories, it is statistically speaking more useful to look for correlations between proportions within categories. The block below performs this action by calculating the entropy between the general opinion on a certain subject and the opinion of a subgroup on a certain subject (for example, what is the entropy between the opinion on premarital sex of protestants and the opinion of the whole population on that topic?). The block is largely similar to the block above, but the functions CrossCount, CrossCategoryComplete, and CrossProportions only take the answers of a subset of the population to a certain question into account (in code: total_data["1990", premarsx][religion == "protestant"]).


In [38]:
def CrossCount(year, category1, category2, subcategory):
    if category1 not in total_data[year].keys():
        return "Error: category 1 nonexistent"
    elif category2 not in total_data[year].keys():
        return "Error: category 2 & subcategory nonexistent"
    else:
        
        filtered = getattr(total_data[year], category1)[(getattr(total_data[year], category2)) == subcategory]
        counted = {}
        for answer in filtered:
            if answer not in counted:
                counted[answer] = 1
            else:
                counted[answer] = counted[answer] + 1   
        return counted

def CrossCategoryComplete(year, category1, category2, subcategory):
    incomplete = CrossCount(year, category1, category2, subcategory)
    totalcats = AllAnswers(category1)
    if type(incomplete) == str:
        return incomplete
    for category in totalcats:
        if category not in incomplete.keys():
            incomplete[category] = 1e-16
    return incomplete

def CrossProportions(year, category1, category2, subcategory):
    counted = CrossCategoryComplete(year, category1, category2, subcategory)
    if type(counted) == str:
        return counted
    numbervals = counted.values()
    totalval = 0
    proportions = []
    for item in numbervals:
        totalval += item
    for item in numbervals:
        proportions.append(float(item) / float(totalval))
        
    return proportions

# Entropy function

def Entropy(year, category1, category2, subcategory):
    param1 = CrossProportions(year, category1, category2, subcategory)
    if type(param1) == str:
        return param1
    param2 = Proportions(year, category1)
    if len(param1) != len(param2):
        params = [param1, param2]
        lengths = [len(param1), len(param2)]
        maxlength = max(lengths)    
        for param in params:
            if len(param) < maxlength:
                for i in range(maxlength - len(param)):
                    param.append(1*10**-20)
    entropy = sp.stats.entropy(param1, param2)
    return entropy


print Entropy("1972", "premarsx", "grass", "legal")


Error: category2 nonexistent


In [50]:
ListofCategories = ['wrkstat','agewed','wrkgovt','divorce','sibs','childs','degree','hompop','partyid','polview',
                    'natcrim','natdrug','natarms','spkath','spkrac','spkcom','spkhomo','cappun','gunlaw','courts','grass']
def ListofEntropies(ListofCategories, category2, subcategory):
    Entropylist = []
    for category1 in ListofCategories:
        #loop over the every year
        year = 1972
        entropies = []
        for i in range(28):
            if year == 1979 or year == 1981 or year == 1984 or year == 1986 or year ==1992:
                year += 1
            if category1 not in total_data[str(year)].keys():
                year += 1
                if year > 1994:
                    year += 1
                continue
            if category2 not in total_data[str(year)].keys():
                year+= 1
                if year > 1994:
                    year += 1
                continue
            entropies.append(Entropy(str(year), category1, category2, subcategory))
            year += 1
            if year > 1994:
                year += 1
        Entropylist.append((np.mean(entropies), category1))
    return Entropylist

a = ListofEntropies(ListofCategories, "relig", "catholic")
print type(a), a

<type 'list'> [(0.076853060417739774, 'wrkstat'), (8.0400188871683831, 'agewed'), (0.0021004511415455124, 'wrkgovt'), (0.011586987415324471, 'divorce'), (0.2519168121641337, 'sibs'), (0.023964440346372188, 'childs'), (0.026488120969954867, 'degree'), (0.10701723763992552, 'hompop'), (0.016875861550608745, 'partyid'), (nan, 'polview'), (nan, 'natcrim'), (0.0027258081568727406, 'natdrug'), (0.0033689679846973246, 'natarms'), (0.0042482015566932896, 'spkath'), (0.0036995310946022725, 'spkrac'), (0.002849372443166106, 'spkcom'), (0.006539372806051845, 'spkhomo'), (0.0034619508293949877, 'cappun'), (0.011462166300593931, 'gunlaw'), (0.003540339023189832, 'courts'), (0.0026071652267151378, 'grass')]


In [54]:

def EntropiesPerAnswer(category):
    answers = CategoryCompletion("2014", str(category))
    ListOfEntropies = []
    for answer in answers.keys():
        print answer
        entropies = ListofEntropies(ListofCategories, str(category), str(answer))
        ListofEntropies.append(entropies)
    return ListofEntropies

# NB: dit is nog niet echt mooi maar wel redelijk overzichtelijk. Ik heb 2014 als uitgangsjaar
# genomen, maar dit boeit eigenlijk niet omdat de antwoorden toch worden aangevuld door de 
# CategoryCompletion functie.
# twee kleine probleempjes nog: 
# hij is echt retetraag en hij neemt bijvoorbeeld MOSLEM 
# en moslem als twee verschillende categoriën

print EntropiesPerAnswer("relig")


nan
catholic
protestant
christian
INTER-NONDENOMINATIONAL
jewish
MOSLEM/ISLAM
OTHER EASTERN
none
inter-nondenominational
buddhism
other
moslem/islam
NATIVE AMERICAN
other eastern
hinduism
ORTHODOX-CHRISTIAN
native american
orthodox-christian
[(4.1338262290116718, 'wrkstat'), (31.712045149828676, 'agewed'), (0.51246038265618732, 'wrkgovt'), (0.17539366255985955, 'divorce'), (5.1502629454396907, 'sibs'), (0.77840693811499651, 'childs'), (1.5683494528348301, 'degree'), (2.0484620812887733, 'hompop'), (0.77740889555150017, 'partyid'), (nan, 'polview'), (nan, 'natcrim'), (0.58302641856089066, 'natdrug'), (0.23529184015629362, 'natarms'), (0.42002083130765383, 'spkath'), (0.23151024969146075, 'spkrac'), (0.27794949161679749, 'spkcom'), (0.33169077834053989, 'spkhomo'), (0.35307710868078968, 'cappun'), (0.41090407524486555, 'gunlaw'), (0.64588463356039294, 'courts'), (0.2786742704070736, 'grass')]


## 