In [2]:
#######################
# This notebook is an improved version of the deprecated script: GSSproject/Code/DEPRECATED--get-random-articles-from-articleClasses-pickle.py

In [3]:
import cPickle as cp
import pickle
import random
import sys
from random import sample

import pandas as pd

# project-local modules: the path tweaks must run before the imports they enable
sys.path.append('../')
import GSSUtility as GU
sys.path.append('../Code/')
from articleClass import *

In [4]:
pathToData = '../Data/'

# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
# The `with` block makes sure the file handle is closed once the data has been read.
with open(pathToData + 'articleClasses.pickle', 'rb') as articleFile:
    articleClasses = cp.load(articleFile)


In [9]:
def filterArticles(articleClasses, GSSYearsUsed=True, GSSYearsPossible=False, unusedGSSYears=False, noIVs=True, noDVs=True,
                    centralIVs=True, nextYearBound=0, yearPublished=False, linearModels=True, GSSCentralVariable=False,
                    pathToData='../Data/'):
    '''
    Return the articles from `articleClasses` that pass every enabled filter.

    The input list is not modified; a new list (in the original order) is returned.

    arguments:
     - articleClasses: list of article objects. The attributes read here are DVs, IVs,
                       GSSYearsUsed, GSSYearsPossible, centralIVs, yearPublished and articleID.
     - GSSYearsUsed=True: skip if the article has no GSS years used
     - GSSYearsPossible=False: if True, skip if there are no GSS years possible besides the ones
                       the article used
     - unusedGSSYears=False: if True, keep only those articles which have some possible GSS year
                       that is not later than the last year they actually used
     - noIVs=True: skip if no IVs specified
     - noDVs=True: skip if no DVs specified
     - centralIVs=True: skip if there is no IV(s) designated as "central"
     - nextYearBound = int: skip if the next future year of data is not within "int" of the last
                       year used
                     = 0 by default, in which case it's not used
     - yearPublished=False: if set to True, yearPublished is required to be not None (any falsy
                       value is rejected)
     - linearModels=True: if True, keep only those articles whose articleID is found in
                       ARTICLEID_AND_TRUE_IF_LINEAR_NONLINEAR.pickle
     - GSSCentralVariable=False: if True, keep only those articles where GSSCentralVariable is
                       True (per the original author: taken from the mysql table gss_question),
                       as stored in ARTICLEID_GSS_CENTRAL_VARIABLE.pickle
     - pathToData='../Data/': directory prefix (with trailing separator) holding the two pickle
                       files above; they are only read when the corresponding filter is enabled

    Articles without any used GSS years can never satisfy `unusedGSSYears` or `nextYearBound`
    (there is no "last year used" to compare against), so they are skipped by those filters
    rather than raising a ValueError from max() on an empty sequence.
    '''
    # NOTE: unpickling can execute arbitrary code -- pathToData must point at trusted files only.
    gssCentral = None
    if GSSCentralVariable:
        with open(pathToData + 'ARTICLEID_GSS_CENTRAL_VARIABLE.pickle', 'rb') as centralFile:
            gssCentral = cp.load(centralFile)

    modelUsed = None
    if linearModels:
        modelUsed = pd.read_pickle(pathToData + 'ARTICLEID_AND_TRUE_IF_LINEAR_NONLINEAR.pickle')

    keptArticles = []

    for a in articleClasses:  # a = article

        # skip article if there is no info on DVs or IVs
        if noDVs and len(a.DVs) < 1: continue
        if noIVs and len(a.IVs) < 1: continue

        # skip if the article did not use any GSS years
        if GSSYearsUsed and len(a.GSSYearsUsed) < 1: continue

        # skip if there are no further GSS years the analysis could be run on
        if GSSYearsPossible and len(a.GSSYearsPossible) < 1: continue

        if unusedGSSYears:
            # without any used years there is no reference year to compare against
            if len(a.GSSYearsUsed) < 1: continue
            lastYearUsed = max(a.GSSYearsUsed)  # computed once, not once per candidate year
            if not any(yr <= lastYearUsed for yr in a.GSSYearsPossible): continue

        # skip if no IV has been designated as central
        if centralIVs and len(a.centralIVs) < 1: continue

        if nextYearBound:
            # nextYearBound is an integer that specifies how soon the next available year of data
            # must occur. E.g. if nextYearBound = 4, the next future year of data has to fall
            # within 4 years of the last year of data actually used.
            if len(a.GSSYearsUsed) < 1: continue
            lastYearUsed = max(a.GSSYearsUsed)
            futureYearsPossible = [yr for yr in a.GSSYearsPossible if yr > lastYearUsed]
            if not futureYearsPossible or min(futureYearsPossible) > lastYearUsed + nextYearBound: continue

        if yearPublished and not a.yearPublished: continue

        if GSSCentralVariable:
            if a.articleID not in gssCentral or gssCentral[a.articleID] == False: continue

        if linearModels:
            # NOTE(review): only membership of the articleID is tested, not the value stored
            # for it -- confirm that this is the intended meaning of the pickle's contents.
            if a.articleID not in modelUsed: continue

        # the article survived all of the checks above
        keptArticles.append(a)

    return keptArticles

In [17]:
# note, nextYearBound = 40 essentially requires that there's at least one future year
# note, nextYearBound = 40 essentially requires that there's at least one future year
articlesToUse = filterArticles(articleClasses, GSSYearsUsed=True, GSSYearsPossible=True,
                               centralIVs=True, nextYearBound=40, linearModels=False)

# Pair each surviving article's ID with the possible GSS years that come after the last
# year the article actually used.
suitable_articles = []
for article in articlesToUse:
    maxYearUsed = max(article.GSSYearsUsed)
    futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
    suitable_articles.append((article.articleID, futureYearsPossible))

SAMPLE_SIZE = 50
RANDOM_SEED = 0  # fixed so that Restart & Run All draws the same articles every time

random.seed(RANDOM_SEED)
# sample() raises ValueError when asked for more items than are available, so cap the request
sample_i = sample(suitable_articles, min(SAMPLE_SIZE, len(suitable_articles)))

In [23]:
# Comma-separated article IDs (the first element of every sampled tuple)
', '.join(str(articleID) for articleID, _futureYears in sample_i)

'5407, 4599, 4665, 1937, 3758, 1307, 5625, 1340, 2824, 6724, 1796, 5015, 1899, 2974, 2308, 1525, 1570, 4441, 2902, 4274, 1964, 6734, 3606, 6603, 2001, 5498, 3027, 2554, 21, 640, 1954, 6577, 3017, 6922, 1793, 6841, 1973, 1533, 3846, 2697, 3401, 5556, 6992, 2898, 6363, 2253, 3489, 1404, 1813, 6686'