In [1]:
# !pip install pandas sas7bdat pyreadstat
import os
import pickle
import signal

import pandas as pd
import pyreadstat
import sas7bdat

from toolbox import *

In [2]:
# Read the GSS SAS file; pyreadstat returns the DataFrame together with a metadata object.
data_path = r'C:\Users\justi\Dropbox\Work\Job Search\Other\GSS\GSS_sas\gss7222_r3.sas7bdat'
df, meta = pyreadstat.read_sas7bdat(
    data_path,
)

In [6]:
# Print the column names, the column labels, and the name -> label mapping (one per line).
for meta_view in (meta.column_names, meta.column_labels, meta.column_names_to_labels):
    print(meta_view)



['YEAR', 'ID', 'WRKSTAT', 'HRS1', 'HRS2', 'EVWORK', 'OCC', 'PRESTIGE', 'WRKSLF', 'WRKGOVT', 'COMMUTE', 'INDUSTRY', 'OCC80', 'PRESTG80', 'INDUS80', 'INDUS07', 'OCCONET', 'FOUND', 'OCC10', 'OCCINDV', 'OCCSTATUS', 'OCCTAG', 'PRESTG10', 'PRESTG105PLUS', 'INDUS10', 'INDSTATUS', 'INDTAG', 'MARITAL', 'MARTYPE', 'AGEWED', 'DIVORCE', 'WIDOWED', 'SPWRKSTA', 'SPHRS1', 'SPHRS2', 'SPEVWORK', 'COWRKSTA', 'COWRKSLF', 'COEVWORK', 'COHRS1', 'COHRS2', 'SPOCC', 'SPPRES', 'SPWRKSLF', 'SPIND', 'SPOCC80', 'SPPRES80', 'SPIND80', 'SPOCC10', 'SPOCCINDV', 'SPOCCSTATUS', 'SPOCCTAG', 'SPPRES10', 'SPPRES105PLUS', 'SPIND10', 'SPINDSTATUS', 'SPINDTAG', 'COOCC10', 'COIND10', 'PAOCC16', 'PAPRES16', 'PAWRKSLF', 'PAIND16', 'PAOCC80', 'PAPRES80', 'PAIND80', 'PAOCC10', 'PAOCCINDV', 'PAOCCSTATUS', 'PAOCCTAG', 'PAPRES10', 'PAPRES105PLUS', 'PAIND10', 'PAINDSTATUS', 'PAINDTAG', 'MAOCC80', 'MAPRES80', 'MAWRKSLF', 'MAIND80', 'MAOCC10', 'MAOCCINDV', 'MAOCCSTATUS', 'MAOCCTAG', 'MAPRES10', 'MAPRES105PLUS', 'MAIND10', 'MAINDSTATUS'

Step 1: Build the list of DataFrames to analyze: first the full sample, then one DataFrame per demographic subgroup (age, degree, party ID, sex, race).

In [3]:
data_path = r'C:\Users\justi\Dropbox\Work\Job Search\Other\GSS\GSS_sas\gss7222_r3.sas7bdat'
df, meta = pyreadstat.read_sas7bdat(data_path)

############# cleaning df #############
# df is the original data - each column header is a question, each row is one respondent's answers.
# The demographic columns are recoded into coarse buckets below.
#
# Missing values (NaN) must stay NaN: every comparison against NaN is False, so a
# plain chained conditional would fall through to its last branch and silently
# count respondents with no AGE as 65+ and respondents with no PARTYID as bucket 2.


def _recode_keep_missing(column, recode):
    """Return `column` with `recode` applied to every non-missing value; NaN is left as NaN."""
    return column.map(recode, na_action='ignore')


# AGE: 0 if 18-34, 1 if 35-49, 2 if 50-64, 3 if 65+
df['AGE'] = _recode_keep_missing(
    df['AGE'], lambda x: 0 if x < 35 else 1 if x < 50 else 2 if x < 65 else 3
)
# DEGREE: values of 3 or 4 are folded into 2; lower values (and NaN) are unchanged
df['DEGREE'] = df['DEGREE'].clip(upper=2)
# PARTYID: 0-1 -> 0, 2-4 and 7 -> 1, 5-6 -> 2
df['PARTYID'] = _recode_keep_missing(
    df['PARTYID'], lambda x: 0 if x < 2 else 1 if x < 5 or x == 7 else 2
)
# leaving sex and race alone, those go from 1-2 and 1-3 respectively

############# creating df_list #############
# Order matters: index 0 is the full sample, followed by the age, degree,
# party, sex and race subgroups in that order.
df_list = [df]
age_list = [df[df['AGE'] == age] for age in range(4)]
degree_list = [df[df['DEGREE'] == degree] for degree in range(3)]
partyid_list = [df[df['PARTYID'] == partyid] for partyid in range(3)]
sex_list = [df[df['SEX'] == sex] for sex in range(1, 3)]
race_list = [df[df['RACE'] == race] for race in range(1, 4)]
for subgroup_list in (age_list, degree_list, partyid_list, sex_list, race_list):
    df_list.extend(subgroup_list)

# 1 overall + 4 age + 3 degree + 3 party + 2 sex + 3 race
assert len(df_list) == 16



            

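Step 2: Build a melted table for each DataFrame in the list. First, build the melted table for the full sample using `min_answers`, and take its (YEAR, Question) pairs. Then build one melted table per demographic subgroup, restricted to those pairs, showing how that subgroup answered each question over time.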

These tables are kept in `melted_list` so that Step 3 can look them up quickly.

In [4]:
############# params for creating first melted table #############
# process_survey_data arguments: df, id_vars, exclude_columns, include_columns, year, min_answers
paradata_keywords = read_file_lines(r"C:\Users\justi\Dropbox\Work\Job Search\Other\GSS\GSS_sas\Paradata_variables.txt")
id_vars = ['YEAR']
exclude_columns = ['ID'] + paradata_keywords
include_columns = []
year = 2000
min_answers = 850

############# creating first melted table, and params to filter future ones #############
# melted table cols --- id_vars (YEAR), Question, Answer, Percentage, Num_Answers
# The full sample is taken from df_list[0] rather than the global `df`, so that
# re-running this cell always starts from the same frame.
melted_list = []
melted_list.append(process_survey_data(df_list[0], id_vars=id_vars, exclude_columns=exclude_columns, include_columns=include_columns, year=year, min_answers=min_answers))
# A list selector is required here: melted_list[0]["YEAR", "Question"] would be
# looked up as a single tuple-named column and raise KeyError.
filter_df = melted_list[0][["YEAR", "Question"]]

############# creating future melted tables #############
# A dedicated loop variable is used so the global `df` is not overwritten by
# the last subgroup.
for subgroup_df in df_list[1:]:
    # min_answers = 0 because already got YEAR, Question pairings of interest from first df
    melted_df = process_survey_data(subgroup_df, id_vars=id_vars,
        exclude_columns=exclude_columns, include_columns=include_columns,
        year=year, min_answers=0)
    filtered_df = filter_dataframe(melted_df, filter_df)
    melted_list.append(filtered_df)


Step 3: Pass any of the melted tables to `compare_years_delta`, along with two years, to get a table sorted by the size of the change (delta) between those years.

In [None]:
############# given melted_df, and years, get sorted by biggest delta #############
# final_table = compare_years_delta(melted_table, 2010, 2018)
# print(final_table.head(50))

############# vars so can find correct melted table for the website #############
# `naming` lists the display names in order, starting with the whole country;
# `place` maps each demographic column to the index of its first name in `naming`.
demographic_names = {
    "AGE": ["18-34", "35-49", "50-64", "65+"],
    "DEGREE": ["No High School", "High School", "College+"],
    "PARTYID": ["Democrat", "Independent/ Other", "Republican"],
    "SEX": ["Male", "Female"],
    "RACE": ["White", "Black", "Other"],
}
naming = ["The Country"]
place = {}
for demographic, group_names in demographic_names.items():
    place[demographic] = len(naming)
    naming.extend(group_names)

Now to complete the labels and answers:
1) get all labels (every question in the melted table)
2) get the other labels (those not yet in the saved labels)
3) complete the labels and answers for those
4) save labels and answers in pickle format

In [3]:
#### get all labels ####

# Read the survey from disk again and build the melted table that the
# label/answer lookups below are based on.
data_path = r'C:\Users\justi\Dropbox\Work\Job Search\Other\GSS\GSS_sas\gss7222_r3.sas7bdat'
df, meta = pyreadstat.read_sas7bdat(data_path)

paradata_keywords = read_file_lines(r"C:\Users\justi\Dropbox\Work\Job Search\Other\GSS\GSS_sas\Paradata_variables.txt")
id_vars = ['YEAR']
exclude_columns = ['ID'] + (paradata_keywords)
include_columns = []
year, min_answers = 2000, 850

melted_table = process_survey_data(
    df,
    id_vars=id_vars,
    exclude_columns=exclude_columns,
    include_columns=include_columns,
    year=year,
    min_answers=min_answers,
)
print(melted_table.head(50))

      YEAR  Question  Answer  Percentage  Num_Answers
0   2000.0     ABANY     1.0   39.875566         1768
1   2000.0     ABANY     2.0   60.124434         1768
2   2000.0  ABDEFECT     1.0   78.748591         1774
3   2000.0  ABDEFECT     2.0   21.251409         1774
4   2000.0    ABHLTH     1.0   88.515406         1785
5   2000.0    ABHLTH     2.0   11.484594         1785
6   2000.0  ABNOMORE     1.0   40.677966         1770
7   2000.0  ABNOMORE     2.0   59.322034         1770
10  2000.0    ABPOOR     1.0   42.244783         1773
11  2000.0    ABPOOR     2.0   57.755217         1773
12  2000.0    ABRAPE     1.0   80.560320         1749
13  2000.0    ABRAPE     2.0   19.439680         1749
14  2000.0  ABSINGLE     1.0   39.105829         1767
15  2000.0  ABSINGLE     2.0   60.894171         1767
18  2000.0    ADULTS     1.0   34.067496         2815
19  2000.0    ADULTS     2.0   53.676732         2815
20  2000.0    ADULTS     3.0    8.987567         2815
21  2000.0    ADULTS     4.0

In [19]:
#### get list_other_labels ####

# Every question name that appears in the melted table (one entry per row, so
# names repeat).
list_all_labels = melted_table['Question'].tolist()

# Load the labels and answers saved so far.
# NOTE: pickle.load executes arbitrary code from the file - only use it on
# files this project wrote itself.
with open('../own_data_objects/answers.pkl', 'rb') as file:
    current_answers = pickle.load(file)

with open('../own_data_objects/labels.pkl', 'rb') as file:
    current_labels = pickle.load(file)

# Labels present in the melted table but missing from the saved labels.
# Sorted so the order (and therefore the insertion order into the saved dicts)
# is the same on every run; a bare set difference has arbitrary order.
list_current_labels = list(current_labels.keys())
list_other_labels = sorted(set(list_all_labels) - set(list_current_labels))

In [22]:
#### complete labels, and answers ####
def parse3(variable, whole_filepath='../own_data_objects/GSS2022_whole.txt'):
    """Extract the answer codes and their descriptions for one variable.

    The file is scanned for a line containing ``[VAR: <variable>]``. After
    that, the first line containing ``ALL`` marks the start of the answer
    table. Each following line is split at its first run of digits: the text
    before the digits is the description and the digits are the code.

    Scanning stops at the first line containing ``Not applicable``, at the
    tag of a different variable (so a section without a ``Not applicable``
    line cannot pick up the next variable's answers), or at end of file.

    Args:
        variable: Variable name as it appears inside the ``[VAR: ...]`` tag.
        whole_filepath: Path to the UTF-8 text file to scan.

    Returns:
        dict mapping code (str) to description (str); empty if the variable
        or its answer table is not found.
    """
    import re

    variable_tag = f'[VAR: {variable}]'  # Tag that introduces this variable's section
    first_number = re.compile(r'(\d+)')
    answers = {}
    capture = False    # True once the variable tag has been seen
    after_all = False  # True once the "ALL" line after the tag has been seen

    with open(whole_filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()

            if after_all:
                if 'Not applicable' in line:
                    break
                # A different variable's tag means this section is over.
                if '[VAR:' in line and variable_tag not in line:
                    break
                # [text before first number, first number, rest]; lines
                # without any digit yield a single element and are skipped.
                parts = first_number.split(line, maxsplit=1)
                if len(parts) >= 2:
                    answers[parts[1].strip()] = parts[0].strip()

            # Checked after the parsing step, so the "ALL" line itself is
            # never parsed as an answer.
            if capture and "ALL" in line:
                after_all = True

            # Checked last, so "ALL" on the tag line itself does not count.
            if variable_tag in line:
                capture = True

    return answers

def parse2(label, index_filepath='../own_data_objects/GSS_index.txt'):
    """Look up the description of `label` in the index file.

    The first line that starts with ``label`` followed by a space is used;
    everything after that first space, with surrounding whitespace removed,
    is the description.

    Args:
        label: Variable name to look up.
        index_filepath: Path to the index text file.

    Returns:
        The description string (empty if the line has nothing after the
        label), or None if no line starts with the label.
    """
    prefix = label + " "
    with open(index_filepath, 'r') as file:
        for line in file:
            if line.startswith(prefix):
                # partition never raises, unlike split(' ', 1)[1] on a line
                # that has nothing after the label.
                _, _, description = line.partition(' ')
                return description.strip()
    return None


# Add a description for every label that is not in the saved labels yet.
current_labels.update({label: parse2(label) for label in list_other_labels})

################################################

# Add the parsed answer codes for the same set of labels.
current_answers.update({label: parse3(label) for label in list_other_labels})

In [23]:
# Spot-check two questions: print each one's label, then its answer mapping.
for question in ("ABANY", "TRDUNION"):
    print(current_labels[question])
    print(current_answers[question])

(Please tell me whether or not you think it should be possible for a pregnant woman to obtain a legal abortion if. . .) If the woman wants it for any reason?
{'1': 'YES', '2': 'NO'}
WORKERS NEED STRONG UNIONS
{'1': 'Strongly agree', '2': 'Agree', '3': 'Disagree', '4': 'Strongly disagree', '8': "Don't know", '9': 'No answer'}


In [30]:
# Rows whose SEX value is neither 1 nor 2 (only the SEX column is kept).
sex_is_coded = df['SEX'].isin([1, 2])
test_sex = df.loc[~sex_is_coded, 'SEX']
test_sex

64816   NaN
64817   NaN
64852   NaN
64930   NaN
64997   NaN
         ..
71080   NaN
71085   NaN
71263   NaN
71271   NaN
71329   NaN
Name: SEX, Length: 112, dtype: float64

In [24]:
######## CLOSE AND SAVE LABELS AND ANSWERS ########

labels, answers = current_labels, current_answers

# Directory the two pickle files are written to.
# NOTE(review): the files are loaded from '../own_data_objects/' earlier in the
# notebook - presumably the same directory; confirm and consider using one path.
directory_path = r"C:\Users\justi\Dropbox\Work\Job Search\Other\GSS\own_data_objects"

# Create the directory if it is missing, so the writes below cannot fail on it.
os.makedirs(directory_path, exist_ok=True)

# File paths
labels_file_path = os.path.join(directory_path, 'labels.pkl')
answers_file_path = os.path.join(directory_path, 'answers.pkl')

# Save both dictionaries, each to its own file.
for file_path, payload in ((labels_file_path, labels), (answers_file_path, answers)):
    with open(file_path, 'wb') as file:
        pickle.dump(payload, file)


In [28]:
# Display names in order: whole country, then age, degree, party, sex and race groups.
naming = (
    ["The Country"]
    + ["18-34", "35-49", "50-64", "65+"]
    + ["No High School", "High School", "College+"]
    + ["Democrat", "Independent/ Other", "Republican"]
    + ["Male", "Female"]
    + ["White", "Black", "Other"]
)
# Position of the first sex entry.
naming.index("Male")

11