## Initializing Python

In [2]:
#!/usr/bin/env python
# -*- coding: UTF-8

In [1]:
# IMPORTING KEY PACKAGES
import csv # for reading in CSVs and turning them into dictionaries
import re # for regular expressions
import os # for navigating file trees
import nltk # for natural language processing tools
import pandas # for working with dataframes
import numpy as np # for working with numbers

In [2]:
# FOR CLEANING, TOKENIZING, AND STEMMING THE TEXT
# from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
# from nltk.stem.porter import PorterStemmer # an approximate method of stemming words (it just cuts off the ends)
# from nltk.corpus import stopwords # for one method of eliminating stop words, to clean the text
# stopenglish = list(stopwords.words("english")) # assign the string of english stopwords to a variable and turn it into a list
import string # for one method of eliminating punctuation
punctuations = list(string.punctuation) # assign the string of common punctuation symbols to a variable and turn it into a list
from scipy.stats.stats import pearsonr

In [3]:
# FOR VISUALIZATIONS
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Visualization parameters
% pylab inline 
% matplotlib inline
matplotlib.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


## Reading in preliminary data

In [4]:
# Read the newly merged data in as a pandas dataframe
df = pandas.read_csv("withPVI_new.csv", encoding = 'Latin-1')
# df = df.dropna(subset=["WEBTEXT"]) # drop any schools with no webtext that might have snuck in (none currently)

  interactivity=interactivity, compiler=compiler, result=result)


## Using PVI Scores
Here we'll use Partisan Voting Index scores instead of voting records to assess political preference
PVI scores are based on congressional district, but the data we have does not say which congressional district a school belongs to.  So, we will use zip codes to map schools to their congressional districts.

In [6]:
def strip_leading_zero(s):
    if s[0] == "0" and len(s) > 1:
        return s[1:]
    return s

# Get mapping from zip code to congressional district
zip_dict = dict()
with open("zipcode_to_cd.txt") as f:
    line = f.readline().replace("\n", "")
    while line:
        line = line.split(",")
        line = [strip_leading_zero(el) for el in line]
        zip_dict[line[1]] = line[0] + "," + line[2]
        line = f.readline().replace("\n", "")

# Get mapping from congressional district to PVI score
# Note that negative scores indicate a Democratic score and positive scores indicate a Republican score
pvi_dict14 = dict()
with open("pvi_2014.txt") as f:
    line = f.readline().replace("\n", "")
    while line:
        line = line.split()
        line = [strip_leading_zero(el) for el in line]
        pvi_dict14[line[0] + "," + line[1]] = line[2]
        line = f.readline().replace("\n", "")

pvi_dict17 = dict()
with open("pvi2017_n.txt") as f:
    line = f.readline().replace("\n", "")
    while line:
        line = line.split()
        line = [strip_leading_zero(el) for el in line]
        pvi_dict17[line[0] + "," + line[1]] = line[2]
        line = f.readline().replace("\n", "")

In [7]:
# Get Berkeley's PVI score
print(zip_dict['94709'])
print(zip_dict['94709'])
#print(pvi_dict)
print(pvi_dict14[zip_dict['94704']])
print(pvi_dict17[zip_dict['94704']])

6,13
6,13
-37
-40


In [8]:
# Map schools to their PVI scores and store in a "PVI" column
pvis, not_found, = [], 0
for index, row in df.iterrows():
    if not math.isnan(row["LZIP"]):
        v = int(row["LZIP"])
    else:
        pvis.append('NA')
        continue
    if str(v) not in zip_dict:
        pvis.append('NA') # For now defaulting to 0 when zip code not found
        not_found += 1
    else:
        if zip_dict[str(v)] not in pvi_dict14:
            pvis.append('NA')
            continue
        pvi_value = pvi_dict14[zip_dict[str(v)]]
        pvis.append(int(pvi_value))
print(str(not_found) + " zip codes not found")
df["PVI2014"] = pvis
df[["SCH_NAME", "PVI2014"]][:10]

6073 zip codes not found


Unnamed: 0,SCH_NAME,PVI2014
0,Sequoyah Sch - Chalkville Campus,
1,Eufaula Sch - Eufaula Campus,17.0
2,Camps,17.0
3,Det Ctr,-20.0
4,Wallace Sch - Mt Meigs Campus,
5,McNeel Sch - Vacca Campus,-20.0
6,Alabama Youth Services,
7,Ala Avenue Middle Sch,28.0
8,Albertville High Sch,28.0
9,Evans Elem Sch,28.0


In [9]:
pvis, not_found, = [], 0
for index, row in df.iterrows():
    if not math.isnan(row["LZIP"]):
        v = int(row["LZIP"])
    else:
        pvis.append('NA')
        continue
    if str(v) not in zip_dict:
        pvis.append('NA') # For now defaulting to 0 when zip code not found
        not_found += 1
    else:
        if zip_dict[str(v)] not in pvi_dict17:
            pvis.append('NA')
            continue
        pvi_value = pvi_dict17[zip_dict[str(v)]]
        pvis.append(int(pvi_value))
print(str(not_found) + " zip codes not found")
df["PVI2017"] = pvis
df[["SCH_NAME", "PVI2014", "PVI2017"]][:50]

6073 zip codes not found


Unnamed: 0,SCH_NAME,PVI2014,PVI2017
0,Sequoyah Sch - Chalkville Campus,,
1,Eufaula Sch - Eufaula Campus,17.0,16.0
2,Camps,17.0,16.0
3,Det Ctr,-20.0,-20.0
4,Wallace Sch - Mt Meigs Campus,,
5,McNeel Sch - Vacca Campus,-20.0,-20.0
6,Alabama Youth Services,,
7,Ala Avenue Middle Sch,28.0,30.0
8,Albertville High Sch,28.0,30.0
9,Evans Elem Sch,28.0,30.0


In [28]:
print(df)

         SURVYEAR  FIPST STABR STATENAME                          SEANAME  \
0       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
1       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
2       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
3       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
4       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
5       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
6       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
7       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
8       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
9       2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
10      2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   
11      2014-2015    1.0    AL   ALABAMA  Alabama Department Of Education   

In [11]:
df.to_csv("withPVI_final.csv", index = False)