In [1]:
import os
os.chdir(r"C:\Users\kaveh\OneDrive\Code Repos\Data Science\Insight\PolitImpact\politimpact")
from pathlib import Path
from configparser import ConfigParser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import math
from datetime import datetime
from collections import defaultdict
import pandas as pd
import config as cfg
from joblib import load, dump
from scipy.special import softmax
import numpy as np

Running in C:\Users\kaveh\OneDrive\Code Repos\Data Science\Insight\PolitImpact\politimpact
Reading from configuration file 'config.ini'


In [50]:
def preCalc(user_party=None, today=None):
    modelFile = 'LogRegModel.joblib'
    model = load(cfg.linRegModel)
    
    """
    Steps:
    -1. Clean Data, Engineer features.
    
    0. For dynamic input, recalculate based on user_inputs
        If date given, drop all moneys after date
        then re-engineer features
        If party given, loop over races (below) where the user's party is NOT in top two
        
    1. Create baseline race table
        Loop over races
        Feed each race to model
        Find #1 #2, and their parties
        Store in table

    2. Group races in table and loop over them
            Loop over candidates
            Create 5 new RACE dataframes, with extra amounts of money
            Plug RACE with new cand info back into model
            Tabulate results and save
            Identify needed amount by each candidate
            Create new candidate table: Cand, Seat, Party, Present_Rank, Money required to break top 2
    """    
    
    if today:
        pass
        features = pd.read_csv(cfg.flask_candidate_file, index_col=0)
        # Run feature engineering with today's date
        # mask = dfMoney['TRANSACTION_DATE'].apply(lambda x: x.year) < today
        # dfMoney = dfMoney[mask]
    else:
        # Load already engineered data
        data = pd.read_csv(cfg.flask_candidate_file, index_col=0)
        # Convert date columns from string to datetime
        data.loc[:, 'ELECTION_DATE'] = pd.to_datetime(data['ELECTION_DATE'])
    
    # Engineer features
    # step 0
    
    #Step 1
    races = createBaselineRaceTable(data, model)

def createBaselineRaceTable(data, model=None):
    """
     Loop over races
        Feed each race to model
        Find #1 #2, and their parties
        Store in table
    """

   
    race_key = ['CONTEST_NAME', 'ELECTION_DATE']
    cand_key = [*race_key, 'CANDIDATE_NAME', ]
    
    # Group candidates by race and apply general model to all
    groups = data.groupby(race_key)
    race_columns = [*race_key, 'WINNER', 'WINNER_PARTY_NAME',
                    'WINNER_PARTY_LEAN', 'RUNNER_UP', 'RUNNER_UP_PARTY_NAME', 'RUNNER_UP_PARTY_LEAN']
    races = pd.DataFrame(columns=race_columns)
    for key, group in groups:
        
        # Call model for each group
        output = raceModel(group)
        
        top = output.nlargest(2, 'PRED_VOTE_PCT')[[*cand_key, 'PARTY_NAME','PARTY_LEAN', 'PRED_VOTE_PCT']]
    
        this_race = top.iloc[0].loc[[*race_key]]
        if len(top.index) == 2 and len(group.index > 2):
                winner = top.iloc[0, :].loc[['CANDIDATE_NAME', 'PARTY_NAME', 'PARTY_LEAN']]
                runner_up = top.iloc[1, :].loc[['CANDIDATE_NAME', 'PARTY_NAME', 'PARTY_LEAN']]
                row = pd.Series([*this_race, *winner, *runner_up], index=race_columns)
                races = races.append(row, ignore_index=True)
    races.to_csv(cfg.flask_race_file)
    return races

def addMoney(candGroup, cand_name, amount):
    
    # REMEMBER TO FIX LOG SCALE IF APPLYING LOG FOR MONEY
    candGroup = candGroup.reset_index().set_index('CANDIDATE_NAME')
    prevMoney = candGroup.loc[cand_name, 'CAND_TOTAL_RAISED']
    newMoney = prevMoney + amount
    candGroup.loc[cand_name, 'CAND_TOTAL_RAISED'] = newMoney
    return candGroup
    
def raceModel(candGroup):
    """ Take in a candidate group, append a column 'PRED_VOTE_PCT' with predicted percentage of votes"""
    
    trial = True
    
    # split group, apply 
    if trial:
        # shoot out a bunch of random results
        a = np.random.random(len(candGroup))
        a /= a.sum()
        candGroup['PRED_VOTE_PCT'] = a
    return candGroup
preCalc()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


                         CONTEST_NAME ELECTION_DATE                 WINNER  \
0    State Assembly Member District 1    2018-06-05            CALEEN SISK   
1   State Assembly Member District 10    2018-06-05            MARC LEVINE   
2   State Assembly Member District 11    2018-06-05            LISA ROMERO   
3   State Assembly Member District 12    2018-06-05        ROBERT D. CHASE   
4   State Assembly Member District 13    2018-06-05      CARLOS VILLAPUDUA   
..                                ...           ...                    ...   
90           State Senate District 38    2018-06-05          JEFF GRIFFITH   
91            State Senate District 4    2018-06-05  MICHAEL "MIKE" WORLEY   
92           State Senate District 40    2018-06-05              BEN HUESO   
93            State Senate District 6    2018-06-05            JACOB MASON   
94            State Senate District 8    2018-06-05        PAULINA MIRANDA   

   WINNER_PARTY_NAME WINNER_PARTY_LEAN          RUNNER_UP  \
0 