# MODEL BUILDING

* The features that we've analyzed to matter include
    * net_ope_exp
    * winner
    * votes
    * can_par_aff
    * can_off
    * can_off_dis
    * can_off_sta
    * can_inc_cha_ope_sea
    * campaign_duration
    
* create two dataframes for classification and regression tasks
    * create two subframes from the original frames for granularity of prediction
        * Regression_data
            * H_model_data_reg
            * P_model_data_reg
            * S_model_data_reg
        * Classification_data
            * H_model_data_cla
            * P_model_data_cla
            * S_model_data_cla


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
#import data

data = pd.read_csv("../data_secgrp/model_data.csv")

In [3]:
data.columns

Index(['can_id', 'can_nam', 'can_off', 'can_off_sta', 'can_off_dis',
       'can_par_aff', 'can_inc_cha_ope_sea', 'can_str1', 'can_cit', 'can_sta',
       'can_zip', 'ind_ite_con', 'ind_uni_con', 'ind_con', 'par_com_con',
       'oth_com_con', 'can_con', 'tot_con', 'tra_fro_oth_aut_com', 'can_loa',
       'tot_loa', 'off_to_ope_exp', 'oth_rec', 'tot_rec', 'ope_exp',
       'can_loa_rep', 'tot_loa_rep', 'ind_ref', 'oth_com_ref', 'tot_con_ref',
       'oth_dis', 'tot_dis', 'cas_on_han_beg_of_per', 'cas_on_han_clo_of_per',
       'net_con', 'net_ope_exp', 'deb_owe_by_com', 'cov_sta_dat',
       'cov_end_dat', 'winner', 'votes', 'campaign_duration'],
      dtype='object')

In [4]:
#create regression data
Regression_data = data[['can_off', 'can_off_sta', 'can_off_dis', 'can_inc_cha_ope_sea', 'net_ope_exp', 'can_par_aff','campaign_duration','votes']]


#create classification data
Classification_data = data[['can_off', 'can_off_sta', 'can_off_dis', 'can_inc_cha_ope_sea', 'net_ope_exp', 'can_par_aff','campaign_duration','winner']]


**Check regression analysis possibility**

In [5]:
Regression_data.isna().sum()/len(Regression_data)

can_off                0.000000
can_off_sta            0.000000
can_off_dis            0.001103
can_inc_cha_ope_sea    0.001103
net_ope_exp            0.082139
can_par_aff            0.000551
campaign_duration      0.000000
votes                  0.791069
dtype: float64

**Inference**
* approximately 80% of the votes data is missing, this will make regression analysis inaccurate, therefore, we will not look into extracting data for regression analysis

# Proceed to check classification possibility

In [6]:
Classification_data.isna().sum()

can_off                  0
can_off_sta              0
can_off_dis              2
can_inc_cha_ope_sea      2
net_ope_exp            149
can_par_aff              1
campaign_duration        0
winner                   0
dtype: int64

**Handle missing data in classification data**

In [7]:
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

Classification_data['can_off_dis'] = imp_mode.fit_transform(Classification_data[['can_off_dis']]).copy()
Classification_data['can_inc_cha_ope_sea'] = imp_mode.fit_transform(Classification_data[['can_inc_cha_ope_sea']]).copy()
Classification_data['net_ope_exp'] = Classification_data['net_ope_exp'].fillna(-99999999999999999999999).copy()
Classification_data['can_par_aff'] = imp_mode.fit_transform(Classification_data[['can_par_aff']]).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

In [8]:
Classification_data.isna().sum()

can_off                0
can_off_sta            0
can_off_dis            0
can_inc_cha_ope_sea    0
net_ope_exp            0
can_par_aff            0
campaign_duration      0
winner                 0
dtype: int64

remove a single uninformative data point that affects the pipeline

In [9]:
Classification_data = Classification_data[Classification_data.can_par_aff != 'PPT']

In [10]:
#make respective dataframes
H_model_data_cla = Classification_data.loc[Classification_data['can_off'] == 'H']
P_model_data_cla = Classification_data.loc[Classification_data['can_off'] == 'P']
S_model_data_cla = Classification_data.loc[Classification_data['can_off'] == 'S']

# Build classification pipeline

### Test pipeline on house of rep data

In [11]:
from sklearn.model_selection import train_test_split

X = H_model_data_cla.iloc[:,:-1]
y = H_model_data_cla.iloc[:,-1]

In [12]:
# determine categorical and numerical features

numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns


# define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t)

In [13]:
# define base the model

model = SVC(kernel='rbf',gamma='scale',C=100)
# define the data preparation and modeling pipeline
pipeline = Pipeline(steps=[('prep',col_transform), ('m', model) ])

## Train and predict pipeline on house of rep data

In [14]:
#divide data into train and test split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=0)

In [15]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['can_off', 'can_off_sta', 'can_inc_cha_ope_sea', 'can_par_aff'], dtype='object')),
                                                 ('num', MinMaxScaler(),
                                                  Index(['can_off_dis', 'net_ope_exp', 'campaign_duration'], dtype='object'))])),
                ('m', SVC(C=100))])

In [16]:
y_pred = pipeline.predict(X_val)

# evaluating base model

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score

print(f'confusion matrix is: \n{confusion_matrix(y_true=y_val, y_pred=y_pred)}')
print(f'accuracy of the base model on house of rep election is: {accuracy_score(y_val, y_pred) * 100}%')

confusion matrix is: 
[[192   7]
 [ 20  67]]
accuracy of the base model on house of rep election is: 90.55944055944056%


# Create function to evaluate different models

In [18]:
def test_model_(models_dict, X_train, y_train, X_val, y_val):
    """
    a function that takes in a dictionary of models along with train and test data
    to calculate the f1_score and accuracy score of the built pipeline then return a dataframe as the output
    
    """
    metrics = {}
    for i in models_dict:
        model_name = str(i)
        model = models_dict[i]
        
        pipeline = Pipeline(steps=[('prep',col_transform), ('m', model) ])
        pipeline.fit(X_train, y_train)
        test_pred = pipeline.predict(X_val)
        metric_1 = accuracy_score(y_val, test_pred) * 100
        metric_2 = f1_score(y_val, test_pred, average='weighted')
        metrics[i] = metric_1, metric_2
        
    metrics_df = pd.DataFrame.from_dict(metrics, orient='index', columns=['Accuracy score', 'f1_score'])
    return metrics_df
        

In [19]:
#create a dictionary of classification models
candidate_models = {'xgboost':XGBClassifier(), 'log_reg': LogisticRegression(), 'svm':SVC(), 'random forest': RandomForestClassifier() }

#cal test_model_function
test_model_(candidate_models, X_train, y_train, X_val, y_val)

Unnamed: 0,Accuracy score,f1_score
xgboost,91.258741,0.910052
log_reg,91.258741,0.910052
svm,91.258741,0.910052
random forest,90.909091,0.906245
