In [165]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn import metrics


In [166]:
column_names = ['Class Name', 'Handicapped Infants','Water Project Cost Sharing',
               'Adoption of the budget resolution', 'Physician fee freeze',
                'El salvador aid', 'Religious groups in schools', 'Anti-satellite test ban',
               'Aid to nicaraguan contras', 'MX Missile', 'Immigration', 
                'Synfuels corporation cutback', 'Education spending', 
                'Superfund right to sue', 'Crime', 'Duty free exports', 
                'Export administration act south africa']

In [167]:
dataframe = pd.read_csv('../data/info.data', names = column_names, header=None);
dataframe.head()

Unnamed: 0,Class Name,Handicapped Infants,Water Project Cost Sharing,Adoption of the budget resolution,Physician fee freeze,El salvador aid,Religious groups in schools,Anti-satellite test ban,Aid to nicaraguan contras,MX Missile,Immigration,Synfuels corporation cutback,Education spending,Superfund right to sue,Crime,Duty free exports,Export administration act south africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


### Replacing missing values ? with NaN

In [168]:
dataframe = dataframe.replace('?', np.NaN)
dataframe.head()

Unnamed: 0,Class Name,Handicapped Infants,Water Project Cost Sharing,Adoption of the budget resolution,Physician fee freeze,El salvador aid,Religious groups in schools,Anti-satellite test ban,Aid to nicaraguan contras,MX Missile,Immigration,Synfuels corporation cutback,Education spending,Superfund right to sue,Crime,Duty free exports,Export administration act south africa
0,republican,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y


### Case 1: Dropping NaN values

In [169]:
dropped_nan_dataframe = dataframe.dropna(how = 'any');
dropped_nan_dataframe.head()

Unnamed: 0,Class Name,Handicapped Infants,Water Project Cost Sharing,Adoption of the budget resolution,Physician fee freeze,El salvador aid,Religious groups in schools,Anti-satellite test ban,Aid to nicaraguan contras,MX Missile,Immigration,Synfuels corporation cutback,Education spending,Superfund right to sue,Crime,Duty free exports,Export administration act south africa
5,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y
8,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
19,democrat,y,y,y,n,n,n,y,y,y,n,y,n,n,n,y,y
23,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
25,democrat,y,n,y,n,n,n,y,y,y,y,n,n,n,n,y,y


### Case 2: Treat missing values as a value 

In [170]:
for column in dataframe.columns:
    missing_values_dataframe[column] = missing_values_dataframe[column].fillna('m')
        
missing_values_dataframe.head()

Unnamed: 0,Class Name,Handicapped Infants,Water Project Cost Sharing,Adoption of the budget resolution,Physician fee freeze,El salvador aid,Religious groups in schools,Anti-satellite test ban,Aid to nicaraguan contras,MX Missile,Immigration,Synfuels corporation cutback,Education spending,Superfund right to sue,Crime,Duty free exports,Export administration act south africa
0,republican,n,y,n,y,y,y,n,n,n,y,n,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
2,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,y,y


### Case 3: Impute missing values 

In [171]:
for column in dataframe.columns:
    imputed_dataframe[column] = dataframe[column].fillna(dataframe[column].mode()[0])

imputed_dataframe.head()

Unnamed: 0,Class Name,Handicapped Infants,Water Project Cost Sharing,Adoption of the budget resolution,Physician fee freeze,El salvador aid,Religious groups in schools,Anti-satellite test ban,Aid to nicaraguan contras,MX Missile,Immigration,Synfuels corporation cutback,Education spending,Superfund right to sue,Crime,Duty free exports,Export administration act south africa
0,republican,n,y,n,y,y,y,n,n,n,y,n,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
2,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,y,y


### Label encoding

In [172]:
label_encoder = preprocessing.LabelEncoder()

def label_encoding(df):
    handicapped_infants_encoded = label_encoder.fit_transform(df['Handicapped Infants'])
    water_project_cost_sharing_encoded = label_encoder.fit_transform(df['Water Project Cost Sharing'])
    adoption_of_the_budget_resolution_encoded = label_encoder.fit_transform(df['Adoption of the budget resolution'])
    physician_fee_freeze_encoded = label_encoder.fit_transform(df['Physician fee freeze'])
    el_salvador_aid_encoded = label_encoder.fit_transform(df['El salvador aid'])
    religious_groups_in_schools_encoded = label_encoder.fit_transform(df['Religious groups in schools'])
    anti_satellite_test_ban_encoded = label_encoder.fit_transform(df['Anti-satellite test ban'])
    aid_to_nicaraguan_contras_encoded = label_encoder.fit_transform(df['Aid to nicaraguan contras'])
    mx_missile_encoded = label_encoder.fit_transform(df['MX Missile'])
    immigration_encoded = label_encoder.fit_transform(df['Immigration'])
    synfuels_corporation_cutback_encoded = label_encoder.fit_transform(df['Synfuels corporation cutback'])
    education_spending_encoded = label_encoder.fit_transform(df['Education spending'])
    superfund_right_to_sue_encoded = label_encoder.fit_transform(df['Superfund right to sue'])
    crime_encoded = label_encoder.fit_transform(df['Crime'])
    duty_free_exports_encoded = label_encoder.fit_transform(df['Duty free exports'])
    export_administration_act_south_africa_encoded = label_encoder.fit_transform(df['Export administration act south africa'])
    
    
    features = zip(handicapped_infants_encoded,water_project_cost_sharing_encoded, adoption_of_the_budget_resolution_encoded,
                  physician_fee_freeze_encoded, el_salvador_aid_encoded, religious_groups_in_schools_encoded,
                  anti_satellite_test_ban_encoded, aid_to_nicaraguan_contras_encoded, mx_missile_encoded,
                  immigration_encoded, synfuels_corporation_cutback_encoded, education_spending_encoded,
                  superfund_right_to_sue_encoded, crime_encoded, duty_free_exports_encoded, 
                  export_administration_act_south_africa_encoded)
    return list(features)
        

### Cross validation 

In [173]:
kf = model_selection.KFold(n_splits = 5, shuffle = True);


def cross_validation(df, model):
    for train_index, test_index in kf.split(df):
        X = df.drop(["Class Name"], axis=1)
        y = df["Class Name"]

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        X_train_features = label_encoding(X_train)
        y_train_labels = label_encoder.fit_transform(y_train)
        X_test_features = label_encoding(X_test)
        y_test_labels = label_encoder.fit_transform(y_test)

        
        model.fit(X_train_features, y_train_labels)
        y_pred = model.predict(X_test_features)
        print("Precision score =", metrics.precision_score(y_test_labels, y_pred, average="weighted"))
        print("Recall score =", metrics.recall_score(y_test_labels, y_pred, average="weighted"))
        print("F1 score =", metrics.precision_score(y_test_labels, y_pred, average="weighted"))

