### CS 5644: MACHINE LEARNING WITH BIG DATA ASSIGNMENT 2

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn import metrics

In [2]:
column_names = ['Class Name', 'Handicapped Infants','Water Project Cost Sharing',
               'Adoption of the budget resolution', 'Physician fee freeze',
                'El salvador aid', 'Religious groups in schools', 'Anti-satellite test ban',
               'Aid to nicaraguan contras', 'MX Missile', 'Immigration', 
                'Synfuels corporation cutback', 'Education spending', 
                'Superfund right to sue', 'Crime', 'Duty free exports', 
                'Export administration act south africa']
dataframe = pd.read_csv('../data/info.data', names = column_names, header=None);

### Replacing missing values ? with NaN

In [3]:
def replace_nan_dataframe(df):
    return df.replace('?', np.NaN)

dataframe = replace_nan_dataframe(dataframe)

### Dropping NaN values

In [4]:
def dropping_nan_values(df):
    return df.dropna(how = 'any')

### Treat missing values as a value 

In [5]:
def treat_missing_values(df):
    treated_missing_values = df;
    for column in treated_missing_values.columns:  
        treated_missing_values[column].fillna('m', inplace = True)
    return treated_missing_values


### Impute missing values 

In [6]:
def impute_missing_values(df):
    imputed_dataframe = df;
    for column in imputed_dataframe.columns:  
        imputed_dataframe[column].fillna(imputed_dataframe[column].mode()[0], inplace = True)
    return imputed_dataframe


### Label encoding 

In [7]:
label_encoder = preprocessing.LabelEncoder()

def label_encoding(df):
    handicapped_infants_encoded = label_encoder.fit_transform(df['Handicapped Infants'])
    water_project_cost_sharing_encoded = label_encoder.fit_transform(df['Water Project Cost Sharing'])
    adoption_of_the_budget_resolution_encoded = label_encoder.fit_transform(df['Adoption of the budget resolution'])
    physician_fee_freeze_encoded = label_encoder.fit_transform(df['Physician fee freeze'])
    el_salvador_aid_encoded = label_encoder.fit_transform(df['El salvador aid'])
    religious_groups_in_schools_encoded = label_encoder.fit_transform(df['Religious groups in schools'])
    anti_satellite_test_ban_encoded = label_encoder.fit_transform(df['Anti-satellite test ban'])
    aid_to_nicaraguan_contras_encoded = label_encoder.fit_transform(df['Aid to nicaraguan contras'])
    mx_missile_encoded = label_encoder.fit_transform(df['MX Missile'])
    immigration_encoded = label_encoder.fit_transform(df['Immigration'])
    synfuels_corporation_cutback_encoded = label_encoder.fit_transform(df['Synfuels corporation cutback'])
    education_spending_encoded = label_encoder.fit_transform(df['Education spending'])
    superfund_right_to_sue_encoded = label_encoder.fit_transform(df['Superfund right to sue'])
    crime_encoded = label_encoder.fit_transform(df['Crime'])
    duty_free_exports_encoded = label_encoder.fit_transform(df['Duty free exports'])
    export_administration_act_south_africa_encoded = label_encoder.fit_transform(df['Export administration act south africa'])

    features = zip(handicapped_infants_encoded,water_project_cost_sharing_encoded, adoption_of_the_budget_resolution_encoded,
                  physician_fee_freeze_encoded, el_salvador_aid_encoded, religious_groups_in_schools_encoded,
                  anti_satellite_test_ban_encoded, aid_to_nicaraguan_contras_encoded, mx_missile_encoded,
                  immigration_encoded, synfuels_corporation_cutback_encoded, education_spending_encoded,
                  superfund_right_to_sue_encoded, crime_encoded, duty_free_exports_encoded, 
                  export_administration_act_south_africa_encoded)
    return list(features)
        

## Cross validation

In [8]:
cv = KFold(n_splits=5, random_state=None, shuffle=False) 

def cross_validation(model, df):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    
    x = df.drop('Class Name', axis = 1)
    y = df['Class Name']
    
    for train_index, test_index in cv.split(x,y):
        X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.25)
        x_train_features = label_encoding(X_train)
        y_train_labels = label_encoder.fit_transform(y_train)
        x_test_features = label_encoding(X_test)
        y_test_labels = label_encoder.fit_transform(y_test)

        model.fit(x_train_features, y_train_labels)
        y_pred = model.predict(label_encoding(X_test))
        
        precision_scores.append(metrics.precision_score(label_encoder.fit_transform(y_test_labels), y_pred))
        recall_scores.append(metrics.recall_score(label_encoder.fit_transform(y_test_labels), y_pred))
        f1_scores.append(metrics.f1_score(label_encoder.fit_transform(y_test_labels), y_pred))

        
    print("Precision Score: {}".format(np.mean(precision_scores) * 100))
    print("Recall Score: {}".format(np.mean(recall_scores) * 100))
    print("F-1 Score: {}".format(np.mean(f1_scores) * 100))

## Naive bayes classifier

In [9]:
naive_bayes_model = GaussianNB()

#### Case I - Discard instances that have missing feature values

In [10]:
removed_nan_values_nb = dropping_nan_values(dataframe)

cross_validation(naive_bayes_model, removed_nan_values_nb)


Precision Score: 97.72079772079772
Recall Score: 92.61391941391942
F-1 Score: 95.03594186582477


#### Case II - Treat “missing” as if it is a value

In [11]:
treated_nan_values_nb = treat_missing_values(dataframe)

cross_validation(naive_bayes_model, treated_nan_values_nb)


Precision Score: 83.516317016317
Recall Score: 94.85195585195585
F-1 Score: 88.76697912305109


#### Case III - impute missing values 

In [12]:
imputed_nan_values_nb = impute_missing_values(dataframe)

cross_validation(naive_bayes_model, imputed_nan_values_nb)


Precision Score: 85.38074353381158
Recall Score: 89.76056620472328
F-1 Score: 87.42958654475561


## Decision tree classifier

In [13]:
decision_tree_model = DecisionTreeClassifier(criterion='entropy',random_state=0) 

#### Case I - Discard instances that have missing feature values

In [14]:
removed_nan_values_dt = dropping_nan_values(dataframe)

cross_validation(decision_tree_model, removed_nan_values_dt)


Precision Score: 95.38843196853209
Recall Score: 91.69975552902383
F-1 Score: 93.41976099278297


#### Case II - Treat “missing” as if it is a value

In [15]:
treated_nan_values_dt = treat_missing_values(dataframe)

cross_validation(decision_tree_model, treated_nan_values_dt)


Precision Score: 88.96283045120255
Recall Score: 92.90513145391192
F-1 Score: 90.82925713494589


#### Case III - impute missing values 

In [16]:
imputed_nan_values_dt = impute_missing_values(dataframe)

cross_validation(decision_tree_model, imputed_nan_values_dt)


Precision Score: 90.90696071233779
Recall Score: 95.07218271924154
F-1 Score: 92.87095626937307
