In [6]:
#Imports
import itertools
import json
from pathlib import Path

import pandas as pd
import sklearn as skl
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
#Read in Data
# Both wine-quality files are semicolon-separated; load them with identical settings.
red_df, white_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        '../Resources/winequality-red.csv',
        '../Resources/winequality-white.csv',
    )
)

In [8]:
def _fit_and_score(wine_df, feature_names, seed=42):
    """Fit a random forest on selected columns of one wine table and score it.

    Args:
        wine_df: DataFrame holding a 'quality' column plus the feature columns.
        feature_names: list of column names to use as model inputs.
        seed: random_state used for both the train/test split and the forest,
            so repeated calls with the same inputs give the same score.

    Returns:
        The value of ``model.score`` on the held-out split, as a plain float
        (kept JSON-serialisable for the export cell).
    """
    # NOTE(review): the target is one-hot encoded, so scikit-learn presumably
    # treats this as a multi-label problem and score() reports subset accuracy
    # (a row with no predicted class counts as wrong). Confirm this is intended
    # rather than passing the raw 'quality' labels.
    target = pd.get_dummies(wine_df['quality'])
    features = wine_df[feature_names]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=seed
    )

    # Fit the scaler on the training rows only, then apply it to both splits.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # A fixed random_state makes the reported accuracy reproducible.
    model = RandomForestClassifier(n_estimators=50, random_state=seed)
    model.fit(X_train_scaled, y_train)
    return float(model.score(X_test_scaled, y_test))


def TestModel(features_to_test):
    """Score red and white wine-quality models trained on the given features.

    The same pipeline (split, standard-scale, 50-tree random forest) is run
    independently on the module-level ``red_df`` and ``white_df`` tables.

    Args:
        features_to_test: column name, or iterable of column names, present in
            both ``red_df`` and ``white_df``.

    Returns:
        dict with keys 'Red_Acc' and 'White_Acc' holding each model's test score.

    Raises:
        ValueError: if no feature names are supplied.
    """
    # Accept a bare column name as well as any iterable of names.
    if isinstance(features_to_test, str):
        feature_names = [features_to_test]
    else:
        feature_names = list(features_to_test)

    if not feature_names:
        raise ValueError('features_to_test must contain at least one column name')

    return {
        'Red_Acc': _fit_and_score(red_df, feature_names),
        'White_Acc': _fit_and_score(white_df, feature_names),
    }
    

In [9]:
# Accumulators for the experiment cells: each sweep appends one {label: results} dict.
feature_results, missing_feature_results = [], []

In [10]:
#Test Single Features
# Train on exactly one input column at a time and record the score for each.
cols = red_df.drop(columns='quality').columns
results = [
    {'feature': feature, 'result': TestModel([feature])}
    for feature in cols
]
feature_results.append({'single_features': results})

In [11]:
#Test Double Features
# Evaluate every unordered pair of distinct features exactly once.
# Pairing a feature with itself would only duplicate a column (a single-feature
# model), and (a, b) / (b, a) select the same inputs, so neither is repeated.
cols = red_df.drop(columns='quality').columns
results = []
for feature_a, feature_b in itertools.combinations(cols, 2):
    result = TestModel([feature_a, feature_b])
    results.append({'features': f'{feature_a} & {feature_b}', 'result': result})
feature_results.append({'double_features': results})

In [12]:
#Test Single Missing Features
# Leave-one-out: train on every column except the one being dropped.
cols = red_df.drop(columns='quality').columns
results = [
    {
        'feature': dropped,
        'result': TestModel([name for name in cols if name != dropped]),
    }
    for dropped in cols
]
missing_feature_results.append({'single_features': results})

In [13]:
#Test Double Missing Features
# Leave-two-out: drop each unordered pair of distinct features exactly once.
# Removing (a, b) and (b, a) leaves the same feature set, so the pair is not
# evaluated a second time.
cols = red_df.drop(columns='quality').columns
results = []
for feature_a, feature_b in itertools.combinations(cols, 2):
    feature_list = [name for name in cols if name not in (feature_a, feature_b)]
    result = TestModel(feature_list)
    results.append({'features': f'{feature_a} & {feature_b}', 'result': result})
missing_feature_results.append({'double_features': results})

In [15]:
#Write Results
# Create the output directory first so a fresh checkout does not fail on open().
output_dir = Path('Outputs')
output_dir.mkdir(parents=True, exist_ok=True)

# One JSON file per accumulator; same encoding and formatting for both.
for file_name, payload in (
    ('feature_results.json', feature_results),
    ('missing_feature_results.json', missing_feature_results),
):
    with open(output_dir / file_name, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)