In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
import matplotlib.pyplot as plt
import sklearn.linear_model as linear_model
from sklearn.model_selection import *
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
import seaborn as sns


In [2]:
# NOTE(review): color_palette() builds and returns a palette; as a bare
# statement that is not the last expression of the cell, its return value is
# discarded here. If the intent was to change the default plotting colours,
# sns.set_palette("hls", 8) looks like the intended call -- confirm before changing.
sns.color_palette("hls", 8)
def r2(xr, yr):
    """Print leave-one-out R^2 and MSE of a plain linear regression.

    For every leave-one-out split of ``xr`` a fresh ``LinearRegression`` is
    fitted on the training rows and used to predict the single held-out row.
    The held-out targets and their predictions are pooled across all splits
    and scored once with ``r2_score`` and ``mean_squared_error``.

    Parameters
    ----------
    xr : array-like
        Feature matrix; converted with ``np.array`` so it can be indexed by
        the integer index arrays the splitter yields.
    yr : array-like
        Regression target, converted and indexed the same way as ``xr``.

    Returns
    -------
    None
        The scores are only printed.
    """
    features = np.array(xr)
    targets = np.array(yr)
    observed, predicted = [], []

    splitter = LeaveOneOut()
    for fit_rows, held_out in splitter.split(xr):
        regressor = linear_model.LinearRegression()
        regressor.fit(X=features[fit_rows], y=targets[fit_rows])

        # Each split contributes exactly one target/prediction pair; pool them
        # so the metrics below are computed over the whole data set at once.
        observed.extend(targets[held_out])
        predicted.extend(regressor.predict(features[held_out]))

    rr = r2_score(y_true=observed, y_pred=predicted)
    ms_error = mean_squared_error(observed, predicted)

    print("R2")
    print("Leave One Out Cross Validation")
    print("R^2: {:.5f}, MSE: {:.5f}".format(rr, ms_error))
def allTesting(xCat, yCat, xpred,ypred, all_models, outname, title = "Confusion Matrix"):
    """Cross-validate a logistic classifier and, optionally, a linear regressor.

    Always:
      * prints ``yCat.value_counts()``;
      * runs leave-one-out cross-validation of a ``LogisticRegression`` on
        ``xCat``/``yCat``, shows the confusion matrix of the out-of-fold
        predictions as a heatmap, and prints the accuracy.

    When ``all_models`` is truthy, additionally:
      * runs leave-one-out cross-validation of a ``LinearRegression`` on
        ``xpred``/``ypred`` and prints the mean absolute error;
      * writes the coefficients, summed over all per-fold estimators, next to
        the feature names to ``outname`` as CSV;
      * calls ``r2(xpred, ypred)`` and prints the 4-fold R^2 scores and their mean.

    Parameters
    ----------
    xCat : DataFrame-like
        Classification features. Must expose ``.columns`` (used as a fallback
        source of feature labels).
    yCat : Series-like
        Classification target. Must expose ``.value_counts()``.
    xpred : array-like or DataFrame
        Regression features.
    ypred : array-like
        Regression target (despite the name, these are observed values, not
        predictions: they are passed as ``y`` to the cross-validation helpers).
    all_models : bool
        Whether to run the regression part as well.
    outname : str or path-like
        Destination of the coefficient CSV (only used when ``all_models``).
    title : str, optional
        Title of the confusion-matrix plot.

    Returns
    -------
    None
    """
    loo = LeaveOneOut()
    print(yCat.value_counts())

    logm = linear_model.LogisticRegression()
    # A single leave-one-out pass is sufficient: every fold holds exactly one
    # sample, so the mean of the per-fold accuracies equals the accuracy of the
    # pooled out-of-fold predictions. This avoids fitting every model twice.
    logm_Predict = cross_val_predict(logm, xCat, yCat, cv=loo, n_jobs=-1)
    accuracy = accuracy_score(yCat, logm_Predict)
    cm = confusion_matrix(yCat, logm_Predict)

    _, ax = plt.subplots()
    # annot=True to annotate cells, fmt='g' to disable scientific notation
    sns.heatmap(cm, annot=True, fmt='g', ax=ax)
    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    # NOTE(review): the tick labels are hard-coded for a two-class target;
    # presumably the classes are False/True (or 0/1) -- verify against callers.
    ax.xaxis.set_ticklabels(['False', 'True'])
    ax.yaxis.set_ticklabels(['False','True'])

    plt.show()
    print("Accuracy")
    print(accuracy)

    if all_models:
        linm = linear_model.LinearRegression()
        score = cross_validate(linm, xpred, ypred, scoring="neg_mean_absolute_error", cv=loo, n_jobs=-1, return_estimator=True)
        print("MAE")
        # The scorer reports the negated MAE, so flip the sign back.
        print(-1*np.mean(score['test_score']))

        # Sum the coefficients of all per-fold estimators. The accumulation
        # starts from nothing (not from np.arange), so no column-index offset
        # leaks into the exported values.
        # NOTE(review): this is a sum, not a mean; divide by
        # len(score['estimator']) if an average coefficient is intended.
        coef_total = np.sum([est.coef_ for est in score['estimator']], axis=0)

        # The coefficients belong to the model fitted on xpred, so label them
        # with xpred's columns when it has any; otherwise fall back to xCat's.
        feature_names = getattr(xpred, "columns", xCat.columns)
        out = pd.concat([pd.DataFrame(feature_names), pd.DataFrame(np.transpose(coef_total))], axis=1)
        out.to_csv(outname)

        print("r2 LOO")
        r2(xpred, ypred)

        print("R2 4 fold")
        linm = linear_model.LinearRegression()
        kFold = KFold(n_splits=4)
        score = cross_val_score(linm, xpred, ypred, scoring="r2", cv=kFold)
        print(score)
        print(np.mean(score))


def write_dataframe_to_csv(stringA, stringB):
    """Parse ``"<word(s)> <number>"`` lines and write them to a CSV file.

    Each non-blank line of ``stringA`` is split on whitespace: the last token
    is converted with ``float`` and becomes the ``number`` column, and the
    remaining tokens, re-joined with single spaces, become the ``word`` column.
    Empty and whitespace-only lines are skipped.

    Parameters
    ----------
    stringA : str
        Newline-separated text to parse.
    stringB : str or path-like
        Destination passed to ``DataFrame.to_csv``; written without the index.

    Raises
    ------
    ValueError
        If the last token of a non-blank line cannot be converted by ``float``.
    """
    records = []
    for line in stringA.split("\n"):
        parts = line.split()
        # Whitespace-only lines produce no tokens; skip them instead of
        # failing on parts[-1].
        if not parts:
            continue
        records.append({"word": " ".join(parts[:-1]), "number": float(parts[-1])})

    # Build the frame once from the collected records: DataFrame.append is
    # gone in pandas 2.0 and growing a frame row by row is quadratic anyway.
    # Passing columns= keeps the header even when there are no records.
    df = pd.DataFrame(records, columns=["word", "number"])

    # Write the dataframe to a CSV file
    df.to_csv(stringB, index=False)


In [5]:
# Load the semicolon-delimited red and white wine-quality tables and tag every
# row with the colour of the file it came from.
redDF = pd.read_csv('winequality-red.csv', sep=";")
whiteDF = pd.read_csv('winequality-white.csv', sep=";")
redDF['wine_color'] = 'red'
whiteDF['wine_color'] = 'white'

# Stack both tables under a fresh 0..n-1 index. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
combinedDF = pd.concat([redDF, whiteDF], ignore_index=True)

  combinedDF = redDF.append(whiteDF, ignore_index=True)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,7.4,0.29,0.28,10.2,0.032,43.0,138.0,0.99510,3.10,0.47,10.6,6,white
4996,6.3,0.19,0.29,5.5,0.042,44.0,189.0,0.99304,3.19,0.47,10.3,6,white
4997,6.1,0.33,0.32,7.8,0.052,52.0,183.0,0.99657,3.39,0.65,9.5,5,white
4998,5.6,0.32,0.33,7.4,0.037,25.0,95.0,0.99268,3.25,0.49,11.1,6,white
