In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.linear_model import Ridge as Ridge_Reg
from sklearn.linear_model import Lasso as Lasso_Reg
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


# To plot pretty figures
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

# Global matplotlib defaults shared by every figure in the notebook.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = 12, 10

# Build a two-part colormap: a custom pink ramp stacked on top of 'Purples'.
N = 256
vals = np.ones((N, 4))  # one RGBA row per colour; alpha stays at 1
# Ramp each RGB channel from pink, RGB (255, 192, 203), up to white (1, 1, 1).
vals[:, 0] = np.linspace(255/256, 1, N)
vals[:, 1] = np.linspace(192/256, 1, N)
vals[:, 2] = np.linspace(203/256, 1, N)

Pinks = ListedColormap(vals)
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` is the supported equivalent and also exists in older releases.
# Passing a Colormap instance simply returns that instance.
top = plt.get_cmap(Pinks, 256)
bottom = plt.get_cmap('Purples', 256)
# Lower half of the new map = pink ramp, upper half = 'Purples'.
newcolors = np.vstack((top(np.linspace(0, 1, 256)),
                       bottom(np.linspace(0, 1, 256))))
newcmp = ListedColormap(newcolors, name='PinkPueple')

# Seed NumPy's global RNG so the shuffle in the data split is reproducible.
np.random.seed(17)

In [None]:
# Load the dataset from CSV and preview its first five rows.
data = pd.read_csv('data_normalized.csv')
data.head(n=5)

## Correlation Matrix

In [None]:
# Pearson correlation between every pair of columns. np.corrcoef treats each
# ROW as a variable, hence the transpose of the (samples x columns) frame.
corr_matrix = np.corrcoef(data.T)

plt.figure(figsize=(10,10))
# Pin the colour scale to the full [-1, 1] correlation range so that 0 sits
# in the middle of the two-part colormap.
plt.imshow(corr_matrix, cmap = newcmp, vmax = 1, vmin = -1)
plt.colorbar()
plt.title('Correlation matrix')
plt.show()

## training set, validation set, testing set

In [None]:
def split_data(data_x: np.ndarray, data_y: np.ndarray):
    """Shuffle the samples and split them into train / validation / test sets.

    The rows of ``data_x`` and the entries of ``data_y`` are permuted with the
    same random ordering (drawn from NumPy's global RNG, so seed it beforehand
    for reproducibility) and then cut at 70% and 90% of the sample count.

    Parameters
    ----------
    data_x : np.ndarray
        Feature array; the first axis indexes samples.
    data_y : np.ndarray
        Target array with one entry per sample along its first axis.

    Returns
    -------
    tuple
        ``(train_set, valid_set, test_set)``, each an ``(x, y)`` pair holding
        the first 70%, the next 20% and the remaining samples respectively.

    Raises
    ------
    ValueError
        If ``data_x`` and ``data_y`` do not contain the same number of samples.
    """
    n_samples = data_x.shape[0]
    if data_y.shape[0] != n_samples:
        # Without this check a longer data_y would be silently sub-sampled and
        # a shorter one would fail with an opaque IndexError during indexing.
        raise ValueError(
            "data_x and data_y must have the same number of samples, "
            "got {} and {}".format(n_samples, data_y.shape[0])
        )

    # One shared permutation keeps every feature row paired with its target.
    ordering = np.arange(n_samples)
    np.random.shuffle(ordering)
    data_x = data_x[ordering]
    data_y = data_y[ordering]

    valid_start = int(n_samples * 0.7)
    test_start = int(n_samples * 0.9)

    train_set = (data_x[:valid_start], data_y[:valid_start])
    valid_set = (data_x[valid_start:test_start], data_y[valid_start:test_start])
    test_set = (data_x[test_start:], data_y[test_start:])

    return train_set, valid_set, test_set

In [None]:
# Separate the 'target' column (labels) from the remaining columns (features).
y = np.array(data['target'])
x = np.array(data.drop(columns=['target']))

train_set, valid_set, test_set = split_data(x, y)

# Print the feature-matrix shape of each subset: train, validation, test.
for subset in (train_set, valid_set, test_set):
    print(subset[0].shape)

In [None]:
# Each split is an (x, y) pair; unpack them into separately named arrays.
x_train, y_train = train_set
x_val, y_val = valid_set
x_test, y_test = test_set

## Simple Linear Regression - worth a try :)

In [None]:
# Fit an ordinary least-squares model as a baseline.
reg = Lin_Reg()
reg.fit(x_train, y_train)
coefficients = reg.coef_

print('Simple Linear Regression')

# The regressor outputs continuous values; round them to the nearest integer
# so they can be compared with the labels (assumes integer-coded classes —
# TODO confirm against the dataset).
for set_name, x_part, y_part in (('train', x_train, y_train),
                                 ('validation', x_val, y_val)):
    output = np.round(reg.predict(x_part)).astype(int)
    target = y_part
    accuracy = np.mean(output == target)
    print("\n\nAccuracy on {:} set: {:.2f} %".format(set_name, accuracy*100))

# NOTE(review): the second call below cross-validates (i.e. re-fits) on the
# test set itself; it does not score the model fitted above on held-out data.
print('\nAverage Cross Validation in training set:\t{:}'.format(np.average(cross_val_score(reg,x_train, y_train, scoring ='r2',cv = 5))))
print('Average Cross Validation in test set:\t{:}'.format(np.average(cross_val_score(reg,x_test, y_test, scoring ='r2',cv = 5))))

# Ridge Regression

In [None]:
# Grid of ridge penalties. The values are base-10 EXPONENTS: the fitting loop
# uses alpha = 10**value, so the grid spans 10**-5 ... 10**10.
lambda_min = -5
lambda_max = 10
eta = 10  # roughly the number of grid points per unit of exponent

num_lambdas = eta * (lambda_max - lambda_min)
lambdas = np.linspace(lambda_min, lambda_max, num=num_lambdas)

# Result buffers: one accuracy per grid point, one coefficient row per fit.
num_predictors = x.shape[1]
train_accuracy, val_accuracy = np.zeros(num_lambdas), np.zeros(num_lambdas)
coeff_a = np.zeros((num_lambdas, num_predictors))

In [None]:
for ind, exponent in enumerate(lambdas):
    # Fit ridge regression on the train set; the grid value is a base-10 exponent.
    reg = Ridge_Reg(alpha=10**exponent)
    reg.fit(x_train, y_train)

    coeff_a[ind, :] = reg.coef_

    # Accuracy of the rounded predictions on the train and validation sets.
    output = np.round(reg.predict(x_train)).astype(int)
    target = y_train
    train_accuracy[ind] = np.mean(output == target)

    output = np.round(reg.predict(x_val)).astype(int)
    target = y_val
    val_accuracy[ind] = np.mean(output == target)

In [None]:
# Plot train / validation accuracy against the ridge penalty exponent.
plt.figure(figsize=(18, 8))

# Format 'o-' = circle markers joined by a line. No colour letter is given in
# the format string because the colour is set through the color= keyword.
plt.plot(lambdas, train_accuracy, 'o-', label=r'accuracy training set', color="violet", alpha=0.6, linewidth=3)
plt.plot(lambdas, val_accuracy, 'o-', label=r'accuracy val set', color="darkviolet", alpha=0.6, linewidth=3)

# The x values are exponents (alpha = 10**x), so label the axis accordingly.
plt.xlabel('log10(lambda)'); plt.ylabel(r'accuracy')
plt.xlim(lambda_min, lambda_max)
plt.title(r'Evaluate ridge regression with different lambdas')
plt.legend(loc='best')
plt.grid()

# Logistic regression

In [None]:
n = 30

# Grid of base-10 exponents for the inverse regularisation strength:
# each model below is fitted with C = 10**exponent.
C_arr = np.linspace(-10, 5, num=n)
train_accuracy, val_accuracy = np.zeros(n), np.zeros(n)

for ind, C in enumerate(C_arr):
    logreg = LogisticRegression(solver='newton-cg', C=10**C)
    logreg.fit(x_train, y_train)

    # Accuracy of the predicted classes on the train and validation sets.
    output = logreg.predict(x_train)
    target = y_train
    train_accuracy[ind] = np.mean(output == target)

    output = logreg.predict(x_val)
    target = y_val
    val_accuracy[ind] = np.mean(output == target)

In [None]:
# Plot train / validation accuracy against the exponent of C.
plt.figure(figsize=(18, 8))

# Format 'o-' = circle markers joined by a line. No colour letter is given in
# the format string because the colour is set through the color= keyword.
plt.plot(C_arr, train_accuracy, 'o-', label=r'accuracy training set', color="violet", alpha=0.6, linewidth=3)
plt.plot(C_arr, val_accuracy, 'o-', label=r'accuracy val set', color="darkviolet", alpha=0.6, linewidth=3)

# The x values are exponents (C = 10**x), so label the axis accordingly.
plt.xlabel('log10(C)'); plt.ylabel(r'accuracy')
plt.title(r'Evaluate logistic regression with different C')
plt.legend(loc='best')
plt.grid()

In [None]:
# Select the exponent with the highest validation accuracy
# (np.argmax returns the first maximum when several grid points tie).
index = np.argmax(val_accuracy)
C_best= C_arr[index]

# Re-fit the model with the selected regularisation strength.
logreg = LogisticRegression(solver = 'newton-cg',C=10**C_best)
logreg.fit(x_train, y_train)
coefficients = logreg.coef_

print('Logistic Regression')

# Report accuracy on each subset; the last line is labelled "test" because it
# is computed on x_test / y_test.
for set_name, x_part, y_part in (('train', x_train, y_train),
                                 ('validation', x_val, y_val),
                                 ('test', x_test, y_test)):
    output = logreg.predict(x_part)
    target = y_part
    accuracy = np.mean(output == target)
    print("\nAccuracy on {:} set: {:.2f} %".format(set_name, accuracy*100))

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# One panel per class: row k of `coefficients` holds the weights for class k.
for class_id, ax in enumerate(axs):
    ax.plot(coefficients[class_id], c='indigo')
    ax.set_title("coefficients for class {:d} stars".format(class_id))

plt.show()


# Check which features have the highest and lowest coefficients in logistic regression

In [None]:
# Feature names in the same order as the columns of `x`: drop 'target' by name
# rather than assuming it is the last column of the frame.
features = list(data.drop(['target'], axis=1).columns)

def get_most_correlated(coeffs, features, treshold):
    """Print every feature whose coefficient magnitude exceeds ``treshold``.

    Despite the name, the function compares model coefficients, not
    correlations. Lines are printed in ascending order of coefficient value
    (ties broken by feature name). Empty input prints nothing.

    Parameters
    ----------
    coeffs : iterable of float
        One coefficient per feature.
    features : iterable of str
        Feature names, in the same order as ``coeffs``.
    treshold : float
        Only pairs with ``abs(coef) > treshold`` are printed.
    """
    # Iterate over the sorted pairs directly; un-zipping them first would
    # raise ValueError when the input is empty.
    for coef, feat in sorted(zip(coeffs, features)):
        if abs(coef) > treshold:
            print("coef = {:.4f} \tfor\t {:}".format(coef, feat))
            
            
def get_least_correlated(coeffs, features, treshold):
    """Print every feature whose coefficient magnitude is below ``treshold``.

    Despite the name, the function compares model coefficients, not
    correlations. Lines are printed in ascending order of coefficient value
    (ties broken by feature name). Empty input prints nothing.

    Parameters
    ----------
    coeffs : iterable of float
        One coefficient per feature.
    features : iterable of str
        Feature names, in the same order as ``coeffs``.
    treshold : float
        Only pairs with ``abs(coef) < treshold`` are printed.
    """
    # Iterate over the sorted pairs directly; un-zipping them first would
    # raise ValueError when the input is empty.
    for coef, feat in sorted(zip(coeffs, features)):
        if abs(coef) < treshold:
            print("coef = {:.4f} \tfor\t {:}".format(coef, feat))

## most important

In [None]:
# Class 0: features whose coefficient magnitude exceeds 2.
print("Most correlated for 0 star")
get_most_correlated(coefficients[0], features, treshold=2)

In [None]:
# Class 1: features whose coefficient magnitude exceeds 2.
print("Most correlated for 1 star")
get_most_correlated(coefficients[1], features, treshold=2)

In [None]:
# Class 2: features whose coefficient magnitude exceeds 2.
print("Most correlated for 2 star")
get_most_correlated(coefficients[2], features, treshold=2)

### less important

In [None]:
# Class 0: features whose coefficient magnitude is below 0.2.
print("Least correlated for 0 star")
get_least_correlated(coefficients[0], features, treshold=0.2)

In [None]:
# Class 1: features whose coefficient magnitude is below 0.2.
# (Heading and coefficient row both refer to class 1.)
print("Least correlated for 1 star")
get_least_correlated(coefficients[1], features, 0.2)

In [None]:
# Class 2: features whose coefficient magnitude is below 0.2.
# (The heading names class 2 to match the coefficient row that is printed.)
print("Least correlated for 2 star")
get_least_correlated(coefficients[2], features, 0.2)

## Comment:

Linear regression and regularized linear regression are not suitable for this particular task.
From the analysis of the coefficients in logistic regression, the following features seem to be unimportant:
* cuisine_x_Bar
* cuisine_x_Italian
* cuisine_y_Pizzeria

However, the status "widow" is highly correlated with the "0" and "1" star classes.