# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go

pyo.init_notebook_mode()


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor

import tensorflow as tf

# Definitions

In [None]:
# Ordinal encoding of weekday names (Monday -> 1 ... Sunday -> 7).
week_day_names = dict(zip(
    ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
    range(1, 8),
))

# Weather measurements that get a source_/destination_ prefix when renamed.
_WEATHER_MEASUREMENTS = ('temp', 'clouds', 'pressure', 'rain', 'humidity', 'wind')

# Rename maps applied to the weather frame before it is merged onto the
# rides: 'location' becomes the join key, measurements get a prefix.
source_columns = {
    'location': 'source',
    **{field: 'source_' + field for field in _WEATHER_MEASUREMENTS},
}
destination_columns = {
    'location': 'destination',
    **{field: 'destination_' + field for field in _WEATHER_MEASUREMENTS},
}

# Integer encoding of the cab provider.
cab_type_dict = {'Lyft': 0, 'Uber': 1}

# Column names of a ride row followed by one source and one destination
# weather row, in the order in which the values are concatenated.
_RIDE_COLUMNS = (
    'distance', 'cab_type', 'time_stamp',
    'destination', 'source', 'price',
    'surge_multiplier', 'product_id', 'name',
)
_WEATHER_ROW_COLUMNS = (
    'temp', 'location', 'clouds', 'pressure',
    'rain', 'time_stamp', 'humidity', 'wind',
)
names_list = (
    list(_RIDE_COLUMNS)
    + ['source_' + column for column in _WEATHER_ROW_COLUMNS]
    + ['destination_' + column for column in _WEATHER_ROW_COLUMNS]
)

# Large sentinel used as the initial "best score" in the search helpers.
inf = 10_000_000_000

# Functions

In [None]:
def draw_heatmap(train, name = ""):
    """Plot an annotated Pearson-correlation heatmap of all columns of `train`.

    The frame is cast to float before correlating, so every column must be
    convertible to a number. `name` is appended to the figure title.
    """
    plt.figure(figsize=(24, 24))
    plt.title("Pearson Correlation of Features " + name, y=1.05, size=15)
    heatmap_options = dict(
        linewidths=0.01,
        vmax=1.0,
        square=True,
        cmap=plt.cm.viridis,
        linecolor='white',
        annot=True,
    )
    sns.heatmap(train.astype(float).corr(), **heatmap_options)

In [None]:
def convert(base):
    """Return the integer part of each code in `base`, skipping its first character."""
    return [int(code[1:]) for code in base]

def analyze_dataframe_basic(filename, sourcepath = "..\\data\\data_refactored\\", n=2):
    """Read a CSV file, encode its categorical columns and draw a correlation heatmap.

    The 'Date/Time' and 'Year' columns are dropped, 'Day_Name' is mapped to
    integers through `week_day_names` and 'Base' is converted with `convert`.
    The first rows of the prepared frame are printed. `n` is not used.
    """
    train = pd.read_csv(sourcepath + filename)
    train = train.drop('Date/Time', axis=1).drop('Year', axis=1)
    train = train.replace({"Day_Name": week_day_names})
    train['Base'] = convert(train['Base'])
    print()
    print(train.head())
    draw_heatmap(train, filename)
    
def analyze_dataframe_radial(filename, sourcepath = "..\\data\\data_refactored\\", n=2):
    """Read a CSV file and draw a correlation heatmap using polar coordinates.

    'Lat'/'Lon' are replaced by their polar form around the mean position:
    'Radius' is the Euclidean distance from the mean point, 'Sin' and 'Cos'
    are the longitude / latitude offsets divided by that radius.
    'Date/Time' and 'Year' are dropped, 'Day_Name' is mapped to integers via
    `week_day_names`, and 'Base' is converted with `convert` and shifted so
    its minimum is 0. The first rows of the result are printed.

    `n` is not used; it is kept for call compatibility.
    """
    filepath = sourcepath + filename
    train = pd.read_csv(filepath)
    train = train.drop('Date/Time', axis=1).drop('Year', axis=1)

    # Offsets from the centre of mass. The radius is measured from this
    # centre, so the sine/cosine must use the same offsets (not the raw
    # coordinates) to stay within [-1, 1].
    lat_offset = train['Lat'] - np.mean(train['Lat'])
    lon_offset = train['Lon'] - np.mean(train['Lon'])
    train['Radius'] = np.sqrt(lat_offset ** 2 + lon_offset ** 2)
    # A point exactly at the centre has radius 0 and yields NaN here.
    train['Sin'] = lon_offset / train['Radius']
    train['Cos'] = lat_offset / train['Radius']

    train = train.drop('Lon', axis=1).drop('Lat', axis=1)
    train = train.replace({"Day_Name": week_day_names})
    train['Base'] = convert(train['Base'])
    train['Base'] -= min(train['Base'])
    print()
    print(train.head())
    draw_heatmap(train, filename)

# uber-raw-data-14.csv

In [None]:
# Print the head and draw the correlation heatmap for uber-raw-data-14.csv.
analyze_dataframe_basic("uber-raw-data-14.csv")

In [None]:
# Same file, with Lat/Lon replaced by the Radius/Sin/Cos columns.
analyze_dataframe_radial("uber-raw-data-14.csv")

# Cab Rides Prices

In [None]:
def merge_dataframes(path="..\\data\\cab_rides\\"):
    """Join cab_rides.csv with per-location mean weather from weather.xls.

    Rides with missing values are dropped and the 'id' column is removed.
    Weather gaps are filled with 0, the time stamp is discarded and the
    remaining columns are averaged per location. The averages are merged
    onto the rides twice: once for the source and once for the destination.
    Both files are parsed with `pd.read_csv`.
    """
    rides_file, weather_file = "cab_rides.csv", "weather.xls"

    rides = pd.read_csv(path + rides_file)
    rides = rides.dropna(axis=0).reset_index(drop=True).drop('id', axis=1)

    weather = pd.read_csv(path + weather_file)
    weather = weather.fillna(0).drop('time_stamp', axis=1)
    mean_weather = weather.groupby('location').mean().reset_index(drop=False)

    at_source = mean_weather.rename(columns=source_columns)
    at_destination = mean_weather.rename(columns=destination_columns)
    return rides.merge(at_source, on='source').merge(at_destination, on='destination')

def onehot_encode(data, column, prefix):
    """Replace `column` with one indicator column per distinct value.

    The indicator columns are named '<prefix>_<value>' and appended after
    the remaining columns. A new frame is returned; `data` is not modified.
    """
    indicators = pd.get_dummies(data[column], prefix=prefix)
    remaining = data.drop(column, axis=1)
    return pd.concat([remaining, indicators], axis=1)

def refactor_data(data):
    """Encode the categorical ride columns and return the result as a new frame.

    'cab_type' is mapped to integers through `cab_type_dict`; 'destination',
    'source', 'product_id' and 'name' are one-hot encoded (in that order, so
    their indicator columns are appended in that order).

    The input frame is left untouched: the encoded 'cab_type' column is
    written to a copy produced by `DataFrame.assign` rather than assigned
    in place on the caller's object.
    """
    data = data.assign(cab_type=data['cab_type'].replace(cab_type_dict))
    for column in ('destination', 'source', 'product_id', 'name'):
        data = onehot_encode(data, column=column, prefix=column)
    return data

def analyze_dataframe_basic(sourcepath="..\\data\\cab_rides\\"):
    """Draw the correlation heatmap of the merged and encoded cab rides data.

    NOTE: this definition has the same name as the earlier CSV-based
    `analyze_dataframe_basic(filename, ...)` and replaces it once executed.
    """
    encoded = refactor_data(merge_dataframes(sourcepath))
    draw_heatmap(encoded, "cab_rides")

In [None]:
# Correlation heatmap of the cab rides data merged with mean weather.
analyze_dataframe_basic()

# Models - predicting price of cab ride

In [None]:
import numpy as np 
from tensorflow import keras
from matplotlib import pyplot as plt
from IPython.display import clear_output

class LearningCurvesPlotter(keras.callbacks.Callback):
    """Callback that redraws the learning curves after every epoch.

    One subplot is drawn per training metric; when the matching validation
    metric ('val_<metric>') is present in the logs it is drawn on the same
    axes. The metric history is reset at the start of every training run.
    """

    def on_train_begin(self, logs=None):
        """Reset the history, seeding it with any metric names already in `logs`."""
        self.metrics = {metric: [] for metric in (logs or {})}

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's metric values and redraw all curves."""
        logs = logs or {}
        for metric, value in logs.items():
            self.metrics.setdefault(metric, []).append(value)

        train_metrics = [name for name in logs if not name.startswith('val_')]
        if not train_metrics:
            return

        clear_output(wait=True)
        # squeeze=False keeps `axs` two-dimensional even for a single metric.
        _, axs = plt.subplots(1, len(train_metrics), figsize=(15, 5), squeeze=False)

        for ax, metric in zip(axs[0], train_metrics):
            history = self.metrics[metric]
            ax.plot(range(1, len(history) + 1), history, label=metric)
            val_metric = 'val_' + metric
            # Test for presence, not truthiness: a validation value of 0.0
            # is still worth plotting, and training without validation data
            # must not raise KeyError.
            if val_metric in logs:
                val_history = self.metrics[val_metric]
                ax.plot(range(1, len(val_history) + 1), val_history, label=val_metric)
            ax.legend()
            ax.grid()

        plt.tight_layout()
        plt.show()

In [None]:
# Callbacks passed to model.fit when the per-epoch analysis is requested.
callbacks_list = [LearningCurvesPlotter()]

In [None]:
def predition_score(prediction, actual_price):
    """Return the root-mean-square error between predictions and actual prices.

    `actual_price` must expose `.shape`; its first dimension is the number
    of samples the squared errors are averaged over.
    """
    sample_count = actual_price.shape[0]
    residuals = np.array(prediction) - np.array(actual_price)
    mean_squared_error = np.sum(residuals ** 2) / sample_count
    return np.sqrt(mean_squared_error)

In [None]:
def split_dataframe(data):
    """Split `data` into standardized train/test features and price labels.

    80% of the rows (shuffled) go to the training subset. The scaler is
    fitted on the training features only and applied to both subsets.
    Returns (X_train, X_test, y_train, y_test) as arrays.
    """
    prices = data['price']
    features = data.drop('price', axis=1)
    train_X, test_X, train_y, test_y = train_test_split(
        features, prices, train_size=0.8, shuffle=True
    )
    scaler = StandardScaler()
    scaler.fit(train_X)
    return (
        scaler.transform(train_X),
        scaler.transform(test_X),
        train_y.to_numpy(),
        test_y.to_numpy(),
    )

def down_scale_vector(X_train, X_test, n):
    """Project both subsets onto `n` principal components fitted on X_train."""
    reducer = PCA(n_components=n)
    reducer.fit(X_train)
    return reducer.transform(X_train), reducer.transform(X_test)

def prepare_data(sourcepath="..\\data\\cab_rides\\", pca_com=False):
    """Load, encode, split and scale the mean-weather dataset.

    When `pca_com` is truthy it is used as the number of PCA components the
    features are reduced to. Returns (X_train, X_test, y_train, y_test).
    """
    encoded = refactor_data(merge_dataframes(sourcepath))
    X_train, X_test, y_train, y_test = split_dataframe(encoded)
    if pca_com:
        X_train, X_test = down_scale_vector(X_train, X_test, pca_com)
    return X_train, X_test, y_train, y_test

def score_model(model, X, y, n=2, m=50):
    """Return the root-mean-square prediction error of `model` on (X, y).

    The model is always evaluated on `X` exactly as given, i.e. in the
    feature space it was trained on. Previously a value of `n` other than 2
    skipped the prediction and failed on an unbound `pred`.

    `n` and `m` are not used; they are kept for call compatibility.
    """
    # Flatten (samples, 1) network outputs to a plain vector.
    pred = np.asarray(model.predict(X)).reshape(-1)
    return predition_score(pred, y)

def _windowed_error_stats(error, actual, minimal_price, n_buckets):
    """Return per-dollar mean and std of `error`, pooled over a +/-1 $ window."""
    buckets = actual.astype(int) - minimal_price
    error_sum, counts = np.zeros(n_buckets), np.zeros(n_buckets)
    windows = []
    for shift in (-1, 0, 1):
        target = buckets + shift
        inside = (target >= 0) & (target < n_buckets)
        windows.append((target[inside], error[inside]))
        # np.add.at accumulates correctly when an index occurs repeatedly.
        np.add.at(error_sum, target[inside], error[inside])
        np.add.at(counts, target[inside], 1)
    average_error = np.divide(error_sum, counts, out=np.zeros(n_buckets), where=counts > 0)

    squared_dev = np.zeros(n_buckets)
    for target, err in windows:
        np.add.at(squared_dev, target, (err - average_error[target]) ** 2)
    variance = np.divide(squared_dev, counts, out=np.zeros(n_buckets), where=counts > 0)
    return average_error, np.sqrt(variance)


def _plot_price_histogram(values, n_bins, title, xlabel, ylabel):
    """Draw one histogram figure of `values` restricted to the 0-100 $ range."""
    plt.figure(figsize=(12, 12))
    plt.xlim((0, 100))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    counts, bins = np.histogram(values, bins=n_bins)
    plt.hist(bins[:-1], bins, weights=counts)
    plt.show()


def error_analyze(model, X, y, n=2):
    """Analyze how the absolute prediction error depends on the actual price.

    Draws three figures: the histogram of actual prices, the histogram of
    predicted prices, and the mean absolute error per whole-dollar price
    with its standard deviation as error bars. Each dollar bucket pools the
    samples of its two neighbouring buckets as well (a +/-1 $ window).

    The model is always evaluated on `X` as given; `n` is not used and is
    kept for call compatibility (a value other than 2 used to fail on an
    unbound `pred`).
    """
    prediction = np.asarray(model.predict(X)).reshape(-1)
    actual = np.asarray(y, dtype=float)
    minimal_price, maximal_price = int(np.min(actual)), int(np.max(actual))
    n_buckets = maximal_price - minimal_price + 1
    error = np.abs(prediction - actual)

    average_error, std = _windowed_error_stats(error, actual, minimal_price, n_buckets)

    # At least one bin, so a price range narrower than 3 $ does not break
    # np.histogram.
    n_bins = max(1, (maximal_price - minimal_price) // 3)
    _plot_price_histogram(
        actual, n_bins,
        "Distribution of actual prices in test subset",
        "Actual price [$]",
        "Number of cab rides with fixed price",
    )
    _plot_price_histogram(
        prediction, n_bins,
        "Distribution of predicted prices in test subset",
        "Predicted price [$]",
        "Number of cab rides with prediced price",
    )

    plt.figure(figsize=(12, 12))
    plt.xlim((0, 100))
    plt.title("Prediction error")
    plt.xlabel("Actual price [$]")
    plt.ylabel("Average error [$]")
    plt.grid()
    # `std` is already a standard deviation; the square root must not be
    # taken a second time here.
    plt.errorbar(list(range(minimal_price, maximal_price + 1)), average_error, std,
                 linestyle='None', marker='.')
    plt.show()
    
def draw_result_3d(model, X, y, n=2, m=50):
    """Plot actual and predicted prices of the first `m` samples in 3D.

    Predictions are always computed on `X` as given. When `n` is not 2 the
    features are afterwards projected onto two principal components, for
    plotting only; otherwise the first two feature columns are the
    horizontal axes. The RMSE over all samples and the first `m`
    actual/predicted pairs are printed.

    `m` is capped at the number of samples so that a small subset does not
    raise IndexError.
    """
    pred = np.asarray(model.predict(X)).reshape(-1)
    if n != 2:
        # Projection for display only: the model has already been evaluated.
        X = PCA(n_components=2).fit_transform(X)
    m = min(m, pred.shape[0])

    fig = go.Figure()
    fig.add_trace(go.Scatter3d(x=X[0:m,0], y=X[0:m,1], z=y[0:m], mode='markers', name="Actual price"))
    fig.add_trace(go.Scatter3d(x=X[0:m,0], y=X[0:m,1], z=pred[0:m], mode='markers', name="Predicted price"))
    fig.show()
    print("Prediction score square sum =", predition_score(pred, y))
    for i in range(m):
        print("actual price =", "{:5.2f}".format(y[i]), "predicted price =",
              "{:5.2f}".format(abs(pred[i])))
    
def linear_regression_model(sourcepath="..\\data\\cab_rides\\", n=False, m=50,
                            do_print_ceof=False, do_draw_results=True,
                            do_score=False, do_analyze_error = False):
    """Fit a linear regression on the mean-weather dataset and report on it.

    `n` is forwarded as the PCA component count (falsy disables PCA) and
    `m` is the number of samples drawn by `draw_result_3d`. When `do_score`
    is set the RMSE on the test subset is returned, and the error analysis
    is then not reached even if requested.
    """
    X_train, X_test, y_train, y_test = prepare_data(sourcepath, pca_com=n)
    model = LinearRegression()
    model.fit(X_train, y_train)

    if do_print_ceof:
        print("Coefficients:\n", model.coef_, model.intercept_)
    if do_draw_results:
        print("Test score =", model.score(X_test, y_test))
        draw_result_3d(model, X_test, y_test, m=m)
    if do_score:
        return score_model(model, X_test, y_test)
    if do_analyze_error:
        error_analyze(model, X_test, y_test)

def gaussian_regression_model(sourcepath="..\\data\\cab_rides\\", size=20000, n=False,
                              m=50, do_print_ceof=False, do_draw_results=True,
                              do_score=False, do_analyze_error = False):
    """Fit a Gaussian process regression on the mean-weather dataset.

    Author's note: giving some a priori weights is probably necessary and
    may have practical usage in our project - uber may want to give some
    general prediction about prices.

    Training uses at most `size` samples and testing a quarter of that
    number. When `do_score` is set the RMSE on the test subset is returned
    and the error analysis is not reached.
    """
    X_train, X_test, y_train, y_test = prepare_data(sourcepath, pca_com=n)

    train_count = min(X_train.shape[0], size)
    test_count = train_count // 4
    X_train, y_train = X_train[:train_count, :], y_train[:train_count]
    X_test, y_test = X_test[:test_count, :], y_test[:test_count]

    model = GaussianProcessRegressor()
    model.fit(X_train, y_train)

    if do_print_ceof:
        print("Coefficients:\n", model.L_, model.alpha_)
    if do_draw_results:
        print("Test score =", model.score(X_test, y_test))
        draw_result_3d(model, X_test, y_test, m=m)
    if do_score:
        return score_model(model, X_test, y_test)
    if do_analyze_error:
        error_analyze(model, X_test, y_test)
        
def nn_regression_model(sourcepath="..\\data\\cab_rides\\", n=False, m=50,
                        do_print_ceof=False, do_draw_results=True, epochs=10,
                        do_score=False, layer_n=1, do_analyze_error=False,
                        do_analyze_epochs=False):
    """Train a dense neural network on the mean-weather dataset.

    The network has `layer_n` hidden ReLU layers of 10 units followed by a
    single ReLU output unit, and is trained with Adam on the MSE loss.
    With `do_analyze_epochs` the test subset is used as validation data and
    the learning curves are plotted through `callbacks_list`.
    `do_print_ceof` is accepted but not used. When `do_score` is set the
    RMSE on the test subset is returned and the error analysis is skipped.
    """
    X_train, X_test, y_train, y_test = prepare_data(sourcepath, pca_com=n)

    # NOTE(review): the original computed a `layer_width` from `n` but never
    # used it; the hidden width has always been the constant 10.
    hidden_layers = [tf.keras.layers.Dense(10, activation='relu') for _ in range(layer_n)]
    model = tf.keras.models.Sequential()
    for layer in hidden_layers:
        model.add(layer)
    model.add(tf.keras.layers.Dense(1, activation='relu'))
    model.compile(optimizer="Adam", loss="mse", metrics=["mae"])

    if do_analyze_epochs:
        model.fit(X_train, y_train, validation_data=(X_test, y_test),
                  epochs=epochs, verbose=1, callbacks=callbacks_list)
    else:
        model.fit(X_train, y_train, epochs=epochs)

    if do_draw_results:
        print("Test score =", model.evaluate(X_test, y_test))
        draw_result_3d(model, X_test, y_test, m=m)
    if do_score:
        return score_model(model, X_test, y_test)
    if do_analyze_error:
        error_analyze(model, X_test, y_test)

In [None]:
def find_best_dimensionality_lin_reg():
    """Score linear regression for several PCA sizes and return the best one.

    The run without PCA is recorded under n = 65; it is followed by the even
    component counts from 64 down to 4. The score is the RMSE on the test
    subset (lower is better). The score curve is plotted and the best
    result printed.
    """
    ns = [65]
    scores = [linear_regression_model(do_draw_results = False, do_score = True)]
    for n in range(64, 2, -2):
        ns.append(n)
        scores.append(linear_regression_model(n = n, do_draw_results = False, do_score = True))

    # The first strictly smaller score wins, so ties keep the earlier run.
    best_n, best_score = ns[0], scores[0]
    for candidate_n, candidate_score in zip(ns[1:], scores[1:]):
        if candidate_score < best_score:
            best_n, best_score = candidate_n, candidate_score

    plt.figure(figsize=(20, 20))
    plt.title("Linear regression score for different data dimensionality")
    plt.xlabel("n - data dimensionality")
    plt.ylabel("Linear regression score")
    plt.plot(ns, scores)
    plt.show()

    print(f"Best score = {best_score} for n = {best_n}")

    return best_n
    
def find_best_nn():
    """Grid-search PCA size and hidden-layer count for the neural network.

    Tries n in (10, 30, 50, no PCA) and layer_n in (0, 1, 2) with 10 epochs
    each, scoring by RMSE on the test subset. All scores are shown in a 3D
    scatter plot. Returns (best_n, best_epoch, best_layer_n).
    """
    trials = []
    best_score, best_n, best_epoch, best_layer_n = inf, 0, 0, 0
    epoch = 10
    for n in [10, 30, 50, False]:
        for layer_n in range(3):
            print(f"n = {n}, epoch = {epoch}, layer_n = {layer_n}")
            score = nn_regression_model(n = n, epochs = epoch, do_draw_results = False,
                                        do_score = True, layer_n = layer_n)
            if score < best_score:
                best_n, best_score, best_epoch, best_layer_n = n, score, epoch, layer_n
            trials.append((n, layer_n, score))

    ns = [trial[0] for trial in trials]
    layers_ns = [trial[1] for trial in trials]
    scores = [trial[2] for trial in trials]
    fig = go.Figure()
    fig.add_trace(go.Scatter3d(x=ns, y=layers_ns, z=scores, mode='markers', name="Prediction score"))
    fig.show()

    print(f"Best score = {best_score} for n = {best_n}, epoch = {best_epoch}, layer number = {best_layer_n + 1}")

    return best_n, best_epoch, best_layer_n

## Neural Network model

In [None]:
# best_n, best_epoch, best_layer_n = 50, 10, 1
# Run the grid search; the commented line above holds fixed values instead.
best_n, best_epoch, best_layer_n = find_best_nn()

In [None]:
# Manual override of the values returned by the search above.
best_n = 50
best_layer_n = 3

In [None]:
# Train with the chosen settings and draw the 3D result plot.
# nn_regression_model returns None here because do_score is left False.
score = nn_regression_model(n = best_n, epochs = best_epoch, layer_n = best_layer_n)

In [None]:
# Train again and run the error analysis instead of the 3D plot.
score = nn_regression_model(n = best_n, epochs = best_epoch, layer_n = best_layer_n, do_draw_results=False, do_analyze_error=True)

In [None]:
# Train for 50 epochs with the learning-curve callback enabled.
score = nn_regression_model(n = best_n, epochs = 50, layer_n = best_layer_n, do_analyze_epochs = True)

# Cab rides analysis with exact conditions

In [None]:
def init_dict(weather_dataframe):
    """Return a dict with an empty list for every distinct 'location' value."""
    return {location: [] for location in np.unique(weather_dataframe['location'])}

def fill_dict(loc_dict, weather_dataframe):
    """Append every weather row to the list stored under its location.

    The location is read from column position 1 of each row. `loc_dict` is
    modified in place and also returned.
    """
    for record in weather_dataframe.to_numpy():
        bucket = loc_dict[record[1]]
        bucket.append(record)
    return loc_dict
 
def sort_cond(loc_dict):
    """Sort each location's rows in place by the value at position 5.

    Position 5 is the key `find_condition` compares against a ride's time
    stamp. `loc_dict` is returned for chaining.
    """
    for records in loc_dict.values():
        records.sort(key=lambda record: record[5])
    return loc_dict

def find_condition(loc_dict, row, i):
    """Return the weather record closest in time to a ride.

    Parameters:
        loc_dict: maps a location to its records sorted by position 5
            (the time stamp).
        row: ride row; position 2 holds the ride time stamp and position
            `i` the location to look up.
        i: position in `row` of the location.

    A binary search narrows the records down to the two that bracket the
    ride time, and the one with the smaller time difference is returned
    (the earlier record on a tie). Previously the last probed midpoint was
    returned, which need not be either bracketing record, and locations
    with at most two records failed on an unbound variable.

    Raises:
        KeyError: the location has no entry in `loc_dict`.
        ValueError: the location has no records.
    """
    t, conditions = row[2], loc_dict[row[i]]
    if not conditions:
        raise ValueError(f"no weather records for location {row[i]!r}")

    lo, hi = 0, len(conditions) - 1
    while lo < hi - 1:
        mid = (lo + hi) // 2
        if conditions[mid][5] < t:
            lo = mid
        else:
            hi = mid
    # lo and hi are now equal or adjacent; pick the record nearer in time.
    return min((conditions[lo], conditions[hi]), key=lambda record: abs(record[5] - t))

def append_rows(loc_dict, prices_dataframe):
    """Extend every ride row with its source and destination weather records.

    The record looked up through row position 4 is appended first, then the
    one looked up through position 3. Returns a list of the extended rows.
    """
    merged_rows = []
    for ride in prices_dataframe.to_numpy():
        with_source = np.append(ride, find_condition(loc_dict, ride, 4), 0)
        # Positions 2 and 3 are unchanged by the append above, so the
        # extended row can be used for the second lookup.
        with_both = np.append(with_source, find_condition(loc_dict, with_source, 3), 0)
        merged_rows.append(with_both)
    return merged_rows

def join_dataframes(path="..\\data\\cab_rides\\"):
    """Join cab_rides.csv with the weather record matched to each ride.

    Rides with missing values are dropped, 'id' is removed and the ride
    time stamp is integer-divided by 1000. Weather gaps are filled with 0.
    Each ride is extended with one weather record for its source and one
    for its destination (see `append_rows`); the helper location and
    time-stamp columns of those records are dropped from the result.
    """
    rides_file, weather_file = "cab_rides.csv", "weather.xls"

    rides = pd.read_csv(path + rides_file).dropna(axis=0).reset_index(drop=True)
    rides = rides.drop('id', axis=1)
    rides['time_stamp'] = rides['time_stamp'] // 1000

    weather = pd.read_csv(path + weather_file).fillna(0)
    by_location = sort_cond(fill_dict(init_dict(weather), weather))

    merged_rows = append_rows(by_location, rides)
    df = pd.DataFrame(np.array(merged_rows), columns = names_list)
    for helper_column in ('destination_location', 'source_location',
                          'destination_time_stamp', 'source_time_stamp'):
        df = df.drop(helper_column, axis=1)
    return df

def preprocess_data(sourcepath="..\\data\\cab_rides\\", pca_com=False):
    """Load, encode, split and scale the dataset built by `join_dataframes`.

    When `pca_com` is truthy it is used as the number of PCA components the
    features are reduced to. Returns (X_train, X_test, y_train, y_test).
    """
    encoded = refactor_data(join_dataframes(sourcepath))
    X_train, X_test, y_train, y_test = split_dataframe(encoded)
    if pca_com:
        X_train, X_test = down_scale_vector(X_train, X_test, pca_com)
    return X_train, X_test, y_train, y_test
    
def analyze_dataframe_joined(sourcepath="..\\data\\cab_rides\\"):
    """Draw the correlation heatmap of the dataset built by `join_dataframes`."""
    encoded = refactor_data(join_dataframes(sourcepath))
    draw_heatmap(encoded, "cab_rides")

In [None]:
def linear_regression_model_condition(sourcepath="..\\data\\cab_rides\\", n=False,
                                      do_print_ceof=False, do_draw_results=True,
                                      do_score = False, do_analyze_error=False):
    """Fit a linear regression on the `join_dataframes` dataset and report on it.

    The test score is always printed. `n` is forwarded as the PCA component
    count (falsy disables PCA). When `do_score` is set the RMSE on the test
    subset is returned and the error analysis is not reached.
    """
    X_train, X_test, y_train, y_test = preprocess_data(sourcepath, pca_com=n)
    model = LinearRegression()
    model.fit(X_train, y_train)

    if do_print_ceof:
        print("Coefficients:\n", model.coef_, model.intercept_)
    print("Test score =", model.score(X_test, y_test))
    if do_draw_results:
        draw_result_3d(model, X_test, y_test)
    if do_score:
        return score_model(model, X_test, y_test)
    if do_analyze_error:
        error_analyze(model, X_test, y_test)

def gaussian_regression_model_condition(sourcepath="..\\data\\cab_rides\\", size=20000, n=False,
                                        do_print_ceof=False, do_draw_results=True,
                                        do_score = False, do_analyze_error=False):
    """Fit a Gaussian process regression on the `join_dataframes` dataset.

    Author's note: giving some a priori weights is probably necessary and
    may have practical usage in our project - uber may want to give some
    general prediction about prices.

    Training uses at most `size` samples and testing a quarter of that
    number. The test score is always printed. When `do_score` is set the
    RMSE on the test subset is returned and the error analysis is skipped.
    """
    X_train, X_test, y_train, y_test = preprocess_data(sourcepath, pca_com=n)

    train_count = min(X_train.shape[0], size)
    test_count = train_count // 4
    X_train, y_train = X_train[:train_count, :], y_train[:train_count]
    X_test, y_test = X_test[:test_count, :], y_test[:test_count]

    model = GaussianProcessRegressor()
    model.fit(X_train, y_train)

    if do_print_ceof:
        print("Coefficients:\n", model.L_, model.alpha_)
    print("Test score =", model.score(X_test, y_test))
    if do_draw_results:
        draw_result_3d(model, X_test, y_test)
    if do_score:
        return score_model(model, X_test, y_test)
    if do_analyze_error:
        error_analyze(model, X_test, y_test)
        
def nn_regression_model_condition(sourcepath="..\\data\\cab_rides\\", size=20000, n=False,
                                  do_print_ceof=False, do_draw_results=True, epochs=10,
                                  m=50, do_score=False, layer_n=1, do_analyze_error=False,
                                  do_analyze_epochs=False):
    """Train a dense neural network on the `join_dataframes` dataset.

    All arrays are cast to float32 before training. The hidden width is 8
    without PCA and int(sqrt(n)) with `n` PCA components; there are
    `layer_n` hidden ReLU layers and one ReLU output unit, trained with
    Adam on the MSE loss. `size` and `do_print_ceof` are accepted but not
    used. When `do_score` is set the RMSE on the test subset is returned
    and the error analysis is skipped.
    """
    X_train, X_test, y_train, y_test = (
        subset.astype('float32') for subset in preprocess_data(sourcepath, pca_com=n)
    )

    layer_width = int(np.sqrt(n)) if n else 8
    model = tf.keras.models.Sequential()
    for _ in range(layer_n):
        model.add(tf.keras.layers.Dense(layer_width, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='relu'))
    model.compile(optimizer="Adam", loss="mse", metrics=["mae"])

    if do_analyze_epochs:
        model.fit(X_train, y_train, validation_data=(X_test, y_test),
                  epochs=epochs, verbose=1, callbacks=callbacks_list)
    else:
        model.fit(X_train, y_train, epochs=epochs)

    if do_draw_results:
        print("Test score =", model.evaluate(X_test, y_test))
        draw_result_3d(model, X_test, y_test, m=m)
    if do_score:
        return score_model(model, X_test, y_test)
    if do_analyze_error:
        error_analyze(model, X_test, y_test)

In [None]:
# Correlation heatmap of the rides joined with their matched weather records.
analyze_dataframe_joined()

In [None]:
def find_best_dimensionality_lin_reg_condition():
    """Score linear regression on the joined dataset for several PCA sizes.

    The run without PCA is recorded under n = 65; it is followed by the even
    component counts from 64 down to 4. The score is the RMSE on the test
    subset (lower is better). Returns the best n.
    """
    ns = [65]
    scores = [linear_regression_model_condition(do_draw_results = False, do_score = True)]
    for n in range(64, 2, -2):
        ns.append(n)
        scores.append(linear_regression_model_condition(n = n, do_draw_results = False, do_score = True))

    # The first strictly smaller score wins, so ties keep the earlier run.
    best_n, best_score = ns[0], scores[0]
    for candidate_n, candidate_score in zip(ns[1:], scores[1:]):
        if candidate_score < best_score:
            best_n, best_score = candidate_n, candidate_score

    plt.title("Linear regression score for different data dimensionality")
    plt.xlabel("n - data dimensionality")
    plt.ylabel("Linear regression score")
    plt.plot(ns, scores)
    plt.show()

    print(f"Best score = {best_score} for n = {best_n}")

    return best_n
    
def find_best_nn_condition():
    """Grid-search PCA size and hidden-layer count on the joined dataset.

    Tries n in (10, 30, 50, no PCA) and layer_n in (0, 1, 2) with 10 epochs
    each, scoring by RMSE on the test subset. All scores are shown in a 3D
    scatter plot. Returns (best_n, best_epoch, best_layer_n).
    """
    trials = []
    best_score, best_n, best_epoch, best_layer_n = inf, 0, 0, 0
    epoch = 10
    for n in [10, 30, 50, False]:
        for layer_n in range(3):
            print(f"n = {n}, epoch = {epoch}, layer_n = {layer_n}")
            score = nn_regression_model_condition(n = n, epochs = epoch, do_draw_results = False,
                                                  do_score = True, layer_n = layer_n)
            if score < best_score:
                best_n, best_score, best_epoch, best_layer_n = n, score, epoch, layer_n
            trials.append((n, layer_n, score))

    ns = [trial[0] for trial in trials]
    layers_ns = [trial[1] for trial in trials]
    scores = [trial[2] for trial in trials]
    fig = go.Figure()
    fig.add_trace(go.Scatter3d(x=ns, y=layers_ns, z=scores, mode='markers', name="Prediction score"))
    fig.show()

    print(f"Best score = {best_score} for n = {best_n}, epoch = {best_epoch}, layer number = {best_layer_n + 1}")

    return best_n, best_epoch, best_layer_n

## Linear regression model

## Simple neural network model

In [None]:
# best_n, best_epoch, best_layer_n = 50, 10, 1
# Run the grid search on the joined dataset; the commented line above holds
# fixed values instead.
best_n, best_epoch, best_layer_n = find_best_nn_condition()

In [None]:
# Train with the selected settings and draw the 3D result plot.
nn_regression_model_condition(n = best_n, epochs = best_epoch, layer_n = best_layer_n)

In [None]:
# Train again and run the error analysis instead of the 3D plot.
nn_regression_model_condition(n = best_n, epochs = best_epoch, layer_n = best_layer_n, do_draw_results=False, do_analyze_error=True)

In [None]:
# Train for 50 epochs with the learning-curve callback enabled.
# The call returns None because do_score is left False.
score = nn_regression_model_condition(n = best_n, epochs = 50, layer_n = best_layer_n, do_draw_results=False, do_analyze_epochs=True)

# Weights analysis

## Average conditions

In [None]:
def analyze_dataframe_analyse(sourcepath="..\\data\\cab_rides\\"):
    """Return the column index and the encoded mean-weather dataset."""
    encoded = refactor_data(merge_dataframes(sourcepath))
    return encoded.columns, encoded

def split_dataframe_analyse(data):
    """Split `data` into unscaled train/test features and price labels.

    Same 80/20 shuffled split as `split_dataframe`, but without
    standardizing the features. Returns four NumPy arrays.
    """
    prices = data['price']
    features = data.drop('price', axis=1)
    subsets = train_test_split(features, prices, train_size=0.8, shuffle=True)
    train_X, test_X, train_y, test_y = subsets
    return train_X.to_numpy(), test_X.to_numpy(), train_y.to_numpy(), test_y.to_numpy()

def prepare_data_analyse(sourcepath="..\\data\\cab_rides\\", pca_com=False):
    """Prepare the mean-weather dataset with standardized features.

    Uses `split_dataframe` (scaled features). When `pca_com` is truthy it
    is used as the number of PCA components.
    """
    encoded = refactor_data(merge_dataframes(sourcepath))
    X_train, X_test, y_train, y_test = split_dataframe(encoded)
    if pca_com:
        X_train, X_test = down_scale_vector(X_train, X_test, pca_com)
    return X_train, X_test, y_train, y_test

def prepare_data_normal(sourcepath="..\\data\\cab_rides\\", pca_com=False):
    """Prepare the mean-weather dataset with unscaled features.

    Uses `split_dataframe_analyse` (no standardization). When `pca_com` is
    truthy it is used as the number of PCA components.
    """
    encoded = refactor_data(merge_dataframes(sourcepath))
    X_train, X_test, y_train, y_test = split_dataframe_analyse(encoded)
    if pca_com:
        X_train, X_test = down_scale_vector(X_train, X_test, pca_com)
    return X_train, X_test, y_train, y_test

def compare_scale(X_1, X_2):
    """Return, for each of the first 64 columns, range(X_2) / range(X_1).

    The range of a column is its maximum minus its minimum. The ratio for
    column 2 is forced to 1.
    """
    def column_range(matrix, column):
        return np.max(matrix[:, column]) - np.min(matrix[:, column])

    ratios = [column_range(X_2, col) / column_range(X_1, col) for col in range(64)]
    ratios[2] = 1
    return ratios

def nn_regression_model(X_train, X_test, y_train, y_test, epochs=10, layer_n=1):
    """Train and return a dense network fitted on (X_train, y_train).

    The network has `layer_n` hidden ReLU layers of 10 units and one ReLU
    output unit, trained with Adam on the MSE loss. `X_test` and `y_test`
    are accepted but not used.

    NOTE: this definition has the same name as the earlier path-based
    `nn_regression_model(sourcepath, ...)` and replaces it once executed.
    """
    network = tf.keras.models.Sequential()
    hidden_left = layer_n
    while hidden_left > 0:
        network.add(tf.keras.layers.Dense(10, activation='relu'))
        hidden_left -= 1
    network.add(tf.keras.layers.Dense(1, activation='relu'))
    network.compile(optimizer="Adam", loss="mse", metrics=["mae"])
    network.fit(X_train, y_train, epochs=epochs)
    return network

def average_features(X):
    """Build a "typical" feature vector plus per-column minima and maxima.

    `X` must have at least 64 columns; only the first 64 are used.
    # NOTE(review): the index layout below presumably matches the frame
    # from refactor_data with 'price' removed - confirm against the data.
      * columns 0 and 2..15: the column mean;
      * column 1: the most frequent value of that column;
      * columns 16..27, 28..39, 40..51, 52..63 (four one-hot groups): the
        column with the largest sum in each group is set to its maximum and
        every other column of the group to its minimum.

    Returns:
        (x, m1, m2): the typical vector, the column minima and the column
        maxima, each a list of 64 values.

    Fixes over the previous version: the mode is taken over column 1
    (`X[:, i]`) rather than over row 1 (`X[i]`), and the arg-max of a group
    is searched inside that group only instead of over a zero-padded
    64-long list, which could select a column outside the group whenever
    all group sums were non-positive.
    # NOTE(review): on standardized data all column sums are close to zero,
    # so the chosen group column may be arbitrary - verify.
    """
    idx_av = [0] + list(range(2, 16))
    one_hot_groups = [range(16, 28), range(28, 40), range(40, 52), range(52, 64)]
    idx_moda = [1]

    m1 = [np.min(X[:, i]) for i in range(64)]
    m2 = [np.max(X[:, i]) for i in range(64)]

    # Start with everything "off" (column minimum), then fill in the rest.
    x = list(m1)
    for i in idx_av:
        x[i] = np.mean(X[:, i])

    for group in one_hot_groups:
        group_sums = [np.sum(X[:, i]) for i in group]
        dominant = group[int(np.argmax(group_sums))]
        x[dominant] = m2[dominant]

    for i in idx_moda:
        values, counts = np.unique(X[:, i], return_counts=True)
        x[i] = values[np.argmax(counts)]

    return x, m1, m2
    
# model = nn_regression_model(X_train, X_test, y_train, y_test, epochs = 50, layer_n = 3)


In [None]:
# Standardized data without PCA, and the typical feature vector built from
# the training subset together with the per-column minima and maxima.
X_train, X_test, y_train, y_test = prepare_data_analyse("..\\data\\cab_rides\\", pca_com=False)
x_average, min_x, max_x = average_features(X_train)

In [None]:
# Uses the array-based nn_regression_model defined in the previous cell.
model = nn_regression_model(X_train, X_test, y_train, y_test, epochs = 10, layer_n = 3)

In [None]:
def find_coeff(model, x_average, min_x, max_x, i):
    """Estimate the linear effect of feature `i` on the model prediction.

    Feature `i` is swept over 11 evenly spaced values from its minimum to
    its maximum while every other feature stays at `x_average`. A straight
    line is fitted to the predictions and its slope is returned.
    """
    probe = x_average.copy()
    grid, responses = [], []
    for step in range(11):
        value = min_x[i] + (max_x[i] - min_x[i]) * step / 10
        probe[i] = value
        grid.append(value)
        responses.append(model.predict(np.array(probe).reshape(1, -1)))
    inputs = np.array(grid).reshape(-1, 1)
    outputs = np.array(responses).reshape(-1, 1)
    fitted_line = LinearRegression().fit(inputs, outputs)
    return fitted_line.coef_[0][0]

def find_coeff_2(model, x_average, min_x, max_x, i, a):
    """Return the negated two-point slope of the prediction in feature `i`.

    Feature `a` is first set to its minimum. Feature `i` is then evaluated
    at its minimum and at its maximum while all other features stay at
    `x_average`. The result is -(y_max - y_min) / (x_max - x_min).
    # NOTE(review): the leading minus sign is kept from the original;
    # its intent is not evident from this block.

    The denominator previously mixed an input with a prediction
    (`xs[1] - ys[0]`), which is not a slope. A feature whose minimum equals
    its maximum returns 0.0 instead of dividing by zero.
    """
    x_av = x_average.copy()
    x_av[a] = min_x[a]

    xs, ys = [], []
    for endpoint in (min_x[i], max_x[i]):
        x_av[i] = endpoint
        xs.append(endpoint)
        ys.append(model.predict(np.array(x_av).reshape(1, -1)))

    run = xs[1] - xs[0]
    if run == 0:
        return 0.0
    return -((ys[1] - ys[0]) / run)[0][0]

In [None]:
# Per-feature coefficients of the trained model, one entry per column.
a = [0] * 64
# Columns handled by average_features as means: fitted slope over a sweep.
for i in [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]:
    a[i] = find_coeff(model, x_average, min_x, max_x, i)
# Four 12-column groups: two-point slope. The last argument is the column
# that find_coeff_2 resets to its minimum before evaluating column i.
for i in [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]:
    a[i] = find_coeff_2(model, x_average, min_x, max_x, i, 21)
for i in [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]:
    a[i] = find_coeff_2(model, x_average, min_x, max_x, i, 29)
for i in [40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]:
    a[i] = find_coeff_2(model, x_average, min_x, max_x, i, 41)
for i in [52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]:
    a[i] = find_coeff_2(model, x_average, min_x, max_x, i, 52)
# Column 1 is the one average_features fills with the most frequent value.
for i in [1]:
    a[i] = find_coeff_2(model, x_average, min_x, max_x, i, 1)

In [None]:
# Load the data once standardized and once unscaled, then compute the
# per-column range ratio (standardized range / unscaled range).
# NOTE(review): the two loads use independent random splits, so the two
# training subsets contain different rows - confirm this is acceptable.
X_train, X_test, y_train, y_test = prepare_data_analyse(sourcepath="..\\data\\cab_rides\\", pca_com=False)
X_train_n, X_test_n, y_train_n, y_test_n = prepare_data_normal(sourcepath="..\\data\\cab_rides\\", pca_com=False)
r = compare_scale(X_train_n, X_train)

print(r)

In [None]:
# Print each coefficient divided by the range ratio of its column.
for i in range(64):
    print(f"a_{i+1} = {a[i] / r[i]}")

## Current conditions

In [None]:
def preprocess_data_analyse(sourcepath="..\\data\\cab_rides\\", pca_com=False):
    """Prepare the mean-weather dataset with standardized features.

    Reads the data through `merge_dataframes` and splits it with
    `split_dataframe`. When `pca_com` is truthy it is used as the number of
    PCA components.
    """
    encoded = refactor_data(merge_dataframes(sourcepath))
    X_train, X_test, y_train, y_test = split_dataframe(encoded)
    if pca_com:
        X_train, X_test = down_scale_vector(X_train, X_test, pca_com)
    return X_train, X_test, y_train, y_test

def preprocess_data_normal(sourcepath="..\\data\\cab_rides\\", pca_com=False):
    """Prepare the mean-weather dataset with unscaled features.

    Reads the data through `merge_dataframes` and splits it with
    `split_dataframe_analyse`. When `pca_com` is truthy it is used as the
    number of PCA components.
    """
    encoded = refactor_data(merge_dataframes(sourcepath))
    X_train, X_test, y_train, y_test = split_dataframe_analyse(encoded)
    if pca_com:
        X_train, X_test = down_scale_vector(X_train, X_test, pca_com)
    return X_train, X_test, y_train, y_test

In [None]:
# Standardized data and the typical feature vector for this section;
# the vector and the per-column minima and maxima are printed.
X_train, X_test, y_train, y_test = preprocess_data_analyse("..\\data\\cab_rides\\", pca_com=False)
x_average, min_x, max_x = average_features(X_train)
print(x_average, min_x, max_x)