In [1]:
# import packages for the file
# standard mathematical imports
import numpy as np
import pandas as pd

# visualisation imports
import matplotlib.pyplot as plt
import matplotlib.colors as clr
import seaborn as sns

# data science imports
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# warnings and settings
import warnings
# NOTE(review): 'ignore' silences every Python warning for the rest of the
# session (including deprecation and pandas copy warnings), so problems in the
# cells below will not be reported - consider narrowing this filter.
warnings.filterwarnings('ignore')
# render floats in pandas output with four decimal places
pd.set_option('display.float_format', '{:.4f}'.format)



In [2]:
def get_data(path='../KB Assignment/measures_v2.csv'):
    """Load the measurement CSV and add per-session time and power columns.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the measurements CSV. The default is the location the
        notebook originally hard-coded, so ``get_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The loaded data with three derived columns, re-ordered so that the
        identifiers come first, then the electrical inputs, then the
        mechanical/ambient readings and finally the component temperatures:

        * ``time``    - running row counter within each ``profile_id``
        * ``d_power`` - ``i_d * u_d``
        * ``q_power`` - ``i_q * u_q``

    Raises
    ------
    KeyError
        If the CSV lacks any of the columns used below.
    """
    raw = pd.read_csv(path)
    enriched = raw.assign(
        # sequential time column per session rather than relying on the index
        time=raw.groupby('profile_id').cumcount(),
        # electrical power formula (current * voltage) on each axis
        d_power=raw['i_d'] * raw['u_d'],
        q_power=raw['i_q'] * raw['u_q'],
    )
    column_order = [
        'profile_id', 'time',
        'u_q', 'u_d', 'i_q', 'i_d', 'q_power', 'd_power',
        'torque', 'motor_speed', 'ambient', 'coolant',
        'pm', 'stator_yoke', 'stator_winding', 'stator_tooth',
    ]
    return enriched[column_order]

In [3]:
def describe_data(df):
    """Return the summary-statistics table that ``df.describe()`` produces."""
    summary = df.describe()
    return summary

In [4]:
def add_absolute_values(df):  # function to just take the absolute values for each electrical measurement
    """Return a new frame with an ``<name>_abs`` column next to each electrical column.

    The input frame is left untouched: the absolute-value columns are built
    with ``DataFrame.assign`` instead of being written into ``df``, which
    avoids both mutating the caller's data and assigning onto a frame that is
    itself a selection of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``profile_id``, ``time``, the six electrical columns
        (``u_q``, ``u_d``, ``i_q``, ``i_d``, ``q_power``, ``d_power``) and the
        eight remaining measurement columns listed in ``trailing`` below.

    Returns
    -------
    pandas.DataFrame
        Columns ordered as identifiers, then each electrical column
        immediately followed by its ``_abs`` companion, then the remaining
        measurement columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    electrical = ['u_q', 'u_d', 'i_q', 'i_d', 'q_power', 'd_power']
    trailing = ['torque', 'motor_speed', 'ambient', 'coolant',
                'pm', 'stator_yoke', 'stator_winding', 'stator_tooth']

    # assign() copies, so the caller's frame keeps its original columns
    with_abs = df.assign(**{col + '_abs': df[col].abs() for col in electrical})

    # interleave every electrical column with its absolute-value twin
    ordered = ['profile_id', 'time']
    for col in electrical:
        ordered.extend([col, col + '_abs'])
    ordered.extend(trailing)
    return with_abs[ordered]

In [5]:
def histplots(df):   # function to plot every histogram
    """Draw one distribution plot per column of ``df`` on a four-column grid.

    ``profile_id`` and ``time`` are dropped first; every remaining column gets
    its own axes and its own colour. Nothing is returned - the figure is left
    as the current matplotlib figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``profile_id`` and ``time`` plus the columns to plot.
    """
    dfui = df.drop(['profile_id', 'time'], axis=1)  # dataframe under investigation

    # 18 evenly spaced rainbow colours, reused cyclically. For up to 20 columns
    # this gives the same colours as the previous fixed 20-entry list, and it
    # no longer raises IndexError when there are more columns than that.
    palette = plt.cm.gist_rainbow(np.linspace(0, 1, 18))
    feat_clrs = {col: clr.rgb2hex(palette[i % len(palette)][:3])
                 for i, col in enumerate(dfui.columns)}

    n_cols = 4
    n_rows = int(np.ceil(dfui.shape[1] / n_cols))
    # squeeze=False keeps `axes` two-dimensional whatever the grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(2.8*n_cols, n_rows*4), squeeze=False)
    flat_axes = axes.flatten()
    for i, (ax, col) in enumerate(zip(flat_axes, dfui.columns)):
        # NOTE(review): distplot is deprecated in recent seaborn releases; it is
        # kept so the rendered output does not change with this edit.
        sns.distplot(dfui[col], color=feat_clrs[col], ax=ax)
        if i % n_cols == 0:  # only the left-most plot of each row gets a y label
            ax.set_ylabel('Density')
    # hide the grid cells left over when the column count is not a multiple of n_cols
    for ax in flat_axes[dfui.shape[1]:]:
        ax.set_visible(False)
    plt.tight_layout()

In [6]:
def boxplots(df):   # same as above just with boxplots instead
    """Draw one box plot per column of ``df`` on a four-column grid.

    ``profile_id`` and ``time`` are dropped first; every remaining column gets
    its own axes, its own colour and its name as the x label. Nothing is
    returned - the figure is left as the current matplotlib figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``profile_id`` and ``time`` plus the columns to plot.
    """
    dfui = df.drop(['profile_id', 'time'], axis=1)  # dataframe under investigation

    # 18 evenly spaced rainbow colours, reused cyclically. For up to 20 columns
    # this gives the same colours as the previous fixed 20-entry list, and it
    # no longer raises IndexError when there are more columns than that.
    palette = plt.cm.gist_rainbow(np.linspace(0, 1, 18))
    feat_clrs = {col: clr.rgb2hex(palette[i % len(palette)][:3])
                 for i, col in enumerate(dfui.columns)}

    n_cols = 4
    n_rows = int(np.ceil(dfui.shape[1] / n_cols))
    # squeeze=False keeps `axes` two-dimensional whatever the grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(2.8*n_cols, n_rows*4), squeeze=False)
    flat_axes = axes.flatten()
    for i, (ax, col) in enumerate(zip(flat_axes, dfui.columns)):
        # NOTE(review): the series is passed positionally; seaborn releases
        # differ in how they interpret the first positional argument, so
        # confirm the box orientation is the intended one.
        sns.boxplot(dfui[col], color=feat_clrs[col], ax=ax)
        if i % n_cols == 0:  # only the left-most plot of each row gets a y label
            ax.set_ylabel('Values')
        ax.set_xlabel(col)
    # hide the grid cells left over when the column count is not a multiple of n_cols
    for ax in flat_axes[dfui.shape[1]:]:
        ax.set_visible(False)
    plt.tight_layout()

In [7]:
# correlation heatmap
def correlation_matrix(df):
    """Show an annotated, square-celled heatmap of ``df.corr()``.

    Each cell is labelled with its correlation rounded to two decimals. The
    figure is displayed immediately; nothing is returned.
    """
    figure, heat_ax = plt.subplots(figsize=(12, 12))
    sns.heatmap(
        df.corr(),
        annot=True,
        linewidths=.5,
        fmt='.2f',
        square=True,
        ax=heat_ax,
    )
    plt.show()

In [8]:
def sessiondf(df, profile_id=20):  # simple function to identify the session for the next set of plots
    """Return the rows of ``df`` that belong to one measurement session.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``profile_id`` column.
    profile_id : optional
        Session identifier to keep. Defaults to 20, the session the notebook
        originally hard-coded, so ``sessiondf(df)`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index; empty if no row matches.
    """
    session_df = df[df['profile_id'] == profile_id]
    return session_df

In [9]:
def temp_plot(df):    # function to produce a line chart with multiple lines
    """Plot the four component temperatures against time for one session.

    The session is whatever ``sessiondf(df)`` selects. The permanent-magnet,
    stator-yoke, stator-tooth and stator-winding temperatures are drawn as
    separate lines on a single axes and the figure is shown immediately.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time``, ``pm``, ``stator_yoke``, ``stator_tooth`` and
        ``stator_winding`` in addition to whatever ``sessiondf`` needs.
    """
    session_df = sessiondf(df)
    # (column, line format, legend label) for each temperature trace
    traces = [
        ('pm', "-b", "Permanent Magnet"),  # label previously misspelled "Magent"
        ('stator_yoke', "-r", "Stator Yoke"),
        ('stator_tooth', "-g", "Stator Tooth"),
        ('stator_winding', "-k", "Stator Winding"),
    ]
    plt.figure(figsize=(20,8))
    for column, line_format, legend_label in traces:
        plt.plot(session_df.time, session_df[column], line_format, label=legend_label)
    plt.legend(loc="upper left")
    plt.xlabel("Time")
    plt.ylabel("Temperature")
    plt.title('Temperature of the Permanent Magnet and Stators')
    plt.show()

In [10]:
def plot_temp_against_var(df, var, label):  # similar to above except having a secondary y axis on a different scale
    """Plot the four temperatures with one extra variable on a secondary y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Source of ``time``, the four temperature columns and ``var``.
    var : column label
        Column drawn as a dotted magenta line on the right-hand axis.
    label : object
        Text used for the right-hand axis label, the extra line's legend
        entry and (via ``str``) the end of the title.
    """
    fig, host = plt.subplots(figsize=(20,8))
    ax2 = host.twinx()  # second y axis sharing the same x axis

    # (column, line format, legend label) for each temperature trace
    temperature_traces = (
        ("pm", "-b", "Permanent Magnet"),
        ("stator_yoke", "-r", "Stator Yoke"),
        ("stator_tooth", "-g", "Stator Tooth"),
        ("stator_winding", "-k", "Stator Winding"),
    )
    legend_handles = []
    for column, line_format, trace_label in temperature_traces:
        legend_handles.extend(
            host.plot(df.time, getattr(df, column), line_format, label=trace_label)
        )
    legend_handles.extend(ax2.plot(df.time, df[var], ":m", label=label))

    host.set_xlabel("Time")
    host.set_ylabel("Temperature")
    ax2.set_ylabel(label)
    # one combined legend covering the lines from both axes
    host.legend(handles=legend_handles, loc='best')
    plt.title('Temperature of the Permanent Magnet and Stators against '+str(label))
    plt.show()

In [11]:
def x_y_split(df):    # function to split df into x and y
    """Split ``df`` into a feature frame and the ``stator_yoke`` target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``profile_id``, ``time``, ``pm``, ``stator_yoke``,
        ``stator_winding`` and ``stator_tooth``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` is ``df`` without the identifier and temperature columns;
        ``y`` is a copy of the ``stator_yoke`` column. Neither shares
        modifications with ``df``.

    Raises
    ------
    KeyError
        If any of the columns named above is missing.
    """
    non_feature_columns = ['profile_id', 'time', 'pm', 'stator_yoke',
                           'stator_winding', 'stator_tooth']
    # drop() already returns a new frame, so copying the whole input first is unnecessary
    X = df.drop(non_feature_columns, axis=1)
    # copy just the target column instead of duplicating every column to read one
    y = df['stator_yoke'].copy()
    return X, y

In [12]:
def vif(x):   # function to explore multicolinearity
    """Tabulate the variance inflation factor of every column of ``x``.

    Returns a two-column frame: ``variables`` holds the column labels of ``x``
    and ``VIF`` holds ``variance_inflation_factor`` evaluated at the matching
    column position of ``x.values``.
    """
    design = x.values
    scores = [variance_inflation_factor(design, position)
              for position in range(x.shape[1])]
    return pd.DataFrame({"variables": x.columns, "VIF": scores})

In [13]:
def ols(x,y):       # function to explore p values and other metrics
    """Fit an ordinary-least-squares model of ``y`` on ``x`` and print its summary.

    A constant column is appended after the columns of ``x``
    (``prepend=False``) before fitting. Nothing is returned.
    """
    design = sm.add_constant(x, prepend=False)
    fitted = sm.OLS(y, design).fit()
    print(fitted.summary())

In [14]:
def tts(data):   # typical variable selection and test train split
    """Select the model inputs and target from ``data`` and split them 70/30.

    The features are the four electrical columns plus ``motor_speed``,
    ``ambient`` and ``coolant``; the target is ``stator_yoke``. The split uses
    ``test_size=0.3`` and ``random_state=42``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = ['u_q', 'u_d', 'i_q', 'i_d','motor_speed','ambient', 'coolant']
    features = data[feature_columns]
    target = data.stator_yoke
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    return train_features, test_features, train_target, test_target

In [15]:
def linear_model(data):    # build linear regression, fitting, and scoring.
    """Fit a ``LinearRegression`` on the training split of ``data`` and score it.

    The split comes from ``tts(data)``.

    Returns
    -------
    tuple
        ``(model, score)`` where ``score`` is ``model.score`` evaluated on the
        held-out test split.
    """
    train_features, test_features, train_target, test_target = tts(data)   # uses function above
    regressor = LinearRegression()
    regressor.fit(train_features, train_target)
    test_score = regressor.score(test_features, test_target)
    return regressor, test_score

In [16]:
def linear_example(data, model, profile_id=6):
    """Plot predicted against measured stator-yoke temperature for one session.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``profile_id``, ``time``, ``stator_yoke`` and the seven
        feature columns listed in ``feature_columns`` below.
    model : object
        Anything with a ``predict`` method that accepts the feature frame.
    profile_id : optional
        Session to plot. Defaults to 6, the session originally hard-coded, so
        existing two-argument calls behave as before.
    """
    feature_columns = ['u_q', 'u_d', 'i_q', 'i_d','motor_speed','ambient', 'coolant']
    session = data[data['profile_id'] == profile_id]
    # assign() builds a new frame, so the prediction column is never written
    # into a filtered selection of `data` (the SettingWithCopy pattern)
    session = session.assign(predicted_temp=model.predict(session[feature_columns]))

    plt.figure(figsize=(20,8))
    plt.plot(session.time, session.predicted_temp, "-k", label="Predicted Temp")
    plt.plot(session.time, session.stator_yoke, "-r", label="Stator Yoke")
    plt.legend(loc="upper left")
    plt.xlabel("Time")
    plt.ylabel("Temperature")
    plt.show()

In [17]:
def randomforestregression(data):
    """Fit a 10-tree ``RandomForestRegressor`` on the training split and score it.

    The split comes from ``tts(data)``; the forest is built with
    ``random_state=42``.

    Returns
    -------
    tuple
        ``(model, score)`` where ``score`` is ``model.score`` evaluated on the
        held-out test split.
    """
    train_features, test_features, train_target, test_target = tts(data)
    forest = RandomForestRegressor(n_estimators=10, random_state=42)
    forest.fit(train_features, train_target)
    test_score = forest.score(test_features, test_target)
    return forest, test_score

In [18]:
def random_forest_example(data, rfr_model, profile_id=74):
    """Plot predicted against measured stator-yoke temperature for one session.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``profile_id``, ``time``, ``stator_yoke`` and the seven
        feature columns listed in ``feature_columns`` below.
    rfr_model : object
        Anything with a ``predict`` method that accepts the feature frame.
    profile_id : optional
        Session to plot. Defaults to 74, the session originally hard-coded, so
        existing two-argument calls behave as before.
    """
    feature_columns = ['u_q', 'u_d', 'i_q', 'i_d','motor_speed','ambient', 'coolant']
    session = data[data['profile_id'] == profile_id]
    # assign() builds a new frame, so the prediction column is never written
    # into a filtered selection of `data` (the SettingWithCopy pattern)
    session = session.assign(predicted_temp=rfr_model.predict(session[feature_columns]))

    plt.figure(figsize=(20,8))
    plt.plot(session.time, session.predicted_temp, "-k", label="Predicted Temp")
    plt.plot(session.time, session.stator_yoke, "-r", label="Stator Yoke")
    plt.legend(loc="upper left")
    plt.xlabel("Time")
    plt.ylabel("Temperature")
    plt.show()