In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.dates as mdates
import mplcursors
import numpy as np
import matplotlib.cm as cm
%matplotlib notebook
plt.ion()
from sklearn.cluster import KMeans
import seaborn as sns
import plotly.graph_objects as go
from plotly import tools
from plotly.subplots import make_subplots
import plotly.offline as py

# Lift pandas' display caps so frames are shown with every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Load the sensor export and keep the sensor name found at index label 0;
# it is used to pick the row trim and to prefix saved figure names.
df = pd.read_csv('B737MAX-8-M-03.csv')
sim = df['Sensor Name'].loc[0]

In [None]:
# Show the sensor name taken from the first row of the export.
print(sim)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) before any rows are trimmed.
df.shape

In [None]:
# For sensor 'B737MAX-8-M-03' the last four rows of the export are discarded.
# NOTE(review): the reason for dropping exactly four rows is not visible here
# (presumably trailing/incomplete records) - confirm against the data source.
# .copy() makes the result an independent frame so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
# Caution: re-running this cell trims another four rows each time.
if sim == 'B737MAX-8-M-03':
    df = df.iloc[:-4].copy()

In [None]:
# (rows, columns) after the conditional trim above.
df.shape

In [None]:
# Inspect the last ten rows of the frame.
df.tail(10)

In [None]:
# Per-column dtype and non-null summary.
df.info(verbose=True)

In [None]:
# Names for the '|'-separated fields packed into 'Raw Data', in field order.
col_name = [
    'phase1_avg', 'phase1_max', 'phase1_min', 'phase1_duty',
    'phase2_avg', 'phase2_max', 'phase2_min', 'phase2_duty',
    'phase3_avg', 'phase3_max', 'phase3_min', 'phase3_duty',
    'value_Ah', 'sensor_state',
]

# Split 'Raw Data' into one float column per field. expand=True pads short
# rows with NaN instead of truncating every row to the shortest one.
raw_fields = df['Raw Data'].str.split('|', expand=True).astype(float)
for name, position in zip(col_name, raw_fields.columns):
    df[name] = raw_fields[position]

# The incoming 'Date' column holds the full timestamp text: keep it as
# 'Timestamp' and derive the calendar date and the time of day from the parsed
# value. This avoids the positional `n` argument of str.split (rejected by
# pandas >= 2.0) and parsing the time strings twice.
df = df.rename(columns={'Date': 'Timestamp'})
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Date'] = df['Timestamp'].dt.normalize()
# Seconds since midnight at minute resolution (the seconds part is not added).
df['Time'] = df['Timestamp'].dt.hour * 3600 + df['Timestamp'].dt.minute * 60

# Display the updated DataFrame
df.head()

In [None]:
# Re-check column dtypes after the parsing cell above.
df.info(verbose=True)

In [None]:
# Index the frame by timestamp so it can be resampled by calendar period.
# Guarded so that re-running the cell does not fail once the index is set.
if df.index.name != 'Timestamp':
    df = df.set_index('Timestamp')
weekly_consumption = df['Value'].resample('W').mean()

# Plot the weekly mean of 'Value'.
fig, ax = plt.subplots(figsize=(10, 6))
weekly_consumption.plot(ax=ax, marker='o')
ax.set_xlabel('Week')
ax.set_ylabel('Average Consumption')
ax.set_title('Weekly Consumption Trends')
# Enable the grid before saving so it is present in the PNG as well.
ax.grid(True)
image_name = f"{sim}_weekly Current Consumption.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Distribution of the individual 'Value' readings (no daily aggregation is
# applied here, despite the variable name).
daily_consumption = df['Value']

# Histogram restricted to the 0-15 range, 40 bins.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(daily_consumption, bins=40, range=(0, 15), edgecolor='black')
ax.set_xlabel('Daily Consumption (Ah)')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of RMS Value')
# Enable the grid before saving so it is present in the PNG as well.
ax.grid(True)
image_name = f"{sim}_Histogram of RMS Value.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Distribution of the phase-1 average readings, 20 bins over the data range.
daily_consumption = df['phase1_avg']

fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
ax.hist(daily_consumption, bins=20, edgecolor='black')
ax.set_xlabel('Daily Consumption (Ah)- Phase 1')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Daily Consumption')
ax.grid(True)

# Save next to the notebook, prefixed with the sensor name.
image_name = f"{sim}_Histogram of Daily Consumption.png"
fig.savefig(image_name)
plt.show()

In [None]:
# 3D scatter of 'Value' against calendar date (x) and time of day in hours (y).
fig = plt.figure(figsize=(5, 3), dpi=200)
ax = fig.add_subplot(111, projection='3d')

ax.set(xlabel='Date', ylabel='Time', zlabel='Current Consumption')

# Matplotlib date numbers for the x axis; 'Time' is in seconds, hence / 3600.
x = mdates.date2num(df['Date'])
ax.set_zlim(0, 15)
ax.set_ylim(0, 24)
ax.scatter(x, df['Time'] / 3600, df['Value'])

# Render the x ticks as calendar dates.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
fig.autofmt_xdate()
for axis_name, label_size in (('x', 12), ('y', 6), ('z', 10)):
    ax.tick_params(axis=axis_name, labelsize=label_size)

image_name = f"{sim}_current consumption.png"
fig.savefig(image_name)
plt.show()

In [None]:
# 3D scatter of the three per-phase average currents against calendar date (x)
# and time of day in hours (y).
fig = plt.figure(figsize=(5, 3), dpi=300)
ax = fig.add_subplot(111, projection='3d')

ax.set_xlabel('Date')
ax.set_ylabel('Time (hrs)')
ax.set_zlabel('Current Consumption (A)')
x = mdates.date2num(df['Date'])
ax.set_zlim(0, 25)
ax.set_ylim(0, 24)  # time of day, 0-24 hours

# One scatter series per phase; 'Time' is in seconds, hence / 3600.
phase1 = ax.scatter(x, df['Time'] / 3600, df['phase1_avg'], label='Phase 1')
phase2 = ax.scatter(x, df['Time'] / 3600, df['phase2_avg'], label='Phase 2')
phase3 = ax.scatter(x, df['Time'] / 3600, df['phase3_avg'], label='Phase 3')

# Render the x ticks as calendar dates.
date_formatter = mdates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_formatter)
fig.autofmt_xdate()
ax.tick_params(axis='x', labelsize=6)
ax.tick_params(axis='y', labelsize=6)
ax.tick_params(axis='z', labelsize=6)

# A single legend call: a second ax.legend() would replace the first one, so
# the handles and the placement are given together.
ax.legend(handles=[phase1, phase2, phase3],
          loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
image_name = f"{sim}_3D plot of total Current Consumption.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Line plot of the three per-phase averages over the frame's index.
phase_columns = [f'phase{n}_avg' for n in (1, 2, 3)]
ax = df.plot(y=phase_columns, figsize=(10, 6))

ax.set_title('Three Phases')
ax.set_xlabel('Date')
ax.set_ylabel('Value (A)')

ax.figure.tight_layout()

# Save next to the notebook, prefixed with the sensor name, then display.
image_name = f"{sim}_Three phase plot.png"
ax.figure.savefig(image_name)
plt.show()

In [None]:
# Pairwise differences between the per-phase averages:
# 'phaseA-B' = phaseA_avg - phaseB_avg for (1,2), (2,3) and (1,3).
for first, second in ((1, 2), (2, 3), (1, 3)):
    df[f'phase{first}-{second}'] = df[f'phase{first}_avg'] - df[f'phase{second}_avg']

In [None]:
# Line plot of the pairwise phase differences together with the three
# per-phase averages, over the frame's index.
ax = df.plot(
    y=['phase1-2', 'phase2-3', 'phase1-3', 'phase1_avg', 'phase2_avg', 'phase3_avg'],
    figsize=(10, 6),
)

# Title spelling corrected ('difference') to match the saved file name.
ax.set_title('Three Phases difference plot')
ax.set_xlabel('Date')
ax.set_ylabel('Value (A)')

ax.figure.tight_layout()

image_name = f"{sim}_Three phase difference plot.png"
ax.figure.savefig(image_name)
plt.show()

In [None]:
# Line plot of the three per-phase maxima over the frame's index.
max_columns = [f'phase{n}_max' for n in (1, 2, 3)]
ax = df.plot(y=max_columns, figsize=(10, 6))

ax.set_title('Three Phases max ')
ax.set_xlabel('Date')
ax.set_ylabel('Value (A)')

ax.figure.tight_layout()

# Save next to the notebook, prefixed with the sensor name, then display.
image_name = f"{sim}_Three phases max.png"
ax.figure.savefig(image_name)
plt.show()

In [None]:
# Mean of 'Value' per (calendar date, time of day in hours).
# NOTE(review): 'Time' / 3600 has minute resolution, so the groups are per
# minute of the day rather than per whole hour - confirm this is intended.
time_of_day = df['Time'] / 3600
df_avg = df.groupby([df.index.date, time_of_day])['Value'].mean().reset_index()
df_avg.columns = ['Date', 'Hour', 'Average']

fig, ax = plt.subplots(figsize=(10, 6))

# One line per calendar date, labelled with that date.
lines = []
for day in df_avg['Date'].unique():
    day_rows = df_avg.loc[df_avg['Date'] == day]
    lines.extend(ax.plot(day_rows['Hour'], day_rows['Average'], label=str(day)))

# Hovering/clicking a line shows its date label as the annotation text.
hover = mplcursors.cursor(lines)
hover.connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))

ax.set(xlabel='Hour', ylabel='Value (Ah)',
       title='Daily Hourly Averages Total RMS Current')
ax.set_xticks(range(25))

# Legend placed outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=7)

# Fixed axis ranges: a full day on x, 0-10 on y.
ax.set_xlim(0, 24)
ax.set_ylim(0, 10)

fig.tight_layout()
image_name = f"{sim}_Daily Hourly Average Total RMS Current.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Same per-date profiles as the previous cell (reuses df_avg), drawn as faint
# black lines so that overlapping days build up darker regions.
fig, ax = plt.subplots(figsize=(10, 6))

lines = []
for day in df_avg['Date'].unique():
    day_rows = df_avg.loc[df_avg['Date'] == day]
    lines.extend(ax.plot(day_rows['Hour'], day_rows['Average'], label=str(day),
                         color='black', alpha=0.10, linewidth=1.5))

# Hovering/clicking a line shows its date label as the annotation text.
hover = mplcursors.cursor(lines)
hover.connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))

ax.set(xlabel='Hour', ylabel='Value (Ah)',
       title='Daily Hourly Average Total RMS Current')
ax.set_xticks(range(25))

# No legend is drawn for this variant.

# Fixed axis ranges: a full day on x, 0-15 on y.
ax.set_xlim(0, 24)
ax.set_ylim(0, 15)

fig.tight_layout()
image_name = f"{sim}_Daily Hourly Average Total RMS Current_reduced transparency.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Mean of 'phase1_max' per (calendar date, time of day in hours).
# NOTE(review): 'Time' / 3600 has minute resolution, so the groups are per
# minute of the day rather than per whole hour - confirm this is intended.
time_of_day = df['Time'] / 3600
df_avg = df.groupby([df.index.date, time_of_day])['phase1_max'].mean().reset_index()
df_avg.columns = ['Date', 'Hour', 'Average']

fig, ax = plt.subplots(figsize=(10, 6))

# One line per calendar date, labelled with that date.
lines = []
for day in df_avg['Date'].unique():
    day_rows = df_avg.loc[df_avg['Date'] == day]
    lines.extend(ax.plot(day_rows['Hour'], day_rows['Average'], label=str(day)))

# Hovering/clicking a line shows its date label as the annotation text.
hover = mplcursors.cursor(lines)
hover.connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))

ax.set(xlabel='Hour', ylabel='Value (Ah)',
       title='Daily Hourly Averages phase1 max')
ax.set_xticks(range(25))

# Legend placed outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Fixed axis ranges: a full day on x, 0-80 on y.
ax.set_xlim(0, 24)
ax.set_ylim(0, 80)

fig.tight_layout()
image_name = f"{sim}_Daily Hourly Averages phase1 max.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Mean of 'phase2_max' per (calendar date, time of day in hours).
# NOTE(review): 'Time' / 3600 has minute resolution, so the groups are per
# minute of the day rather than per whole hour - confirm this is intended.
time_of_day = df['Time'] / 3600
df_avg = df.groupby([df.index.date, time_of_day])['phase2_max'].mean().reset_index()
df_avg.columns = ['Date', 'Hour', 'Average']

fig, ax = plt.subplots(figsize=(10, 6))

# One line per calendar date, labelled with that date.
lines = []
for day in df_avg['Date'].unique():
    day_rows = df_avg.loc[df_avg['Date'] == day]
    lines.extend(ax.plot(day_rows['Hour'], day_rows['Average'], label=str(day)))

# Hovering/clicking a line shows its date label as the annotation text.
hover = mplcursors.cursor(lines)
hover.connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))

ax.set(xlabel='Hour', ylabel='Value (Ah)',
       title='Daily Hourly Averages phase2 max')
ax.set_xticks(range(25))

# Legend placed outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Fixed axis ranges: a full day on x, 0-60 on y.
ax.set_xlim(0, 24)
ax.set_ylim(0, 60)

fig.tight_layout()
image_name = f"{sim}_Daily Hourly Average of Phase2 max.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Mean of 'phase3_max' per (calendar date, time of day in hours).
# NOTE(review): 'Time' / 3600 has minute resolution, so the groups are per
# minute of the day rather than per whole hour - confirm this is intended.
time_of_day = df['Time'] / 3600
df_avg = df.groupby([df.index.date, time_of_day])['phase3_max'].mean().reset_index()
df_avg.columns = ['Date', 'Hour', 'Average']

fig, ax = plt.subplots(figsize=(10, 6))

# One line per calendar date, labelled with that date.
lines = []
for day in df_avg['Date'].unique():
    day_rows = df_avg.loc[df_avg['Date'] == day]
    lines.extend(ax.plot(day_rows['Hour'], day_rows['Average'], label=str(day)))

# Hovering/clicking a line shows its date label as the annotation text.
hover = mplcursors.cursor(lines)
hover.connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))

ax.set(xlabel='Hour', ylabel='Value (Ah)',
       title='Daily Hourly Averages phase3 max')
ax.set_xticks(range(25))

# Legend placed outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Fixed axis ranges: a full day on x, 0-60 on y.
ax.set_xlim(0, 24)
ax.set_ylim(0, 60)

fig.tight_layout()
image_name = f"{sim}_Daily Hourly averages phase3 max.png"
fig.savefig(image_name)
plt.show()

In [None]:
# Hourly means of the numeric columns. Selecting numeric dtypes first keeps
# text columns (e.g. 'Raw Data') out of mean(), which raises TypeError on
# pandas >= 2.0 instead of silently dropping them.
df_hourly = df.select_dtypes(include='number').resample('H').mean()

# Rows = calendar date, columns = hour of day, cells = mean 'Value'.
df_pivot = df_hourly.pivot_table(index=df_hourly.index.date,
                                 columns=df_hourly.index.hour,
                                 values='Value')

# Heatmap of the date-by-hour table (seaborn is imported at the top as sns).
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_pivot, cmap='inferno', ax=ax)

ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Date')
ax.set_title('Current Consumption Heatmap')
fig.tight_layout()
image_name = f"{sim}_current consumption Heatmap.png"
fig.savefig(image_name)

plt.show()

In [None]:
# Hourly means of the numeric columns. Selecting numeric dtypes first keeps
# text columns (e.g. 'Raw Data') out of mean(), which raises TypeError on
# pandas >= 2.0 instead of silently dropping them.
df_hourly = df.select_dtypes(include='number').resample('H').mean()

# Rows = calendar date, columns = hour of day, cells = mean 'phase1_max'.
df_pivot = df_hourly.pivot_table(index=df_hourly.index.date,
                                 columns=df_hourly.index.hour,
                                 values='phase1_max')

# Heatmap of the date-by-hour table.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_pivot, cmap='inferno', ax=ax)

ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Date')
ax.set_title('Current Consumption Heatmap for phase1-max')
fig.tight_layout()
image_name = f"{sim}_current consumption Heatmap for phase1.png"
fig.savefig(image_name)

plt.show()

In [None]:
# Whole days elapsed since the earliest date in the data (earliest day = 0).
first_date = df['Date'].min()
df['Day'] = (df['Date'] - first_date).dt.days
df.head(10)

In [None]:
# Positional index of the 'Value' column within the frame.
idx = df.columns.get_loc("Value")
print(f"Column Index : {idx}")

In [None]:
# Two-column feature matrix for clustering: [Day, Value] per row.
x = df[['Day', 'Value']].to_numpy()

In [None]:
# Elbow method on the (Day, Value) matrix `x`: fit k-means for k = 1..10 and
# plot the within-cluster sum of squares (inertia) against k.
WCSS = []
for i in range(1, 11):
    # A fixed seed and an explicit n_init keep the curve identical between
    # runs and across scikit-learn versions (the n_init default has changed).
    model = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=0)
    model.fit(x)
    WCSS.append(model.inertia_)

fig = plt.figure(figsize=(7, 7))
plt.plot(range(1, 11), WCSS, linewidth=4, markersize=12, marker='o', color='green')
plt.xticks(np.arange(11))
plt.title('Kmeans optimal cluster analysis')
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
image_name = f"{sim}_Kmeans Cluster Size prediction.png"
plt.savefig(image_name)
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Mean silhouette score of k-means on the (Day, Value) matrix `x`, k = 2..8.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
silhouette_avg = []
for num_clusters in range_n_clusters:
    # A fixed seed and an explicit n_init keep the scores identical between
    # runs and across scikit-learn versions (the n_init default has changed).
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(x)
    cluster_labels = kmeans.labels_

    silhouette_avg.append(silhouette_score(x, cluster_labels))

fig = plt.figure(figsize=(7, 7))
plt.plot(range_n_clusters, silhouette_avg, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Silhouette score')
plt.title('Silhouette analysis For Optimal k')
image_name = f"{sim}_Silhoute score for Kmeans Cluster.png"
plt.savefig(image_name)
plt.show()

In [None]:
# Final k-means on the (Day, Value) matrix `x` with k=2 and a fixed seed; y_clusters holds one cluster label per row.
# NOTE(review): k=2 is hard-coded - confirm it matches the elbow/silhouette plots above (a previous note here cited an elbow at 3).
model = KMeans(n_clusters = 2, init = "k-means++", max_iter = 300, n_init = 10, random_state = 0)
y_clusters = model.fit_predict(x)

In [None]:
# Scatter of the two k-means clusters in the (Day, Value) plane, with the
# fitted centroids overlaid.
plt.figure(figsize=(10, 5))
for cluster_id, colour in enumerate(('green', 'blue')):
    members = x[y_clusters == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=50, c=colour)

centroids = model.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c="yellow", label="centroids")

plt.xlabel("Dates")
plt.ylabel("Current consumption ")
plt.legend()
image_name = f"{sim}_Kmeans 2D clustering.png"
plt.savefig(image_name)
plt.show()

In [None]:
# Time of day in hours ('Time' is stored in seconds since midnight).
df['time_hours']=df['Time']/3600

In [None]:
# Three-column feature matrix for clustering: [Day, Value, time_hours] per row.
x1 = df[['Day', 'Value', 'time_hours']].to_numpy()

In [None]:
# Elbow method on the three-feature matrix `x1` (Day, Value, time_hours):
# fit k-means for k = 1..10 and plot the inertia against k.
WCSSs = []
for i in range(1, 11):
    # A fixed seed and an explicit n_init keep the curve identical between
    # runs and across scikit-learn versions (the n_init default has changed).
    model = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=0)
    model.fit(x1)
    WCSSs.append(model.inertia_)

fig = plt.figure(figsize=(7, 7))
plt.plot(range(1, 11), WCSSs, linewidth=4, markersize=12, marker='o', color='red')
plt.xticks(np.arange(11))
plt.xlabel("Number of clusters")
plt.ylabel("WCSSs")
image_name = f"{sim}_Kmeans 3d cluster analysis.png"
plt.savefig(image_name)
plt.show()

In [None]:
# Mean silhouette score of k-means on the three-feature matrix `x1`, k = 2..8.
# Relies on silhouette_score imported in the earlier 2-D silhouette cell.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
silhouette_avg = []
for num_clusters in range_n_clusters:
    # A fixed seed and an explicit n_init keep the scores identical between
    # runs and across scikit-learn versions (the n_init default has changed).
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(x1)
    cluster_labels = kmeans.labels_

    silhouette_avg.append(silhouette_score(x1, cluster_labels))

fig = plt.figure(figsize=(7, 7))
plt.plot(range_n_clusters, silhouette_avg, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Silhouette score')
plt.title('Silhouette analysis For Optimal k')
image_name = f"{sim}_Kmeans 3d cluster Silhoutte score.png"
plt.savefig(image_name)
plt.show()

In [None]:
# Final k-means on the three-feature matrix `x1` with k=4 and a fixed seed; y_clusters holds one cluster label per row.
# Note: this overwrites `model` and `y_clusters` from the 2-D fit.
model = KMeans(n_clusters = 4, init = "k-means++", max_iter = 300, n_init = 10, random_state = 0)
y_clusters = model.fit_predict(x1)

In [None]:
# 3D scatter of the four k-means clusters over (Day, Value, time_hours).
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111, projection='3d')

# One labelled series per cluster so that the legend has entries to show.
for cluster_id, colour in enumerate(('blue', 'orange', 'green', 'red')):
    members = x1[y_clusters == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], members[:, 2],
               s=10, color=colour, label=f'cluster {cluster_id}')

ax.set_xlabel('Days')
ax.set_ylabel('Current consumption')  # spelling corrected from 'Currrent'
ax.set_zlabel('Time')

# Draw the legend before saving so that it is part of the PNG.
ax.legend()
image_name = f"{sim}_Kmeans 3d clustering.png"
fig.savefig(image_name)
plt.show()

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the (Day, Value) features.
df1 = df[['Day', 'Value']]
Day = df.Day
Value = df.Value
X = df1.to_numpy()

# NOTE(review): eps equals np.linspace(0.01, 1, num=20)[4] and min_samples=13
# lies on the np.arange(1, 20, step=2) grid searched further down - presumably
# the best pair from that search; confirm.
dbscan_cluster_model = DBSCAN(eps=0.21842105263157896, min_samples=13)
dbscan_cluster_model.fit(X)
dbscan_cluster_model

In [None]:
# Store each row's DBSCAN cluster label in the frame.
df['dbscan clusters']=dbscan_cluster_model.labels_

In [None]:
# Number of rows assigned to each DBSCAN label.
df['dbscan clusters'].value_counts()

In [None]:
import plotly.express as px
# Interactive scatter of Value against Day, coloured by DBSCAN label.
fig = px.scatter(x=Day, y= Value, color=df['dbscan clusters'])
fig.show()

In [None]:
# `ss` is an alias of silhouette_score; the grid-search function below uses this alias.
from sklearn.metrics import silhouette_score as ss
# Silhouette score of the DBSCAN labelling on X.
ss(X,df['dbscan clusters'])

In [None]:
# Candidate eps values: 20 evenly spaced points from 0.01 to 1 (inclusive).
epsilon= np.linspace(0.01,1,num=20)
epsilon

In [None]:
import itertools

# Candidate min_samples values: 1, 3, ..., 19.
min_samples = np.arange(1, 20, step=2)

# Every (eps, min_samples) pair to evaluate, and how many pairs there are.
combinations = list(itertools.product(epsilon, min_samples))
N = len(combinations)
def get_scores_and_labels(combinations, X):
    """Grid-search DBSCAN parameters and return the best-scoring combination.

    For every ``(eps, min_samples)`` pair, DBSCAN is fitted on ``X`` and the
    labelling is scored with the silhouette score (``ss``). Labellings with
    fewer than 2 or more than 50 clusters (the noise label -1 is not counted
    as a cluster) are not scored: they get a sentinel score of -10 and the
    placeholder label ``'bad'``.

    Parameters
    ----------
    combinations : sequence of (eps, min_samples) pairs
        Parameter pairs to evaluate, in order.
    X : array-like
        Feature matrix passed to ``DBSCAN.fit`` and to the silhouette score.

    Returns
    -------
    dict
        ``best_epsilon``, ``best_min_samples``, ``best_labels`` and
        ``best_score`` of the highest-scoring pair. If every pair was
        rejected, ``best_score`` is -10 and ``best_labels`` is ``'bad'``.

    Raises
    ------
    ValueError
        If ``combinations`` is empty.
    """
    # Use the length of the argument rather than the notebook-level N, so the
    # progress message is right for any list passed in.
    total = len(combinations)
    if total == 0:
        raise ValueError("combinations must contain at least one (eps, min_samples) pair")

    scores = []
    all_labels_list = []

    for i, (eps, num_samples) in enumerate(combinations):
        dbscan_cluster_model = DBSCAN(eps=eps, min_samples=num_samples).fit(X)
        labels = dbscan_cluster_model.labels_
        labels_set = set(labels)
        num_clusters = len(labels_set)
        # Label -1 marks noise points, not a cluster.
        if -1 in labels_set:
            num_clusters -= 1

        if (num_clusters < 2) or (num_clusters > 50):
            # Keep both lists aligned with `combinations` via sentinel entries.
            scores.append(-10)
            all_labels_list.append('bad')
            c = (eps, num_samples)
            print(f"Combination {c} on iteration {i+1} of {total} has {num_clusters} clusters. Moving on")
            continue

        scores.append(ss(X, labels))
        all_labels_list.append(labels)
        print(f"Index: {i}, Score: {scores[-1]}, Labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")

    best_index = np.argmax(scores)
    best_parameters = combinations[best_index]
    best_labels = all_labels_list[best_index]
    best_score = scores[best_index]

    return {'best_epsilon': best_parameters[0],
            'best_min_samples': best_parameters[1],
            'best_labels': best_labels,
            'best_score': best_score}

# Run the DBSCAN grid search over every (eps, min_samples) pair on X.
best_dict = get_scores_and_labels(combinations, X)