### **Import all libraries used**


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import glob
import winsound

duration = 600  # beep length in milliseconds; passed to winsound.Beep further down
freq = 3500  # beep pitch in Hz; passed to winsound.Beep further down

# ref: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.scatter.html
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.array.html
# https://docs.python.org/3/library/glob.html
# https://docs.python.org/3/library/winsound.html


### **Read xx_tracks.csv from data folder**

In [None]:
path = '../data/'
# glob returns matches in arbitrary order; sort them so that tracks[0]
# (used later as the working recording) always refers to the same file.
file_name = sorted(glob.glob(path + "/*_tracks.csv"))

# one DataFrame per *_tracks.csv file, in the same order as file_name
tracks = [pd.read_csv(filename) for filename in file_name]
    
# ref: https://stackoverflow.com/questions/41262641/how-to-concatenate-multiple-dataframes-in-pandas
# https://www.geeksforgeeks.org/how-to-read-all-csv-files-in-a-folder-in-pandas/

### **Define all utility functions to calculate the mean (speed, acceleration, deceleration)**

In [None]:
# column-wise mean of one track
def mean_id(data, id):
    """Return the mean of every column over the rows whose 'id' equals ``id``."""
    rows_for_id = data.loc[data['id'] == id]
    return rows_for_id.mean()

# column-wise standard deviation of one track
def std_id(data, id):
    """Return the standard deviation of every column over the rows whose 'id' equals ``id``."""
    rows_for_id = data.loc[data['id'] == id]
    return rows_for_id.std()

# number of rows recorded for one track id
def len_id(data, id):
    """Return how many rows of ``data`` have 'id' equal to ``id``."""
    rows_for_id = data[data['id'] == id]
    return len(rows_for_id)

# mean of the negative xAcceleration samples of one track
def mean_deceleration(data, id):
    """Return the mean of the 'xAcceleration' values below zero for the given ``id``."""
    acceleration = data[data['id'] == id]['xAcceleration']
    return acceleration[acceleration < 0].mean()

# mean of the positive xAcceleration samples of one track
def mean_acceleration(data, id):
    """Return the mean of the 'xAcceleration' values above zero for the given ``id``."""
    acceleration = data[data['id'] == id]['xAcceleration']
    return acceleration[acceleration > 0].mean()

# mean of all xAcceleration samples (any sign) of one track
def acceleration_id(data, id):
    """Return the mean of every 'xAcceleration' value recorded for the given ``id``."""
    acceleration = data.loc[data['id'] == id, 'xAcceleration']
    return acceleration.mean()

# quantile of xVelocity for one track
def quantile_velocity(data, id, quantile):
    """Return the requested ``quantile`` of 'xVelocity' over the rows of the given ``id``."""
    velocity = data.loc[data['id'] == id, 'xVelocity']
    return velocity.quantile(quantile)

# quantile of the positive xAcceleration samples of one track
def quantile_acceleration(data, id, quantile):
    """Return the requested ``quantile`` of the positive 'xAcceleration' values of ``id``.

    Both conditions are combined into a single mask built on the full frame,
    instead of applying a full-length mask to an already filtered frame
    (which made pandas reindex the mask and emit a UserWarning).
    """
    selected = (data['id'] == id) & (data['xAcceleration'] > 0)
    return data.loc[selected, 'xAcceleration'].quantile(quantile)

# quantile of the negative xAcceleration samples of one track
def quantile_deceleration(data, id, quantile):
    """Return the requested ``quantile`` of the negative 'xAcceleration' values of ``id``.

    Both conditions are combined into a single mask built on the full frame,
    instead of applying a full-length mask to an already filtered frame
    (which made pandas reindex the mask and emit a UserWarning).
    """
    selected = (data['id'] == id) & (data['xAcceleration'] < 0)
    return data.loc[selected, 'xAcceleration'].quantile(quantile)

### Get a **unique list of the driver ids**

In [None]:
# Only the first loaded recording is processed for now; the author notes that
# reading every track was too slow on their machine.
df = tracks[0]
# distinct values of the 'id' column, in order of first appearance
unique_id = pd.unique(df['id'])

# ref: https://thispointer.com/pandas-get-unique-values-in-single-or-multiple-columns-of-a-dataframe-in-python/


### **Define functions to find all velocity measures**

In [None]:
# DV1 Loai
def DV1(data, id):
    """Write the per-sample DV1 value of vehicle ``id`` into ``data['DV1']``.

    For every row of the vehicle: sqrt((xVelocity - mean)**2) / length, where
    mean and length are taken over that vehicle's rows only.

    Only the rows belonging to ``id`` are written. Previously the whole column
    was reassigned on every call, so after looping over all ids every vehicle
    carried values computed with the last id's mean and length.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    velocity = data.loc[is_vehicle, 'xVelocity']
    mean = velocity.mean()
    length = len(velocity)
    # NOTE(review): DV2 divides by length inside the square root, this one
    # outside; confirm which form is intended.
    data.loc[is_vehicle, 'DV1'] = np.sqrt((velocity - mean) ** 2) / length
    
# DV2 Loai
def DV2(data, id):
    """Write sqrt((xAcceleration - mean)**2 / length) for the rows of ``id`` into ``data['DV2']``.

    mean is the vehicle's mean xAcceleration and length its number of rows.
    Modifies ``data`` in place and returns None.
    """
    rows = data['id'] == id
    acc_mean = mean_id(data, id)['xAcceleration']
    n_samples = len_id(data, id)
    squared_dev = (data['xAcceleration'][rows] - acc_mean) ** 2
    data.loc[rows, 'DV2'] = np.sqrt(squared_dev / n_samples)
    
    
# DV6 Loai
def DV6(data, id):
    """Write the per-sample DV6 value of vehicle ``id`` into ``data['DV6']``.

    For every row of the vehicle: |xVelocity - mean| / length, where mean and
    length are taken over that vehicle's rows only.

    Only the rows belonging to ``id`` are written. Previously the whole column
    was reassigned on every call, so earlier vehicles' values were overwritten
    using the last id's mean and length.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    velocity = data.loc[is_vehicle, 'xVelocity']
    mean = velocity.mean()
    length = len(velocity)
    data.loc[is_vehicle, 'DV6'] = np.abs(velocity - mean) / length

# DV3 Nabeel
def DV3(data, id):
    """Write the per-sample DV3 value of vehicle ``id`` into ``data['DV3']``.

    For every row of the vehicle:
    sqrt((xVelocity - mean)**2 / length) / mean * 100, where mean and length
    are taken over that vehicle's rows only.

    Only the rows belonging to ``id`` are written. Previously the whole column
    was reassigned on every call, so earlier vehicles' values were overwritten
    using the last id's mean and length.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    velocity = data.loc[is_vehicle, 'xVelocity']
    mean = velocity.mean()
    length = len(velocity)
    # NOTE(review): the result takes the sign of mean and is undefined for
    # mean == 0; confirm whether velocities can be negative in the input.
    data.loc[is_vehicle, 'DV3'] = (np.sqrt((velocity - mean) ** 2 / length) / mean) * 100

# DV4 Nabeel
def DV4(data, id):
    """Write sqrt((a - mean)**2 / length) * 100 for the positive accelerations of ``id`` into ``data['DV4']``.

    mean is the vehicle's mean positive xAcceleration and length the number of
    all its rows. Rows of the vehicle whose xAcceleration is not positive
    receive NaN, because the assigned Series has no entry for them.
    Modifies ``data`` in place and returns None.
    """
    rows = data['id'] == id
    positive_mean = mean_acceleration(data, id)
    n_samples = len_id(data, id)
    positive_acc = data.loc[rows & (data['xAcceleration'] > 0), 'xAcceleration']
    deviation = np.sqrt(((positive_acc - positive_mean) ** 2) / n_samples)
    data.loc[rows, 'DV4'] = deviation * 100

# DV5 Jehad
def DV5(data, id):
    """Write the per-sample DV5 value of vehicle ``id`` into ``data['DV5']``.

    For every negative xAcceleration sample of the vehicle:
    sqrt((a - mean)**2 / length) / mean * 100, where mean is the vehicle's mean
    negative xAcceleration and length the number of all its rows. Rows of the
    vehicle without a negative acceleration receive NaN.

    The deviation is computed from this vehicle's rows only. Previously it was
    evaluated over every vehicle's negative samples on each call and then
    trimmed by index alignment during the assignment.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    acceleration = data.loc[is_vehicle, 'xAcceleration']
    deceleration = acceleration[acceleration < 0]
    mean = deceleration.mean()
    length = len(acceleration)
    # NOTE(review): mean is negative here, so the stored values are negative;
    # confirm whether an absolute value is intended.
    data.loc[is_vehicle, 'DV5'] = (np.sqrt((deceleration - mean) ** 2 / length) / mean) * 100

# DV7 Jehad
def DV7(data, id):
    """Write the per-sample DV7 value of vehicle ``id`` into ``data['DV7']``.

    For every row of the vehicle: |xAcceleration - mean| / length, where mean
    and length are taken over that vehicle's rows only.

    Only the rows belonging to ``id`` are written. Previously the whole column
    was reassigned on every call, so earlier vehicles' values were overwritten
    using the last id's mean and length.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    acceleration = data.loc[is_vehicle, 'xAcceleration']
    mean = acceleration.mean()
    length = len(acceleration)
    data.loc[is_vehicle, 'DV7'] = np.abs(acceleration - mean) / length

# DV8 Muthana
def DV8(data, id):
    """Write 100 * (Q3 - Q1) / (Q3 + Q1) of the vehicle's xVelocity into ``data['DV8']``.

    Q1 and Q3 are the 0.25 and 0.75 quantiles; the same scalar is stored on
    every row of ``id``. Modifies ``data`` in place and returns None.
    """
    lower, upper = (quantile_velocity(data, id, q) for q in (0.25, 0.75))
    dispersion = 100 * ((upper - lower) / (upper + lower))
    data.loc[data['id'] == id, 'DV8'] = dispersion
    
# DV9 Muthana
def DV9(data, id):
    """Write 100 * (Q3 - Q1) / (Q3 + Q1) of the vehicle's positive xAcceleration into ``data['DV9']``.

    Q1 and Q3 are the 0.25 and 0.75 quantiles; the same scalar is stored on
    every row of ``id``. Modifies ``data`` in place and returns None.
    """
    lower, upper = (quantile_acceleration(data, id, q) for q in (0.25, 0.75))
    dispersion = 100 * ((upper - lower) / (upper + lower))
    data.loc[data['id'] == id, 'DV9'] = dispersion
    
# DV10 Loai
def DV10(data, id):
    """Write 100 * (Q3 - Q1) / (Q3 + Q1) of the vehicle's negative xAcceleration into ``data['DV10']``.

    Q1 and Q3 are the 0.25 and 0.75 quantiles; the same scalar is stored on
    every row of ``id``. Modifies ``data`` in place and returns None.
    """
    lower, upper = (quantile_deceleration(data, id, q) for q in (0.25, 0.75))
    dispersion = 100 * ((upper - lower) / (upper + lower))
    data.loc[data['id'] == id, 'DV10'] = dispersion
    
# DV11 Loai
def DV11(data, id):
    """Write 100 * (number + 2 * DV1) / length for the rows of ``id`` into ``data['DV11']``.

    number is how many of the vehicle's xVelocity samples are >= the vehicle's
    mean xVelocity; length is the vehicle's row count. Requires the 'DV1'
    column to exist already (raises KeyError otherwise).

    number is obtained by summing the boolean comparison. The previous
    ``.shape[0]`` returned the size of the comparison result, i.e. it always
    equalled length regardless of the condition.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    velocity = data.loc[is_vehicle, 'xVelocity']
    mean = velocity.mean()
    length = len(velocity)
    number = int((velocity >= mean).sum())
    # NOTE(review): the formula adds 2 * DV1 to a count; confirm that this is
    # the intended definition rather than a threshold of mean + 2 * DV1.
    data.loc[is_vehicle, 'DV11'] = 100 * \
        (number + 2 * data.loc[is_vehicle, 'DV1']) / length
        
# DV12 Ahmad
def DV12(data, id):
    """Write 100 * (number + 2 * DV2) / length for the rows of ``id`` into ``data['DV12']``.

    number is how many of the vehicle's positive xAcceleration samples are >=
    the mean of those positive samples; length is the vehicle's total row
    count. Requires the 'DV2' column to exist already (raises KeyError
    otherwise).

    number is obtained by summing the boolean comparison. The previous
    ``.shape[0]`` counted every positive sample regardless of the condition,
    and the chained mask on an already filtered frame made pandas reindex it
    with a UserWarning.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    acceleration = data.loc[is_vehicle, 'xAcceleration']
    positive = acceleration[acceleration > 0]
    mean = positive.mean()
    length = len(acceleration)
    number = int((positive >= mean).sum())
    data.loc[is_vehicle, 'DV12'] = 100 * \
        (number + 2 * data.loc[is_vehicle, 'DV2']) / length
           
# DV13 Ahmad
def DV13(data, id):
    """Write 100 * (number + 2 * DV5) / length for the rows of ``id`` into ``data['DV13']``.

    number is how many of the vehicle's xAcceleration samples <= 0 are >= the
    mean of its strictly negative samples; length is the vehicle's total row
    count. Requires the 'DV5' column to exist already (raises KeyError
    otherwise); NaN entries in 'DV5' propagate to 'DV13'.

    number is obtained by summing the boolean comparison. The previous
    ``.shape[0]`` counted every non-positive sample regardless of the
    condition, and the chained mask on an already filtered frame made pandas
    reindex it with a UserWarning.
    Modifies ``data`` in place and returns None.
    """
    is_vehicle = data['id'] == id
    acceleration = data.loc[is_vehicle, 'xAcceleration']
    mean = acceleration[acceleration < 0].mean()
    length = len(acceleration)
    # the candidate set keeps zero-valued samples (<= 0), as the original did
    non_positive = acceleration[acceleration <= 0]
    number = int((non_positive >= mean).sum())
    data.loc[is_vehicle, 'DV13'] = 100 * \
        (number + 2 * data.loc[is_vehicle, 'DV5']) / length

# ref : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html
# ref : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html

In [None]:
#DV10 Ousama wrong code
# Experimental cell: prints an inter-quartile dispersion of the negated
# xAcceleration for every vehicle of every file listed in file_name.
import numpy as np
for tracks_path in file_name:
    # file_name already holds the complete paths found by glob, so they are
    # read as-is (appending '_tracks.csv' again pointed at non-existent files)
    file = pd.read_csv(tracks_path)
    # iterate over the ids that are actually present instead of probing
    # 1..len(file) and stopping at the first gap
    for _, vehicle in file.groupby('id'):
        decel = -vehicle['xAcceleration']
        q1, q3 = np.quantile(decel, [0.25, 0.75])
        # kept under its own name so the DV10() function is not overwritten
        dv10_value = ((q3 - q1) / (q3 + q1)) * 100
        print(dv10_value)

#DV11 Ousama wrong code
# Experimental cell: percentage of samples per vehicle whose absolute
# xVelocity is at least |mean xVelocity + 2|.
for tracks_path in file_name:
    file = pd.read_csv(tracks_path)
    for _, vehicle in file.groupby('id'):
        length = len(vehicle)
        vMean = vehicle['xVelocity'].mean()
        # vectorised count; replaces label-based indexing with a running
        # counter that only worked for contiguous, id-ordered rows
        count = int((vehicle['xVelocity'].abs() >= abs(vMean + 2)).sum())
        # kept under its own name so the DV11() function is not overwritten
        dv11_value = 100 * count / length
        # print(dv11_value)

### **Compute all DVs for each id in the data file**

In [None]:
# Order matters: DV11, DV12 and DV13 read the DV1, DV2 and DV5 columns
# written by the functions that run before them.
dv_functions = (DV1, DV2, DV6, DV3, DV4, DV5, DV7,
                DV8, DV9, DV10, DV11, DV12, DV13)

# apply every DV function to every vehicle id; each call updates df in place
for vehicle_id in unique_id:
    for compute_dv in dv_functions:
        compute_dv(df, vehicle_id)

winsound.Beep(freq, duration)  # audible signal that the long-running loop has finished

# ref: https://stackoverflow.com/questions/17086263/how-to-play-a-sound-in-python

### **Export a csv file after computing the DVs for each driver**

In [None]:
dv_columns = ['DV1', 'DV2', 'DV3', 'DV4', 'DV5', 'DV6', 'DV7', 'DV8',
              'DV9', 'DV10', 'DV11', 'DV12', 'DV13']

# one row per vehicle id: the mean of every DV column over that vehicle's samples
new_data = df[dv_columns].groupby(df['id']).mean()

# fill missing features with the number 0 (the string '0' would turn the
# affected float columns into object columns)
new_data.fillna(0, inplace=True)
# NOTE(review): the cell that reloads these features globs '../new/*_new.csv',
# which does not match 'new.csv'; confirm the intended file name.
new_data.to_csv('../new/new.csv') # generate a new csv file with all features for each track in the dataset

### **Read xx_tracksMeta.csv from data folder to get the class for each driver**

In [None]:
path = '../data/'
# glob returns matches in arbitrary order; sort them so that data[0] is always
# the same tracksMeta file (the class lookup built later is index-based).
file_name = sorted(glob.glob(path + "/*_tracksMeta.csv"))

# one DataFrame per *_tracksMeta.csv file, in the same order as file_name
data = [pd.read_csv(filename) for filename in file_name]

# tracksMeta

### **Read xx_new.csv from new folder to add the class column into it**

In [None]:
path = '../new/'
# glob returns matches in arbitrary order; sort them so that new[i] always
# refers to the same feature file from run to run.
file_name = sorted(glob.glob(path + "/*_new.csv"))

# one DataFrame of DV features per *_new.csv file, in the same order as file_name
new = [pd.read_csv(filename) for filename in file_name]

# new data only with DV 

### **add the class column into new dataframe and export it into data_new** 


In [None]:
# one lookup Series (index: vehicle id, value: class) per tracksMeta frame
class_names = [
    pd.Series(meta['class'].values, index=meta['id']) for meta in data
]

# Attach the class of every vehicle using the lookup at the same position as
# the feature frame. Previously class_names[0] was used for every frame, so
# all feature files after the first were labelled from the first meta file.
# NOTE(review): assumes new[i] and data[i] describe the same recording;
# confirm that both file lists are ordered identically.
for i, features in enumerate(new):
    features['class'] = features['id'].map(class_names[i])

for i, features in enumerate(new):
    features.to_csv('../data_new/' + 'new_' + str(i + 1) + '.csv', index=False)

### **Read all files with the .csv extension in the data_new folder**

In [None]:
path = '../data_new/'
# glob returns matches in arbitrary order; sort them so the row order of the
# combined frame is the same on every run.
file_name = sorted(glob.glob(path + "/*.csv"))

# new data with DV and class name for each driver id, one frame per file
frames = [pd.read_csv(filename) for filename in file_name]

data = pd.concat(frames) # concat all data into one dataframe

# ref: https://pandas.pydata.org/docs/reference/api/pandas.concat.html

### **Check whether there are any classes other than truck and car**

In [None]:
classes = ['Car', 'Truck']

# every distinct label that appears in the 'class' column
class_name = list(data['class'].unique())

# Count the labels that are neither 'Car' nor 'Truck'. The previous check
# only fired when both 'Car' and 'Truck' were absent, so an extra label
# next to them went unnoticed. Missing values (NaN) are counted as unexpected.
count = sum(1 for name in class_name if name not in classes)

if count == 0:
    print('All class name is car and truck')
else:
    print('There is class name not car and not truck')

### **Elbow method to get the optimal number of clusters**

In [None]:
cluster = data.drop(['id', 'class'], axis=1)
# 'id' and 'class' are already removed above, so every remaining column is a
# feature; the former iloc[:, :-1] additionally discarded the last feature.
x = cluster.values
# NOTE(review): the features are not scaled here although the final
# clustering uses MinMaxScaler; confirm whether that is intended.

# sum of squared distance between each point and the centroid in a cluster
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

# the "elbow" of this curve suggests the number of clusters to use
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# elbow method

# ref : https://www.analyticsvidhya.com/blog/2021/05/k-mean-getting-the-optimal-number-of-clusters/


### **Drop the columns with more than 25% zeros to prepare the data for clustering**

In [None]:
percentage_zeros = 0.25  # columns with more than this fraction of zeros are dropped

cluster = data.drop(['id', 'class'], axis=1)

# Compare each column's zero count with a fraction of the number of ROWS.
# The previous threshold used shape[1] (the number of columns), so a column
# was dropped as soon as it held a handful of zeros.
zero_counts = (cluster == 0).sum()
drop_cols = cluster.columns[zero_counts > percentage_zeros * cluster.shape[0]]
cluster.drop(drop_cols, axis=1, inplace=True)

# ref: https://thispointer.com/pandas-drop-dataframe-columns-based-on-nan-percentage/


### **Cluster the data with the K-means algorithm**

In [None]:
n_clusters = 2

# rescale the remaining feature columns before clustering
scaler = MinMaxScaler()
scaler_feature = scaler.fit_transform(cluster)

k_mean = KMeans(n_clusters=n_clusters, init='k-means++',
                max_iter=30, n_init=10, random_state=24)
k_mean.fit(scaler_feature)

# cluster label of every row of the scaled feature matrix
y_predict = k_mean.predict(scaler_feature)

# scatter the first two scaled features, one colour per cluster label
for label, colour, legend_name in ((0, 'r', 'Cluster 1'), (1, 'g', 'Cluster 2')):
    members = scaler_feature[y_predict == label]
    plt.scatter(members[:, 0], members[:, 1], s=20, c=colour, label=legend_name)

# mark the cluster centres in the same two dimensions
centers = k_mean.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=300, c='black', marker='*')
plt.legend()
plt.title('K-Means')

# ref: https://datatofish.com/k-means-clustering-python/
# https://blog.dominodatalab.com/getting-started-with-k-means-clustering-in-python


### **Generate a csv file with the descriptive statistics of each feature**

In [None]:
# summary statistics of every feature column (everything except 'id' and 'class')
desc = data.drop(columns=['id', 'class'])
summary = desc.describe()
summary.to_csv('describe.csv')

# ref: https://www.machinelearningplus.com/pandas/pandas-describe/


### **Scaled Cluster Centers Using only the data with DV**

In [None]:
# Scaled Cluster Centers Using all the data

# Scaled Cluster Centers Using only the data with DV

data_dv = data.drop(['id', 'class'], axis=1)
# Compare each column's zero count with a fraction of the number of ROWS.
# The previous threshold used shape[1] (the number of columns).
drop_cols = data_dv.columns[(data_dv == 0).sum() >
                            percentage_zeros * data_dv.shape[0]]
data_dv.drop(drop_cols, axis=1, inplace=True)

scaler = MinMaxScaler()
scaler_feature = scaler.fit_transform(data_dv)
# re-fits the existing k_mean estimator on these features
scale_labels_dv = k_mean.fit_predict(scaler_feature)
scale_centers_dv = k_mean.cluster_centers_

# one label per centre row, so the index also fits a cluster count other than 2
cluster_labels = ['Cluster ' + str(i + 1) for i in range(len(scale_centers_dv))]
scaled_output_dv = pd.DataFrame(
    scale_centers_dv, columns=data_dv.columns, index=cluster_labels)

scaled_output_dv.to_csv('../clusters/scaled_output_dv.csv') # Scaled Cluster Centers

# ref : https://scikit-learn.org/stable/modules/clustering.html#clustering
