In [1]:
import numpy as np
import matplotlib.pyplot as plt

# You may change the mhealth_activity module but your algorithm must support the original version
from mhealth_activity import Recording, Trace, Activity, WatchLocation, Path

# For interactive plots, uncomment the following line
# %matplotlib widget
import os
import pandas as pd
import pickle
from tqdm import tqdm
from scipy.fft import fft, fftfreq
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from scipy.signal import find_peaks
from scipy.signal import peak_prominences
from sklearn.metrics import mean_absolute_error,accuracy_score,precision_score,recall_score,confusion_matrix,classification_report,f1_score
from multiprocessing import Pool

In [2]:
def features_extraction_common(inp): 
    FEATURES = ['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f','avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']

    
    print(FEATURES)
    Median=[];Numneg=[];Numpos=[];Numabovmed=[];
    Min=[];Max=[];Mean=[];Mad=[];Sma=[];Eng=[];Iqr=[];Entr=[];Std=[];Var=[];Kurt=[];Skew=[];Npeaks=[];Avgprom=[]
    Min_d=[];Max_d=[];Mean_d=[];Mad_d=[];Sma_d=[];Eng_d=[];Iqr_d=[];Entr_d=[];Std_d=[];Var_d=[];
    Max_f=[];NPeak_f=[];Avgprom_f=[];Mean_f=[];Skew_f=[];Kurtosis_f=[];Sum_f=[]; Avgpeakdist=[]; Avgpeakdist_f=[];
    
    #X = df.values
    ## TIME DOMAIN ##
    #list of lists of lists, ugly as fuck but it works 
    Median.append(np.median(inp))
    Numneg.append(np.sum(np.array(inp) < 0, axis=0))
    Numpos.append(np.sum(np.array(inp) > 0, axis=0))
    Numabovmed.append(np.sum(np.array(inp) > np.median(inp), axis=0))

    Mean.append(np.mean(inp))
    Std.append(np.std(inp))
    #median absolute deviation
    Mad.append(stats.median_abs_deviation(inp, scale=1))
    Var.append(np.var(inp))
    Min.append(np.min(inp))
    Max.append(np.max(inp))
    #Signal Magnitude Area
    Sma.append(np.sum(inp))
    #energy measure
    Eng.append(np.sum(inp**2)/len(inp))
    Iqr.append(stats.iqr(inp))
    Entr.append(stats.entropy(inp))

    npeaks, _ = find_peaks(inp, distance=5)
    Npeaks.append(len(npeaks))
    prom = peak_prominences(inp, npeaks)
    Avgprom.append(np.mean(prom))

    Apeakdist = 0
    for i in range(len(npeaks)-1):
        Apeakdist += abs(npeaks[i] - npeaks[i+1])

    if len(npeaks) == 1:
        Avgpeakdist.append(0)
    else:
        Avgpeakdist.append(Apeakdist/(len(npeaks)-1))

    ## FREQ DOMAIN ##
    ft = np.abs(fft(inp))
    Sum_f.append(np.sum(ft))
    Max_f.append(np.max(ft))

    npeaks, _ = find_peaks(ft, distance=5)
    NPeak_f.append(len(npeaks))
    prom = peak_prominences(ft, npeaks)
    Avgprom_f.append(np.mean(prom))

    Apeakdist = 0
    for i in range(len(npeaks)-1):
        Apeakdist += abs(npeaks[i] - npeaks[i+1])

    if len(npeaks) == 1:
        Avgpeakdist_f.append(0)
    else:
        Avgpeakdist_f.append(Apeakdist/(len(npeaks)-1))

    Mean_f.append(np.mean(ft))
    Skew_f.append(stats.skew(ft))
    Kurtosis_f.append(stats.kurtosis(ft))

    #derivative
    # f = np.gradient(inp)
    # Mean_d.append(np.mean(inp))
    # Std_d.append(np.std(recording[0]))
    # #median absolute deviation
    # Mad_d.append(stats.median_abs_deviation(recording[0], scale=1))
    # Var_d.append(np.var(recording[0]))
    # Min_d.append(np.min(recording[0]))
    # Max_d.append(np.max(recording[0]))
    # #Signal Magnitude Area
    # Sma_d.append(np.sum(recording[0]))
    # #energy measure
    # Eng_d.append(np.sum(recording[0]**2)/len(recording[0]))
    # Iqr_d.append(stats.iqr(recording[0]))
    # Entr_d.append(stats.entropy(recording[0]))

    #Create dataframe from features
    return np.array([Median, Numneg, Numpos, Numabovmed, Mean,Std, Mad, Var, Min, Max, Sma, Eng, Iqr, Entr, Npeaks, Avgprom, Avgpeakdist, Sum_f, Max_f, NPeak_f, Avgprom_f, Avgpeakdist_f, Mean_f, Skew_f, Kurtosis_f]).reshape(-1) 



In [3]:
#load pickled training 3d norm accelerometer data
file = open('doruks_data/accel_mag_train.pkl', 'rb')
pickled = pickle.load(file)
accel_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['accel'])
print(accel_mag_train.shape)
file.close()

file = open('doruks_data/magneto_mag_train.pkl', 'rb')
pickled = pickle.load(file)
magneto_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['magneto'])
print(magneto_mag_train.shape)
file.close()

file = open('doruks_data/gyro_mag_train.pkl', 'rb')
pickled = pickle.load(file)
gyro_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['gyro'])
print(gyro_mag_train.shape)
file.close()

(396, 1)
(396, 1)
(396, 1)


check if there is standing

In [4]:
christophs_fussy_mess = pd.read_pickle('activitylabelcreation_data.pkl')

In [5]:
def check_zero_in_activities(row):
    activities = row['activities']
    return 0 in activities

# Apply the function to each row of the DataFrame
indices = christophs_fussy_mess[christophs_fussy_mess.apply(check_zero_in_activities, axis=1)].index

# Print the indices of the rows that meet the condition
print(indices)


Index([  2,  14,  15,  17,  19,  21,  26,  32,  34,  36,
       ...
       364, 367, 369, 370, 372, 376, 380, 383, 391, 395],
      dtype='int64', length=102)


add magneto data

In [6]:
#load pickled training 3d norm accelerometer data
file = open('doruks_data/accel_mag_train.pkl', 'rb')
pickled = pickle.load(file)
accel_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['accel'])
print(accel_mag_train.shape)
file.close()

file = open('doruks_data/magneto_mag_train.pkl', 'rb')
pickled = pickle.load(file)
magneto_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['magneto'])
print(magneto_mag_train.shape)
file.close()

file = open('doruks_data/gyro_mag_train.pkl', 'rb')
pickled = pickle.load(file)
gyro_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['gyro'])
print(gyro_mag_train.shape)
file.close()

file = open('doruks_data/accel_gyro_magneto_timestamps.pkl', 'rb')
timestamps = pickle.load(file)

(396, 1)
(396, 1)
(396, 1)


In [7]:
christophs_fussy_mess = pd.concat([christophs_fussy_mess, timestamps, gyro_mag_train, magneto_mag_train, accel_mag_train], axis = 1)

In [8]:
christophs_fussy_mess

Unnamed: 0,path_idx,activities,longitude,latitude,speed,altitude,step_count,phone_steps,accel_time,gyro_time,magneto_time,gyro,magneto,accel
0,2,[1],"Trace(title='longitude', total_time=584.87, sa...","Trace(title='latitude', total_time=584.87, sam...","Trace(title='speed', total_time=584.87, sample...","Trace(title='altitude', total_time=584.87, sam...",,"Trace(title='phone_steps', total_time=584.87, ...","[0.0, 0.004992479791038916, 0.0099849595820778...","[0.0, 0.004992479791038916, 0.0099849595820778...","[0.0, 0.07988990575058053, 0.15977981150116105...","[53.384678630069224, 55.44401057024837, 54.294...","[221.89445350659278, 221.10276820131642, 221.3...","[0.9480597873558152, 0.8834795162112591, 0.826..."
1,0,"[1, 2]","Trace(title='longitude', total_time=519.01, sa...","Trace(title='latitude', total_time=519.01, sam...","Trace(title='speed', total_time=519.01, sample...","Trace(title='altitude', total_time=519.01, sam...",,"Trace(title='phone_steps', total_time=519.01, ...","[0.0, 0.004993592148939241, 0.0099871842978784...","[0.0, 0.004993592148939241, 0.0099871842978784...","[0.0, 0.07990900692840647, 0.15981801385681293...","[115.5018369582274, 114.54160304751902, 113.66...","[70.72606256336667, 70.72606256336667, 70.7260...","[1.01139128109495, 1.0462793676978597, 1.06699..."
2,0,"[0, 1, 2]","Trace(title='longitude', total_time=594.12, sa...","Trace(title='latitude', total_time=594.12, sam...","Trace(title='speed', total_time=594.12, sample...","Trace(title='altitude', total_time=594.12, sam...",,,"[0.0, 0.004999697048749905, 0.0099993940974998...","[0.0, 0.004999697048749905, 0.0099993940974998...","[0.0, 0.08000525181793698, 0.16001050363587396...","[41.796000265268184, 45.88071200036655, 46.704...","[65.75126489417492, 65.75126489417492, 65.7512...","[1.3100667737832823, 1.2199582889692078, 1.170..."
3,2,[1],"Trace(title='longitude', total_time=519.63, sa...","Trace(title='latitude', total_time=519.63, sam...","Trace(title='speed', total_time=519.63, sample...","Trace(title='altitude', total_time=519.63, sam...",,,"[0.0, 0.004998807130282537, 0.0099976142605650...","[0.0, 0.004998807130282537, 0.0099976142605650...","[0.0, 0.07999245689655173, 0.15998491379310345...","[2.1428685815022215, 2.3109384227933973, 2.486...","[248.5280522182176, 248.5280522182176, 248.528...","[0.9713710479087602, 0.9788491672371056, 0.979..."
4,1,[1],"Trace(title='longitude', total_time=625.44, sa...","Trace(title='latitude', total_time=625.44, sam...","Trace(title='speed', total_time=625.44, sample...","Trace(title='altitude', total_time=625.44, sam...",,,"[0.0, 0.005001351405381632, 0.0100027028107632...","[0.0, 0.005001351405381632, 0.0100027028107632...","[0.0, 0.08003122200895713, 0.16006244401791425...","[3.500952797321439, 3.553201645342359, 3.49266...","[271.8695491537745, 271.8695491537745, 269.464...","[0.9858525450565528, 0.9902738944131763, 0.984..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,3,"[0, 1, 2]","Trace(title='longitude', total_time=325.70, sa...","Trace(title='latitude', total_time=325.70, sam...","Trace(title='speed', total_time=325.70, sample...","Trace(title='altitude', total_time=325.70, sam...",,"Trace(title='phone_steps', total_time=325.70, ...","[0.0, 0.005001658502126875, 0.0100033170042537...","[0.0, 0.005001658502126875, 0.0100033170042537...","[0.0, 0.08004497419513393, 0.16008994839026786...","[4.699013398742164, 4.880380024598859, 5.09684...","[197.29368130421352, 197.29368130421352, 195.0...","[1.0027888569670373, 1.0068028737808061, 1.005..."
392,4,[1],"Trace(title='longitude', total_time=619.13, sa...","Trace(title='latitude', total_time=619.13, sam...","Trace(title='speed', total_time=619.13, sample...","Trace(title='altitude', total_time=619.13, sam...",,"Trace(title='phone_steps', total_time=619.13, ...","[0.0, 0.005000161520880612, 0.0100003230417612...","[0.0, 0.005000161520880612, 0.0100003230417612...","[0.0, 0.08001227707417938, 0.16002455414835876...","[28.375165532643877, 26.60520728176994, 25.087...","[173.8645412195123, 175.58228794049327, 175.18...","[1.032220643747444, 1.037333175161696, 1.03870..."
393,2,[1],"Trace(title='longitude', total_time=747.65, sa...","Trace(title='latitude', total_time=747.65, sam...","Trace(title='speed', total_time=747.65, sample...","Trace(title='altitude', total_time=747.65, sam...",,,"[0.0, 0.0049992778383294, 0.0099985556766588, ...","[0.0, 0.0049992778383294, 0.0099985556766588, ...","[0.0, 0.0799964690776803, 0.1599929381553606, ...","[20.97477471110544, 23.243242709639247, 25.137...","[213.0347063402236, 213.0347063402236, 213.034...","[0.9992077464293622, 1.0128367347197746, 1.017..."
394,3,[1],"Trace(title='longitude', total_time=456.01, sa...","Trace(title='latitude', total_time=456.01, sam...","Trace(title='speed', total_time=456.01, sample...","Trace(title='altitude', total_time=456.01, sam...",,,"[0.0, 0.005000142545422647, 0.0100002850908452...","[0.0, 0.005000142545422647, 0.0100002850908452...","[0.0, 0.0800154413054922, 0.1600308826109844, ...","[30.691248240453945, 30.977619609310153, 31.54...","[135.60981840623344, 135.60981840623344, 135.6...","[1.2119177246063602, 1.1986363465519823, 1.182..."


In [9]:
christophs_fussy_mess['label'] = 1 # for all without standing

In [10]:
christophs_fussy_mess.loc[indices, 'label'] = 0 #I know it might be a bit confusing to have standing == 0 but that's kind of their convention

I want to segment the data again. This time I think only a few segments should be sufficient (5)

In [55]:
features_of_interest = np.array(["gyro", "magneto", "accel"])


skip = 15

extracted_data_dictionary = {}

counter = 0

num_chunks = 5


for sample_index, sample in christophs_fussy_mess.iterrows():

    print("Sample_index: ", sample_index)
    feature_dictionary = {}

    counter = counter + 1

    print("Progress: ", round(100 *counter/christophs_fussy_mess.shape[0], 2), "%")

    #print("Sample: ", sample)

    for feature_name in features_of_interest:

        sample_values = christophs_fussy_mess.loc[sample_index][feature_name]

        #print(feature_name)

        time_name = f'{feature_name}_time'

        #print(time_name)
        #time_column = unique_all[sample_index].filter(regex=f'{feature_name}.*_time$').columns


        sample_timestamps = christophs_fussy_mess[time_name][sample_index]

        total_timestamps = len(sample_timestamps)

        chunk_size = total_timestamps // num_chunks

        timestamp_chunks = np.array_split(sample_timestamps, num_chunks)

        value_chunks = np.array_split(sample_values, num_chunks)




        # Initialize a dictionary to store steps for each second
        values_per_interval = {}

        if sample_values is not None:

            for chunk_index in range(num_chunks):

                current_timestamps = timestamp_chunks[chunk_index]
                current_values = value_chunks[chunk_index]

                # Iterate over the timestamps and values
                for t, x in zip(current_timestamps[::skip], current_values[::skip]):
                    # Get the second part of the timestamp as the key

                    if chunk_index not in values_per_interval:
                        values_per_interval[chunk_index] = np.array([[], []])            

                        print(chunk_index)

                    # Add the timestamps and values to the list for the current interval
                    
                    #print("Timestamp:", t)
                    #print("Value:", x)

                    new_value = np.array([[t], [x]])

                    values_per_interval[chunk_index] = np.concatenate((values_per_interval[chunk_index], new_value), axis = 1)

                    

            # Calculate the average steps for each second

            
            intervals_array = np.array([])
            embeddings_array = np.empty((0, 25))
            #embeddings = np.empty((0,25))

            for interval, values in values_per_interval.items():
                
                

                #print("Just checking if this is actually a series", values[1])
                average_values = features_extraction_common(values[1]) #np.average(values) #sum(values) / len(values)
                #print(f"{second}s \t{average_values:.5f} steps")
                
                print(embeddings_array.shape)
                print(average_values.shape)
                #print(average_values)

                intervals_array = np.append(intervals_array, interval)
                #embeddings_array = np.append(embeddings_array, average_values) this flattens everything. we want a 2d array though
                #add "average_values" to the embeddings_array in a nicer way
                embeddings_array = np.concatenate((embeddings_array, average_values[np.newaxis, :]), axis=0)
            
                
                

            #das pooled alles in eine sekunde jeweils, eine überlegung wert
            
            feature_dictionary["intervals"] = intervals_array

            feature_dictionary[feature_name]  = embeddings_array

        else:
            feature_dictionary[feature_name]  = np.array([])            

    #print(feature_dictionary)

    extracted_data_dictionary[sample_index] = feature_dictionary

Sample_index:  0
Progress:  0.25 %
0
1
2
3
4
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
(0, 25)
(25,)
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
(1, 25)
(25,)
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
(2, 25)
(25,)
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist'

In [58]:
extracted_data_dictionary[0]["gyro"].shape

(5, 25)

In [133]:
FEATURES = ['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f','avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']

features_of_interest = np.array(["gyro", "magneto", "accel"])

sample0_gyro_df = pd.DataFrame(extracted_data_dictionary[0]["gyro"], columns=[f"{features_of_interest[0]}_{feature}" for feature in FEATURES])
sample0_magneto_df = pd.DataFrame(extracted_data_dictionary[0]["magneto"], columns=[f"{features_of_interest[1]}_{feature}" for feature in FEATURES])
sample0_accel_df = pd.DataFrame(extracted_data_dictionary[0]["accel"], columns=[f"{features_of_interest[2]}_{feature}" for feature in FEATURES])

# Concatenate the DataFrames horizontally
result_df = pd.concat([sample0_gyro_df, sample0_magneto_df, sample0_accel_df], axis=1)

In [134]:
result_df

Unnamed: 0,gyro_Median,gyro_Numneg,gyro_Numpos,gyro_Numabovmed,gyro_Mean,gyro_STD,gyro_MAD,gyro_Var,gyro_Min,gyro_Max,...,accel_avgprom,accel_avgpeakdist,accel_Sum_f,accel_Max_f,accel_NPeak_f,accel_Avgprom_f,accel_avgpeakdist_f,accel_Mean_f,accel_Skew_f,accel_Kurtosis_f
0,40.599675,0.0,1563.0,781.0,46.356337,35.63394,19.443962,1269.777682,0.355238,363.388872,...,518.109944,7.123853,12810.740432,1703.245761,230.0,523.852259,6.80786,8.196251,36.156095,1379.546973
1,45.687831,0.0,1563.0,781.0,52.318977,33.702323,17.838221,1135.846599,3.428205,405.254158,...,515.767276,7.475962,15246.425498,1704.36119,224.0,524.464516,6.955157,9.754591,36.356002,1391.77302
2,47.380852,0.0,1562.0,781.0,55.753125,37.21401,18.760813,1384.882552,4.506021,521.27161,...,516.617293,7.237209,15816.741175,1705.961696,216.0,524.486365,7.218605,10.125955,34.846531,1311.546329
3,50.23997,0.0,1562.0,781.0,56.03014,33.218718,17.517888,1103.483245,3.331906,390.644378,...,523.691485,7.271028,14363.413081,1709.29875,226.0,523.985347,6.924444,9.195527,34.457027,1289.820229
4,61.048181,0.0,1562.0,781.0,65.04145,33.38773,20.018288,1114.740535,4.638496,378.173721,...,519.683844,6.884956,15197.791003,1714.979723,223.0,524.044205,6.990991,9.7297,33.860351,1255.470786


In [135]:
reshaped_df = pd.DataFrame(result_df.stack().values.reshape(1, -1))

In [136]:
result_df = reshaped_df #wichtig
reshaped_df["y-value"] = christophs_fussy_mess["label"][0]

In [137]:
christophs_fussy_mess.shape[0]

396

In [138]:
for i in range(1, christophs_fussy_mess.shape[0]):

    sample_gyro_df = pd.DataFrame(extracted_data_dictionary[i]["gyro"], columns=[f"{features_of_interest[0]}_{feature}" for feature in FEATURES])
    sample_magneto_df = pd.DataFrame(extracted_data_dictionary[i]["magneto"], columns=[f"{features_of_interest[1]}_{feature}" for feature in FEATURES])
    sample_accel_df = pd.DataFrame(extracted_data_dictionary[i]["accel"], columns=[f"{features_of_interest[2]}_{feature}" for feature in FEATURES])

    inter_result_df = pd.concat([sample_gyro_df, sample_magneto_df, sample_accel_df], axis=1)
    
    reshaped_df = pd.DataFrame(inter_result_df.stack().values.reshape(1, -1))

    print(reshaped_df.shape)

    reshaped_df["y-value"] = christophs_fussy_mess["label"][i] #the activity is in a list but as we selected for, it only has one value of course

    print(reshaped_df.shape)

    result_df = pd.concat([result_df, reshaped_df], axis = 0)




(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(1, 376)
(1, 375)
(

In [139]:
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,366,367,368,369,370,371,372,373,374,y-value
0,40.599675,0.0,1563.0,781.0,46.356337,35.633940,19.443962,1269.777682,0.355238,363.388872,...,6.884956,15197.791003,1714.979723,223.0,524.044205,6.990991,9.729700,33.860351,1255.470786,1
0,134.691144,0.0,1386.0,693.0,130.913725,65.848880,42.415276,4336.075008,6.215068,605.340745,...,7.358289,11407.366671,1492.167334,203.0,464.700569,6.841584,8.230423,34.158077,1230.232333,1
0,115.945665,0.0,1585.0,792.0,121.702546,69.129985,53.839788,4778.954776,6.632468,339.607140,...,6.960352,10795.427188,1880.098854,234.0,530.696340,6.785408,6.810995,38.360942,1504.567982,0
0,43.581658,0.0,1387.0,693.0,45.238355,19.622016,10.388734,385.023511,0.767352,220.718366,...,6.984772,13531.861109,1450.011353,202.0,465.326524,6.835821,9.763248,33.145440,1179.684116,1
0,33.177463,0.0,1668.0,834.0,35.520853,15.851222,10.737528,251.261224,3.500953,110.320223,...,7.725581,13331.367229,1693.345244,240.0,558.652648,6.962343,7.992426,38.596704,1542.672532,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,155.912155,0.0,869.0,434.0,174.783206,116.180127,91.281838,13497.821898,4.699013,457.008337,...,7.660714,16514.790970,1910.681066,121.0,296.390882,7.141667,19.004362,26.979850,767.654368,0
0,84.963542,0.0,1651.0,825.0,88.178561,39.182741,23.117569,1535.287189,3.806204,279.279682,...,6.722449,10735.832119,1803.607039,233.0,552.796793,7.056034,6.502624,39.053625,1561.994035,1
0,44.016339,0.0,1995.0,997.0,51.508957,37.145780,17.896132,1379.808970,1.246840,475.377157,...,7.162455,14181.178745,2018.153017,283.0,667.286281,7.056738,7.111925,42.411806,1855.910757,1
0,120.422412,0.0,1216.0,608.0,126.207299,60.552789,47.280246,3666.640235,5.652120,376.974996,...,6.813559,11155.804889,1406.068272,180.0,408.681397,6.759777,9.174182,30.808938,1021.537093,1


Prediction

In [140]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image


In [142]:
columns_to_exclude = ['y-value']

X = result_df.drop(columns=columns_to_exclude)
y = result_df["y-value"]

In [143]:
nones_df = X[X.isnull().any(axis=1)]

In [144]:
print(nones_df)

Empty DataFrame
Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
Index: []

[0 rows x 375 columns]


In [145]:
from sklearn.impute import SimpleImputer

# Assuming X is your feature matrix with NaN values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)


In [146]:
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2)

In [162]:
rf = RandomForestClassifier(random_state=17)
rf.fit(X_train, y_train)

In [163]:
y_pred = rf.predict(X_test)

In [164]:
from sklearn.metrics import accuracy_score


accuracy_score(y_test, y_pred)

0.8875