# Step 3 - Create Dataset with More Features

Sources: 
Previous Weeks Code, Generative AI, 
https://cran.r-project.org/web/packages/celltrackR/vignettes/clustering.html

### 1. Read Data

In [10]:
# Imports
import pandas as pd

# Read in 2 Data Files and Combine, Mark as M/F
female_data = pd.read_csv('file_list_female_parsed_with_data_failed.csv')
male_data = pd.read_csv('file_list_male_parsed_with_data.csv')
female_data['gender'] = 0  # Female data is 0
male_data['gender'] = 1    # Male data is 1
combined_data = pd.concat([female_data, male_data], ignore_index=True)
# combined_data.to_csv('combined_dataset.csv', index=False)
print(combined_data.shape)
print(combined_data.head())

(752827, 17)
                experiment_name video_base_name  track_id  count_uncalibrated  \
0  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                   0   
1  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                   1   
2  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                  10   
3  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                 100   
4  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                1000   

   new_count_uncalibrated          xc          yc     u_dot      v_dot  \
0                  382058  117.405639  104.372970  6.973857  22.796302   
1                  382059  123.302195  123.913192  6.585702  21.623125   
2                  382068  111.683189  400.633206 -0.382109  28.731078   
3                  382158  821.635502  527.801237  5.468046  -3.778739   
4                  383058  883.833486  309.388663  2.265553   0.529649   

         s_dot  class_id   p_value  track_length       

In [11]:
# Rename Data
data = combined_data

# Save Class ID for Later Comparison
class_id_array = data[['gender', 'track_id']].copy()

# Drop Potential Labels
data.drop('gender', inplace=True, axis=1) 

# Drop Un-needed Columns
data.drop('video_base_name', inplace=True, axis=1) 

# Drop Cols that We Might Need
data.drop('w', inplace=True, axis=1) 
data.drop('h', inplace=True, axis=1) 
data.drop('p_value', inplace=True, axis=1) 
data.drop('class_id', inplace=True, axis=1)
data.drop('track_length', inplace=True, axis=1) 
data.drop('experiment_name', inplace=True, axis=1) 
data.drop('count_uncalibrated', inplace=True, axis=1) 
data.drop('new_count_uncalibrated', inplace=True, axis=1) 

# Check Modified Data
print("Modified CSV Data: \n") 
print(data)
data.to_csv('data_for_sample.csv', index=False)

# Rename DF
df = data

Modified CSV Data: 

        track_id          xc          yc     u_dot      v_dot        s_dot  \
0           1330  117.405639  104.372970  6.973857  22.796302  4094.324244   
1           1330  123.302195  123.913192  6.585702  21.623125  4154.148577   
2           1330  111.683189  400.633206 -0.382109  28.731078  1993.790100   
3           1330  821.635502  527.801237  5.468046  -3.778739    82.335724   
4           1330  883.833486  309.388663  2.265553   0.529649    -4.429363   
...          ...         ...         ...       ...        ...          ...   
752822      9350  156.276651  671.360475  4.437002   6.482767    23.792231   
752823      9350  162.206921  672.395285  4.571728   5.991240    25.240829   
752824      9350  170.841343  672.257588  4.938269   5.438280    26.805397   
752825      9350  178.862178  673.323482  5.216379   5.043803    29.915899   
752826      9350  187.945173  674.862680  5.565223   4.727619    30.420474   

         frame  
0       382058  
1       

### 2. Combine Frames by Track & Create Features

In [3]:
# Imports
import numpy as np

In [4]:
# Create Features
def calculate_speed(df):
    df['speed'] = np.sqrt(df['u_dot']**2 + df['v_dot']**2)
    return df

def calculate_mean_turning_angle(df):
    df['angle'] = np.arctan2(df['v_dot'], df['u_dot'])
    df['turning_angle'] = df['angle'].diff().abs()
    mean_turning_angle = df['turning_angle'].mean()
    return mean_turning_angle

def calculate_outreach_ratio(df):
    start_x, start_y = df.iloc[0][['xc', 'yc']]
    end_x, end_y = df.iloc[-1][['xc', 'yc']]
    displacement = np.sqrt((end_x - start_x)**2 + (end_y - start_y)**2)
    path_length = df['speed'].sum()
    outreach_ratio = displacement / path_length if path_length != 0 else 0
    return outreach_ratio

def calculate_square_displacement(df):
    start_x, start_y = df.iloc[0][['xc', 'yc']]
    displacements = np.sqrt((df['xc'] - start_x)**2 + (df['yc'] - start_y)**2)
    square_displacement = np.sum(displacements**2)
    return square_displacement

def calculate_acceleration(u_dot, v_dot, time_intervals):
    """
    Calculate the acceleration components and magnitude.

    Parameters:
    u_dot (list or np.array): Velocity in the x direction.
    v_dot (list or np.array): Velocity in the y direction.
    time_intervals (list or np.array): Time intervals between frames.

    Returns:
    ax (np.array): Acceleration in the x direction.
    ay (np.array): Acceleration in the y direction.
    acceleration (np.array): Magnitude of the acceleration.
    """
    # Ensure inputs are numpy arrays for element-wise operations
    u_dot = np.array(u_dot)
    v_dot = np.array(v_dot)
    time_intervals = np.array(time_intervals)

    # Calculate acceleration components
    ax = np.diff(u_dot) / time_intervals[:-1]
    ay = np.diff(v_dot) / time_intervals[:-1]

    # Calculate the magnitude of the acceleration
    acceleration = np.sqrt(ax**2 + ay**2)

    return ax, ay, acceleration

def calculate_distance_traveled(xc, yc):
    """
    Calculate the total distance traveled based on coordinates.

    Parameters:
    xc (list or np.array): x coordinates.
    yc (list or np.array): y coordinates.

    Returns:
    distance_traveled (float): Total distance traveled.
    """
    # Ensure inputs are numpy arrays for element-wise operations
    xc = np.array(xc)
    yc = np.array(yc)

    # Calculate the differences between consecutive coordinates
    dx = np.diff(xc)
    dy = np.diff(yc)

    # Calculate the distance between consecutive points
    distances = np.sqrt(dx**2 + dy**2)

    # Sum up the distances to get the total distance traveled
    distance_traveled = np.sum(distances)

    return distance_traveled

In [5]:
# Combine Frames by Track and Calculate Feature Values
features = []

# Group by track_id
grouped = df.groupby('track_id')

for track_id, group in grouped:
    group = group.sort_values(by='frame')
    speed = calculate_speed(group)['speed'].mean()
    speed_var = calculate_speed(group)['speed'].var()
    speed_max = calculate_speed(group)['speed'].max()
    speed_min = calculate_speed(group)['speed'].min()
    mean_turning_angle = calculate_mean_turning_angle(group)
    outreach_ratio = calculate_outreach_ratio(group)
    square_displacement = calculate_square_displacement(group)
    
    # Calculate acceleration
    time_intervals = group['frame'].diff().fillna(1).values  # Assume frame difference as time interval
    ax, ay, acceleration = calculate_acceleration(group['u_dot'], group['v_dot'], time_intervals)
    mean_acceleration = np.mean(acceleration)
    max_acceleration = np.max(acceleration)
    min_acceleration = np.min(acceleration)
    
    # Calculate distance traveled
    distance_traveled = calculate_distance_traveled(group['xc'], group['yc'])
    features.append({
        'track_id': track_id,
        'speed': speed,
        # 'speed_var': speed_var, # Less difference than speed.
        'speed_max': speed_max,
        # 'speed_min': speed_min, # No significant difference.
        'mean_turning_angle': mean_turning_angle,
        'outreach_ratio': outreach_ratio,
        # 'square_displacement': square_displacement # Nearly identical.
        'mean_acceleration': mean_acceleration,
        'min_acceleration': min_acceleration,
        # 'max_acceleration': max_acceleration, # Nearly identical.
        'distance_traveled': distance_traveled
    })

# Convert to DF
features_df = pd.DataFrame(features)

# Check New Features
print(features_df.head())

   track_id     speed  speed_max  mean_turning_angle  outreach_ratio  \
0         6  3.827775  15.422913            0.103121        0.136342   
1        14  5.366490  16.410934            0.159354        0.592727   
2        17  3.123633  28.774327            0.145173        0.083003   
3        18  1.663800  15.225459            0.303789        0.027139   
4        31  4.150165  17.482277            0.139008        0.056169   

   mean_acceleration  min_acceleration  distance_traveled  
0           0.294142          0.002012        3133.948689  
1           0.395413          0.040245        1364.319554  
2           0.253220          0.000746       13529.000365  
3           0.179134          0.000613       21258.660167  
4           0.337333          0.003294       10019.624026  


In [6]:
class_id_array.to_csv('class_id_array.csv', index=False)
features_df.to_csv('data.csv', index=False)

### 3. Standardize Data

In [7]:
# Imports
from sklearn.preprocessing import StandardScaler

In [8]:
# Standardize Data
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features_df[['speed', 'speed_max', 'mean_acceleration', 'distance_traveled', 'mean_turning_angle', 'outreach_ratio']])
normalized_features_df = pd.DataFrame(normalized_features, columns=['speed', 'speed_max', 'mean_acceleration', 'distance_traveled', 'mean_turning_angle', 'outreach_ratio'])

# Add track_id Back
normalized_features_df['track_id'] = features_df['track_id']

# Check Data
data = normalized_features_df
print(data.head())

      speed  speed_max  mean_acceleration  distance_traveled  \
0  0.017104  -0.229202          -0.144125          -0.738726   
1  0.903157  -0.100692           0.615237          -1.116520   
2 -0.388369   1.507394          -0.450970           1.480491   
3 -1.228999  -0.254885          -1.006485           3.130679   
4  0.202749   0.038656           0.179734           0.731282   

   mean_turning_angle  outreach_ratio  track_id  
0           -0.763880       -0.095906         6  
1           -0.088360        2.758743        14  
2           -0.258712       -0.429537        17  
3            1.646708       -0.778964        18  
4           -0.332778       -0.597380        31  


In [9]:
class_id_array.to_csv('class_id_array_standardized.csv', index=False)
data.to_csv('data_standardized.csv', index=False)